import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn import model_selection
from IPython.display import display
import csv
At this point, we have already started the MongoDB daemon. First, we will open a connection with MongoDB, then we will get the database and the collection. Subsequently, we will check the number of documents in the collection and have a look at some of them. Finally, we will store the collection into a dataframe.
# Open a client connection to the local MongoDB daemon (default port 27017)
client = pymongo.MongoClient('localhost', 27017)
# List the available databases, then select the 'customers' database
print(client.list_database_names())
db = client.customers
# List its collections, then select the 'natural_person' collection
print(db.list_collection_names())
colle = db.natural_person
# Count all documents in the collection ({} matches every document)
print(colle.count_documents({}))
['admin', 'config', 'customers', 'local', 'test'] ['customers_data', 'legal_entity', 'natural_person'] 10000
# Query a few documents just to have a look. Using a single cursor with
# limit(3) instead of indexing colle.find() inside a comprehension, which
# opened a brand-new cursor for every index.
list(colle.find().limit(3))
[{'_id': ObjectId('60152a83127b7b1168e813d2'), 'CustomerId': '15683544', 'Name': 'Buccho', 'CreditScore': 626, 'Geography': 'Spain', 'Gender': 'Male', 'Age': 62, 'Tenure': 3, 'Balance': 0.0, 'NumOfProducts': 1, 'HasCrCard': 1, 'IsActiveMember': 1, 'EstimatedIncome': 65010.74, 'Exited': 0, 'person': 'natural'}, {'_id': ObjectId('60152a83127b7b1168e813d3'), 'CustomerId': '15737489', 'Name': 'Ramsden', 'CreditScore': 610, 'Geography': 'Spain', 'Gender': 'Female', 'Age': 46, 'Tenure': 5, 'Balance': 116886.59, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 0, 'EstimatedIncome': 107973.44, 'Exited': 0, 'person': 'natural'}, {'_id': ObjectId('60152a83127b7b1168e813d4'), 'CustomerId': '15642391', 'Name': 'Lettiere', 'CreditScore': 621, 'Geography': 'Germany', 'Gender': 'Male', 'Age': 51, 'Tenure': 4, 'Balance': 109978.83, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 0, 'EstimatedIncome': 177740.58, 'Exited': 1, 'person': 'natural'}]
# Fetch the whole collection and materialize it into a DataFrame
# (each document becomes one row; field names become the columns)
datos=pd.DataFrame(list(colle.find()))
Now let's close the connection:
# Release the MongoDB connection now that the data is held in memory
client.close()
To start with, have a look at the data and let's explore the types of the data and how many unique values there are in each feature:
# Preview the first five rows of the DataFrame
datos.head()
_id | CustomerId | Name | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Exited | person | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60152a83127b7b1168e813d2 | 15683544 | Buccho | 626 | Spain | Male | 62 | 3 | 0.00 | 1 | 1 | 1 | 65010.74 | 0 | natural |
1 | 60152a83127b7b1168e813d3 | 15737489 | Ramsden | 610 | Spain | Female | 46 | 5 | 116886.59 | 1 | 0 | 0 | 107973.44 | 0 | natural |
2 | 60152a83127b7b1168e813d4 | 15642391 | Lettiere | 621 | Germany | Male | 51 | 4 | 109978.83 | 1 | 0 | 0 | 177740.58 | 1 | natural |
3 | 60152a83127b7b1168e813d5 | 15578788 | Bibi | 786 | Spain | Male | 40 | 6 | 0.00 | 2 | 0 | 0 | 41248.80 | 0 | natural |
4 | 60152a83127b7b1168e813d6 | 15693322 | Shaver | 635 | Germany | Female | 37 | 9 | 146748.07 | 1 | 0 | 1 | 11407.58 | 0 | natural |
#see the types of data in the data frame and their unique values
# Build {column name: [unique-value count, dtype]} with a dict comprehension
# over column names instead of a positional range(len(...)) loop
see_unique = {
    col: [len(datos[col].unique()), str(datos[col].dtype)]
    for col in datos.columns
}
# Print each feature with its unique-value count and dtype
for name, info in see_unique.items():
    print(str(name) + " : " + str(info))
_id : [10000, 'object'] CustomerId : [10000, 'object'] Name : [2932, 'object'] CreditScore : [460, 'int64'] Geography : [3, 'object'] Gender : [2, 'object'] Age : [70, 'int64'] Tenure : [11, 'int64'] Balance : [6382, 'float64'] NumOfProducts : [4, 'int64'] HasCrCard : [2, 'int64'] IsActiveMember : [2, 'int64'] EstimatedIncome : [9999, 'float64'] Exited : [2, 'int64'] person : [1, 'object']
We know that person is an identifier indicating if the client is a natural person or a legal entity. We have already selected only the natural persons, so the feature has just a single unique value. Now let's see the unique values of the other features that have few unique values:
# Inspect the distinct values of the low-cardinality features
some_variables = ["Gender", "Geography", "NumOfProducts", "HasCrCard", "IsActiveMember", "Exited"]
for feature in some_variables:
    print(datos[feature].unique())
['Male' 'Female'] ['Spain' 'Germany' 'France'] [1 2 3 4] [1 0] [1 0] [0 1]
Exited is the binary (0, 1) response feature. Let's explore the relationships between the response feature and the other features listed here. We can do this with simple bar plots, using the plot method provided by pandas:
# Count the (feature value, Exited) combinations for each categorical feature
counts = {}
for feature in ["Gender", "Geography", "NumOfProducts", "HasCrCard", "IsActiveMember"]:
    counts[feature] = datos[[feature, 'Exited']].groupby([feature, 'Exited']).size()

fig, ax = plt.subplots(nrows=3, ncols=2, figsize=[7, 10])  # figure size in inches
# Explicit (row, col) subplot position for each feature, in the same order as
# `counts`; this replaces the previous if/elif ladder over loop indices
positions = [(0, 0), (1, 1), (0, 1), (1, 0), (2, 0)]
for (row, col), feature in zip(positions, counts):
    counts[feature].plot(kind='bar', color=['green', 'green', 'blue', 'blue'], ax=ax[row, col])
fig.delaxes(ax[2, 1])  # delete the unused subplot
plt.tight_layout()
To explore the relationships between the response Exited and the other features (which have many different values) we need another kind of plot. Boxplots are convenient to compare continuous variables (or discrete variables with many different values) with categorical variables. Let's use the method provided by pandas to plot a boxplot for each feature:
# One boxplot per continuous (or many-valued) feature, grouped by Exited;
# ax.flat walks the 2x2 grid row by row, matching the feature order
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=[7, 7])
for axis, feature in zip(ax.flat, ['Age', 'Tenure', 'Balance', 'EstimatedIncome']):
    datos.boxplot(column=feature, by='Exited', ax=axis)
# tight_layout adjusts the subplot margins automatically
plt.tight_layout()
The bar plots suggest that the number of products that customers have influences their behaviour, i.e. those who have 2 products are less likely to exit compared to those who have one product. Similarly, the customers less likely to leave seem to be those who have a credit card, are active clients, and are based in France. The boxplots suggest there is some effect of the age of the clients and of their balance. In the subsequent analyses we may consider removing some variables that seem less important. However, the algorithms that we will use might find some patterns in the data that we cannot see now with this simple visual analysis.
Now let's prepare the data to train the model. First, we will create "dummy" (i.e. binary) variables from the categorical features we have explored above. For each categorical feature we will create n-1 dummy variables, where n is the number of levels of the original feature. This is necessary in linear models (this is sometimes called "the dummy variable trap"). But it is sensible in any case, and we will do so even when we use non-linear algorithms (e.g. decision trees), although it may not be considered a definite assumption or requirement of this kind of algorithm.
Moreover, we will drop some features that are useless to train the model.
# Create binary (1, 0) dummy columns: n-1 dummies per categorical feature
# (Spain/Germany encode Geography with France as the baseline; Female encodes
# Gender with Male as the baseline)
datos["Spain"] = np.where(datos["Geography"] == "Spain", 1, 0)
datos["Germany"] = np.where(datos["Geography"] == "Germany", 1, 0)
datos["Female"] = np.where(datos["Gender"] == "Female", 1, 0)
# Drop the original categorical features together with the identifier columns
# that carry no predictive signal
datos = datos.drop(columns=["Geography", "Gender"])
datos = datos.drop(columns=["_id", "CustomerId", "Name", "person"])
datos.head()
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Exited | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 626 | 62 | 3 | 0.00 | 1 | 1 | 1 | 65010.74 | 0 | 1 | 0 | 0 |
1 | 610 | 46 | 5 | 116886.59 | 1 | 0 | 0 | 107973.44 | 0 | 1 | 0 | 1 |
2 | 621 | 51 | 4 | 109978.83 | 1 | 0 | 0 | 177740.58 | 1 | 0 | 1 | 0 |
3 | 786 | 40 | 6 | 0.00 | 2 | 0 | 0 | 41248.80 | 0 | 1 | 0 | 0 |
4 | 635 | 37 | 9 | 146748.07 | 1 | 0 | 1 | 11407.58 | 0 | 0 | 1 | 1 |
Finally, let's prepare the train and test data sets; for that, we will take 25% of the original data as the test set. We will also compute the target rate (i.e. the percentage of "1" in the response feature) to check whether the train and test subsets end up with the same target rate.
#define a function to calculate the target rate
def print_rate(df, target_name):
    """Return a message with the target rate of a data set.

    The target rate is the fraction of (non-missing) observations whose
    target value equals 1.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        For a DataFrame the target is column ``target_name``; for a Series
        the values themselves are the target.
    target_name : str
        Name of the target column (ignored when ``df`` is a Series).

    Returns
    -------
    str
        A human-readable message with the computed rate.
    """
    # Select the target values regardless of the input's dimensionality
    target = df[target_name] if df.ndim == 2 else df
    # dropna mimics value_counts() (which excludes NaN); taking the mean of
    # the boolean mask gives the share of 1s directly and, unlike the old
    # index-based lookup, does not raise IndexError when no 1s are present
    rate = (target.dropna() == 1).mean()
    return "The target rate is {a}".format(a=str(rate))
# Split the data 75/25 into train and test sets, shuffled with a fixed seed
# for reproducibility
X, Y = datos[datos.columns[datos.columns != "Exited"]], datos['Exited']
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, shuffle=True, random_state=123)
# Show the head of each subset
for label, subset in [("X_train", X_train), ("X_test", X_test),
                      ("y_train", y_train), ("y_test", y_test)]:
    print()
    print(label + ": ")
    display(subset.head())
# Compare the target rates of the original data and the two subsets
print()
print("Original data; {a}".format(a=print_rate(datos, "Exited")))
print("y_train; {a}".format(a=print_rate(y_train, "Exited")))
print("y_test; {a}".format(a=print_rate(y_test, "Exited")))
X_train:
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|
9174 | 599 | 30 | 9 | 105443.68 | 1 | 1 | 1 | 121124.53 | 0 | 0 | 0 |
4379 | 684 | 41 | 6 | 135203.81 | 2 | 1 | 1 | 121967.88 | 0 | 0 | 1 |
541 | 687 | 40 | 1 | 0.00 | 2 | 1 | 0 | 8207.36 | 1 | 0 | 1 |
7251 | 850 | 38 | 3 | 0.00 | 2 | 0 | 1 | 179360.76 | 1 | 0 | 1 |
1135 | 597 | 60 | 0 | 78539.84 | 1 | 0 | 1 | 48502.88 | 0 | 1 | 1 |
X_test:
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|
2656 | 546 | 25 | 7 | 127728.24 | 2 | 1 | 1 | 105279.74 | 1 | 0 | 0 |
445 | 850 | 35 | 9 | 0.00 | 2 | 0 | 0 | 25329.48 | 0 | 0 | 1 |
9505 | 698 | 45 | 5 | 164450.94 | 1 | 1 | 0 | 141970.02 | 1 | 0 | 0 |
332 | 635 | 32 | 8 | 0.00 | 2 | 1 | 1 | 19367.98 | 1 | 0 | 1 |
4168 | 496 | 41 | 1 | 176024.05 | 2 | 1 | 0 | 182337.98 | 0 | 0 | 1 |
y_train:
9174 0 4379 0 541 0 7251 0 1135 0 Name: Exited, dtype: int64
y_test:
2656 0 445 0 9505 1 332 1 4168 0 Name: Exited, dtype: int64
Original data; The target rate is 0.2037 y_train; The target rate is 0.2044 y_test; The target rate is 0.2016
Finally, let's export the four data sets to CSV files:
output_path = "/home/fabio/Documents/data_science_project_1/data/"
#Export the data
file_names = ['X_train', 'X_test', 'y_train', 'y_test']
# Map each file name to its object explicitly; eval(fname) was fragile and
# unsafe, and was being evaluated twice per iteration
data_sets = dict(zip(file_names, [X_train, X_test, y_train, y_test]))
for fname, data in data_sets.items():
    with open(output_path + fname + ".csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row_list = data.values.tolist()
        # a Series flattens to a list of scalars -> write them as one row;
        # a DataFrame flattens to a list of row-lists -> write them all
        if row_list and not isinstance(row_list[0], list):
            writer.writerow(row_list)
        else:
            writer.writerows(row_list)
    # the with-block has already closed the file at this point
    print("Close connection to file: {a}".format(a=csvfile.closed))
#Export the names of the features
X_names = X_train.columns.to_list()
with open(output_path + "X_names.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(X_names)
# the context manager closes the file; the explicit close() was redundant
print("Close connection to file: {a}".format(a=csvfile.closed))
#Export the name of the response feature as plain text, using a context
#manager instead of a manual open()/close() pair
with open(output_path + "y_names.csv", "w") as f:
    f.write(y_train.name)
print("Close connection to file: {a}".format(a=f.closed))
Close connection to file: True Close connection to file: True Close connection to file: True
Now we are ready to start training the models.