import pymongo
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
from sklearn import model_selection
from IPython.display import display
import csv
At this point, we have already started the MongoDB daemon. First, we will open a connection with MongoDB, then we will get the database and the collection. Subsequently, we will check the number of documents in the collection and have a look at some of them. Finally, we will store the collection into a dataframe.
# Open a client connection to the local MongoDB daemon (default port 27017)
client = pymongo.MongoClient('localhost', 27017)
# List the available databases, then select the 'customers' database
print(client.list_database_names())
db = client.customers
# List its collections, then select the 'natural_person' collection
print(db.list_collection_names())
colle = db.natural_person
# Count all documents in the collection ({} matches every document)
print(colle.count_documents({}))
['admin', 'config', 'customers', 'local', 'test'] ['customers_data', 'legal_entity', 'natural_person'] 10000
# Query a few documents just to have a look. Using a single cursor with
# limit(3) instead of indexing colle.find() inside a comprehension, which
# opened a brand-new cursor for every index.
list(colle.find().limit(3))
[{'_id': ObjectId('60152a83127b7b1168e813d2'), 'CustomerId': '15683544', 'Name': 'Buccho', 'CreditScore': 626, 'Geography': 'Spain', 'Gender': 'Male', 'Age': 62, 'Tenure': 3, 'Balance': 0.0, 'NumOfProducts': 1, 'HasCrCard': 1, 'IsActiveMember': 1, 'EstimatedIncome': 65010.74, 'Exited': 0, 'person': 'natural'}, {'_id': ObjectId('60152a83127b7b1168e813d3'), 'CustomerId': '15737489', 'Name': 'Ramsden', 'CreditScore': 610, 'Geography': 'Spain', 'Gender': 'Female', 'Age': 46, 'Tenure': 5, 'Balance': 116886.59, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 0, 'EstimatedIncome': 107973.44, 'Exited': 0, 'person': 'natural'}, {'_id': ObjectId('60152a83127b7b1168e813d4'), 'CustomerId': '15642391', 'Name': 'Lettiere', 'CreditScore': 621, 'Geography': 'Germany', 'Gender': 'Male', 'Age': 51, 'Tenure': 4, 'Balance': 109978.83, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 0, 'EstimatedIncome': 177740.58, 'Exited': 1, 'person': 'natural'}]
# Fetch the whole collection and materialize it into a DataFrame
# (each document becomes one row; field names become the columns)
datos=pd.DataFrame(list(colle.find()))
Now let's close the connection:
# Release the MongoDB connection now that the data is held in memory
client.close()
To start with, have a look at the data and let's explore the types of the data and how many unique values there are in each feature:
# Preview the first five rows of the DataFrame
datos.head()
_id | CustomerId | Name | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Exited | person | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 60152a83127b7b1168e813d2 | 15683544 | Buccho | 626 | Spain | Male | 62 | 3 | 0.00 | 1 | 1 | 1 | 65010.74 | 0 | natural |
1 | 60152a83127b7b1168e813d3 | 15737489 | Ramsden | 610 | Spain | Female | 46 | 5 | 116886.59 | 1 | 0 | 0 | 107973.44 | 0 | natural |
2 | 60152a83127b7b1168e813d4 | 15642391 | Lettiere | 621 | Germany | Male | 51 | 4 | 109978.83 | 1 | 0 | 0 | 177740.58 | 1 | natural |
3 | 60152a83127b7b1168e813d5 | 15578788 | Bibi | 786 | Spain | Male | 40 | 6 | 0.00 | 2 | 0 | 0 | 41248.80 | 0 | natural |
4 | 60152a83127b7b1168e813d6 | 15693322 | Shaver | 635 | Germany | Female | 37 | 9 | 146748.07 | 1 | 0 | 1 | 11407.58 | 0 | natural |
#see the types of data in the data frame and their unique values
# Build {column name: [unique-value count, dtype]} with a dict comprehension
# over column names instead of a positional range(len(...)) loop
see_unique = {
    col: [len(datos[col].unique()), str(datos[col].dtype)]
    for col in datos.columns
}
# Print each feature with its unique-value count and dtype
for name, info in see_unique.items():
    print(str(name) + " : " + str(info))
_id : [10000, 'object'] CustomerId : [10000, 'object'] Name : [2932, 'object'] CreditScore : [460, 'int64'] Geography : [3, 'object'] Gender : [2, 'object'] Age : [70, 'int64'] Tenure : [11, 'int64'] Balance : [6382, 'float64'] NumOfProducts : [4, 'int64'] HasCrCard : [2, 'int64'] IsActiveMember : [2, 'int64'] EstimatedIncome : [9999, 'float64'] Exited : [2, 'int64'] person : [1, 'object']
We know that person is an identifier indicating if the client is a natural person or a legal entity. We have already selected only the natural persons, so the feature has just a single unique value. Now let's see the unique values of the other features that have few unique values:
# Inspect the distinct values of the low-cardinality features
some_variables = ["Gender", "Geography", "NumOfProducts", "HasCrCard", "IsActiveMember", "Exited"]
for feature in some_variables:
    print(datos[feature].unique())
['Male' 'Female'] ['Spain' 'Germany' 'France'] [1 2 3 4] [1 0] [1 0] [0 1]
Exited is the binary (0, 1) response feature. Let's explore the relationships between the response feature and the other features listed here. We can do this with simple bar plots, using the plot method provided by pandas:
# Count the (feature value, Exited) combinations for each categorical feature
counts = {}
for feature in ["Gender", "Geography", "NumOfProducts", "HasCrCard", "IsActiveMember"]:
    counts[feature] = datos[[feature, 'Exited']].groupby([feature, 'Exited']).size()

fig, ax = plt.subplots(nrows=3, ncols=2, figsize=[7, 10])  # figure size in inches
# Explicit (row, col) subplot position for each feature, in the same order as
# `counts`; this replaces the previous if/elif ladder over loop indices
positions = [(0, 0), (1, 1), (0, 1), (1, 0), (2, 0)]
for (row, col), feature in zip(positions, counts):
    counts[feature].plot(kind='bar', color=['green', 'green', 'blue', 'blue'], ax=ax[row, col])
fig.delaxes(ax[2, 1])  # delete the unused subplot
plt.tight_layout()
To explore the relationships between the response Exited and the other features (which have many different values) we need another kind of plot. Boxplots are convenient to compare continuous variables (or discrete variables with many different values) with categorical variables. Let's use the method provided by pandas to plot a boxplot for each feature:
# One boxplot per continuous (or many-valued) feature, grouped by Exited;
# ax.flat walks the 2x2 grid row by row, matching the feature order
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=[7, 7])
for axis, feature in zip(ax.flat, ['Age', 'Tenure', 'Balance', 'EstimatedIncome']):
    datos.boxplot(column=feature, by='Exited', ax=axis)
# tight_layout adjusts the subplot margins automatically
plt.tight_layout()
The bar plots suggest that the number of products that customers have influences their behaviour, i.e. those who have 2 products are less likely to exit compared to those who have one product. Similarly, the customers less likely to leave seem to be those who have a credit card, are active clients, and are based in France. The boxplots suggest there is some effect of the age of the clients and of their balance. In the subsequent analyses we may consider removing some variables that seem less important. However, the algorithms that we will use might find some patterns in the data that we cannot see now with this simple visual analysis.
Now let's prepare the data to train the model. First, we will create "dummy" (i.e. binary) variables from the categorical features we have explored above. For each categorical feature we will create n-1 dummy variables, where n is the number of levels of the original feature. This is necessary in linear models (this is sometimes called "the dummy variable trap"). But it is sensible in any case, and we will do so even when we use non-linear algorithms (e.g. decision trees), although it may not be considered a definite assumption or requirement of this kind of algorithm.
Moreover, we will drop some features that are useless to train the model.
# Create binary (1, 0) dummy columns: n-1 dummies per categorical feature
# (Spain/Germany encode Geography with France as the baseline; Female encodes
# Gender with Male as the baseline)
datos["Spain"] = np.where(datos["Geography"] == "Spain", 1, 0)
datos["Germany"] = np.where(datos["Geography"] == "Germany", 1, 0)
datos["Female"] = np.where(datos["Gender"] == "Female", 1, 0)
# Drop the original categorical features together with the identifier columns
# that carry no predictive signal
datos = datos.drop(columns=["Geography", "Gender"])
datos = datos.drop(columns=["_id", "CustomerId", "Name", "person"])
datos.head()
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Exited | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 626 | 62 | 3 | 0.00 | 1 | 1 | 1 | 65010.74 | 0 | 1 | 0 | 0 |
1 | 610 | 46 | 5 | 116886.59 | 1 | 0 | 0 | 107973.44 | 0 | 1 | 0 | 1 |
2 | 621 | 51 | 4 | 109978.83 | 1 | 0 | 0 | 177740.58 | 1 | 0 | 1 | 0 |
3 | 786 | 40 | 6 | 0.00 | 2 | 0 | 0 | 41248.80 | 0 | 1 | 0 | 0 |
4 | 635 | 37 | 9 | 146748.07 | 1 | 0 | 1 | 11407.58 | 0 | 0 | 1 | 1 |
Finally, let's prepare the train and test data sets; for that, we will take 25% of the original data as the test set. We will also compute the target rate (i.e. the percentage of "1" in the response feature) to check whether the train and test subsets end up with the same target rate.
#define a function to calculate the target rate
def print_rate(df, target_name):
    """Return a message with the target rate of a data set.

    The target rate is the fraction of (non-missing) observations whose
    target value equals 1.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        For a DataFrame the target is column ``target_name``; for a Series
        the values themselves are the target.
    target_name : str
        Name of the target column (ignored when ``df`` is a Series).

    Returns
    -------
    str
        A human-readable message with the computed rate.
    """
    # Select the target values regardless of the input's dimensionality
    target = df[target_name] if df.ndim == 2 else df
    # dropna mimics value_counts() (which excludes NaN); taking the mean of
    # the boolean mask gives the share of 1s directly and, unlike the old
    # index-based lookup, does not raise IndexError when no 1s are present
    rate = (target.dropna() == 1).mean()
    return "The target rate is {a}".format(a=str(rate))
# Split the data 75/25 into train and test sets, shuffled with a fixed seed
# for reproducibility
X, Y = datos[datos.columns[datos.columns != "Exited"]], datos['Exited']
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, Y, test_size=0.25, shuffle=True, random_state=123)
# Show the head of each subset
for label, subset in [("X_train", X_train), ("X_test", X_test),
                      ("y_train", y_train), ("y_test", y_test)]:
    print()
    print(label + ": ")
    display(subset.head())
# Compare the target rates of the original data and the two subsets
print()
print("Original data; {a}".format(a=print_rate(datos, "Exited")))
print("y_train; {a}".format(a=print_rate(y_train, "Exited")))
print("y_test; {a}".format(a=print_rate(y_test, "Exited")))
X_train:
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|
9174 | 599 | 30 | 9 | 105443.68 | 1 | 1 | 1 | 121124.53 | 0 | 0 | 0 |
4379 | 684 | 41 | 6 | 135203.81 | 2 | 1 | 1 | 121967.88 | 0 | 0 | 1 |
541 | 687 | 40 | 1 | 0.00 | 2 | 1 | 0 | 8207.36 | 1 | 0 | 1 |
7251 | 850 | 38 | 3 | 0.00 | 2 | 0 | 1 | 179360.76 | 1 | 0 | 1 |
1135 | 597 | 60 | 0 | 78539.84 | 1 | 0 | 1 | 48502.88 | 0 | 1 | 1 |
X_test:
CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Spain | Germany | Female | |
---|---|---|---|---|---|---|---|---|---|---|---|
2656 | 546 | 25 | 7 | 127728.24 | 2 | 1 | 1 | 105279.74 | 1 | 0 | 0 |
445 | 850 | 35 | 9 | 0.00 | 2 | 0 | 0 | 25329.48 | 0 | 0 | 1 |
9505 | 698 | 45 | 5 | 164450.94 | 1 | 1 | 0 | 141970.02 | 1 | 0 | 0 |
332 | 635 | 32 | 8 | 0.00 | 2 | 1 | 1 | 19367.98 | 1 | 0 | 1 |
4168 | 496 | 41 | 1 | 176024.05 | 2 | 1 | 0 | 182337.98 | 0 | 0 | 1 |
y_train:
9174 0 4379 0 541 0 7251 0 1135 0 Name: Exited, dtype: int64
y_test:
2656 0 445 0 9505 1 332 1 4168 0 Name: Exited, dtype: int64
Original data; The target rate is 0.2037 y_train; The target rate is 0.2044 y_test; The target rate is 0.2016
Finally, let's export the four data sets to CSV files:
output_path = "/home/fabio/Documents/data_science_project_1/data/"
#Export the data
file_names = ['X_train', 'X_test', 'y_train', 'y_test']
# Map each file name to its object explicitly; eval(fname) was fragile and
# unsafe, and was being evaluated twice per iteration
data_sets = dict(zip(file_names, [X_train, X_test, y_train, y_test]))
for fname, data in data_sets.items():
    with open(output_path + fname + ".csv", 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        row_list = data.values.tolist()
        # a Series flattens to a list of scalars -> write them as one row;
        # a DataFrame flattens to a list of row-lists -> write them all
        if row_list and not isinstance(row_list[0], list):
            writer.writerow(row_list)
        else:
            writer.writerows(row_list)
    # the with-block has already closed the file at this point
    print("Close connection to file: {a}".format(a=csvfile.closed))
#Export the names of the features
X_names = X_train.columns.to_list()
with open(output_path + "X_names.csv", 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(X_names)
# the context manager closes the file; the explicit close() was redundant
print("Close connection to file: {a}".format(a=csvfile.closed))
#Export the name of the response feature as plain text, using a context
#manager instead of a manual open()/close() pair
with open(output_path + "y_names.csv", "w") as f:
    f.write(y_train.name)
print("Close connection to file: {a}".format(a=f.closed))
Close connection to file: True Close connection to file: True Close connection to file: True
Now we are ready to start training the models.