In this section we will put the tuned model into action with data from new customers. First, we will prepare the new data loaded into MongoDB and make predictions on them. In real-world projects the real outcomes often arrive some time after the predictions are made, and can then be used to validate the performance of the model. In our case, we will simulate a situation in which the real values of the Exited feature become available some time after we have made the predictions, and we will compute a score against the real data to find out how well the model predicted the behaviour of the customers.
Let's start by preparing the new data loaded into MongoDB. We have already started the MongoDB daemon, so let's connect to the database, retrieve the relevant data and store it in a data frame.
import pymongo
import pandas as pd
import numpy as np
from IPython.display import display
import csv
import pickle
#Open a connection with mongodb
client = pymongo.MongoClient('localhost', 27017)
#get the database
print(client.list_database_names())
db = client.customers
#get the collection
print(db.list_collection_names())
colle = db.deploy_data
#count the documents of the collection
print(colle.count_documents({}))
['admin', 'config', 'customers', 'local', 'test']
['deploy_data', 'customers_data', 'legal_entity', 'natural_person']
1020
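As an aside, if the daemon were not running, the calls above would hang until a ServerSelectionTimeoutError is raised. A quick connectivity check like the following sketch (with a deliberately short timeout) fails fast instead:
#Sketch: verify the MongoDB daemon is reachable before querying;
#a short serverSelectionTimeoutMS makes a missing daemon fail fast
check = pymongo.MongoClient('localhost', 27017, serverSelectionTimeoutMS=2000)
try:
    check.admin.command('ping') #raises ServerSelectionTimeoutError if unreachable
    print("MongoDB is up")
finally:
    check.close()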
#query some documents just to have a look
list(colle.find().limit(2))
[{'_id': ObjectId('60cde74aaf6ab57dbe5cbd78'), 'CustomerId': 'E15683183', 'Name': 'E7090X', 'CreditScore': 684, 'Geography': 'Spain', 'Gender': '0', 'Age': 75, 'Tenure': 3, 'Balance': 999.0, 'NumOfProducts': 1, 'HasCrCard': 0, 'IsActiveMember': 1, 'EstimatedIncome': 1165675.74, 'person': 'entity'}, {'_id': ObjectId('60cde74aaf6ab57dbe5cbd79'), 'CustomerId': '15625083', 'Name': 'Abramov', 'CreditScore': 660, 'Geography': 'Germany', 'Gender': 'Female', 'Age': 38, 'Tenure': 5, 'Balance': 146337.88, 'NumOfProducts': 1, 'HasCrCard': 1, 'IsActiveMember': 1, 'EstimatedIncome': 7674.34, 'person': 'natural'}]
#query the whole collection and store it in a dataframe
datos=pd.DataFrame(list(colle.find()))
Now let's close the connection:
#close the connection
client.close()
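As a side note, MongoClient can also be used as a context manager, which closes the connection automatically even if an exception occurs. A minimal sketch of the same read in that style:
#Sketch: the same read in context-manager style; the connection is
#closed automatically when the block exits, even on error
with pymongo.MongoClient('localhost', 27017) as client:
    datos = pd.DataFrame(list(client.customers.deploy_data.find()))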
To start with, have a look at the data:
datos.head()
| | _id | CustomerId | Name | CreditScore | Geography | Gender | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | person |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 60cde74aaf6ab57dbe5cbd78 | E15683183 | E7090X | 684 | Spain | 0 | 75 | 3 | 999.00 | 1 | 0 | 1 | 1165675.74 | entity |
| 1 | 60cde74aaf6ab57dbe5cbd79 | 15625083 | Abramov | 660 | Germany | Female | 38 | 5 | 146337.88 | 1 | 1 | 1 | 7674.34 | natural |
| 2 | 60cde74aaf6ab57dbe5cbd7a | 15712974 | Hogarth | 761 | France | Female | 33 | 4 | 137480.88 | 1 | 1 | 1 | 53765.71 | natural |
| 3 | 60cde74aaf6ab57dbe5cbd7b | E15607817 | E8908X | 812 | Spain | 0 | 35 | 1 | 87219.88 | 2 | 1 | 1 | 1034466.96 | entity |
| 4 | 60cde74aaf6ab57dbe5cbd7c | E15601011 | E6431X | 642 | France | 0 | 66 | 9 | 999.00 | 3 | 1 | 1 | 1161182.14 | entity |
Some documents refer to legal entities. As we know, our model is focused on natural persons, so let's filter the dataframe:
print("Unique values of 'person' before filtering: {a}".format(a=datos["person"].unique()))
datos = datos[datos["person"]=="natural"]
print("Unique values of 'person' after filtering: {a}".format(a=datos["person"].unique()))
Unique values of 'person' before filtering: ['entity' 'natural']
Unique values of 'person' after filtering: ['natural']
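Note that this filter could also have been pushed down to MongoDB itself, so that only the relevant documents travel over the wire. A sketch of the equivalent query (assuming the connection were still open):
#Equivalent server-side filter (sketch): only natural persons are returned
datos = pd.DataFrame(list(colle.find({"person": "natural"})))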
As we did earlier to train the model, we will now create "dummy" (i.e. binary) variables from the categorical features. This time, however, we will not drop any feature in this step (except for the original categorical variables the dummies were extracted from). In particular, we will keep the IDs and names of the clients, because we will use them to match our predictions against the real outcomes when they arrive.
#create binary (1, 0) dummy variables
datos["Spain"]=np.where(datos["Geography"]=="Spain",1,0)
datos["Germany"]=np.where(datos["Geography"]=="Germany",1,0)
datos["Female"]=np.where(datos["Gender"]=="Female",1,0)
#drop the original categorical features
datos=datos.drop(columns="Geography")
datos=datos.drop(columns="Gender")
datos.head()
| | _id | CustomerId | Name | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | person | Spain | Germany | Female |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 60cde74aaf6ab57dbe5cbd79 | 15625083 | Abramov | 660 | 38 | 5 | 146337.88 | 1 | 1 | 1 | 7674.34 | natural | 0 | 1 | 1 |
| 2 | 60cde74aaf6ab57dbe5cbd7a | 15712974 | Hogarth | 761 | 33 | 4 | 137480.88 | 1 | 1 | 1 | 53765.71 | natural | 0 | 0 | 1 |
| 5 | 60cde74aaf6ab57dbe5cbd7d | 15744199 | Raber | 519 | 44 | 6 | 0.00 | 2 | 1 | 0 | 161254.29 | natural | 0 | 0 | 0 |
| 6 | 60cde74aaf6ab57dbe5cbd7e | 15679508 | Stevens | 677 | 18 | 4 | 153212.86 | 1 | 1 | 1 | 22365.03 | natural | 0 | 1 | 0 |
| 7 | 60cde74aaf6ab57dbe5cbd7f | 15730496 | Kharlamov | 774 | 55 | 6 | 79447.49 | 1 | 1 | 1 | 188899.10 | natural | 0 | 0 | 1 |
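As an aside, the same dummies could have been generated automatically with pd.get_dummies before dropping the original columns. A sketch (the generated names, e.g. Geography_Spain, would need renaming to match the training features, and the redundant Geography_France and Gender_Male columns would need dropping, which is why we used np.where above):
#Alternative sketch (to be run before dropping Geography and Gender):
#automatic dummy creation; names like Geography_Spain would need renaming
dummies = pd.get_dummies(datos[["Geography", "Gender"]])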
Now store CustomerId, Name and person in a separate dataframe, and keep only the model features in X_deploy:
X_deploy_ids = datos[["CustomerId","Name","person"]].copy() #copy() avoids a SettingWithCopyWarning when we add columns later
display(X_deploy_ids.head())
X_deploy = datos.drop(columns=["CustomerId","Name","person","_id"])
X_deploy.head()
| | CustomerId | Name | person |
|---|---|---|---|
| 1 | 15625083 | Abramov | natural |
| 2 | 15712974 | Hogarth | natural |
| 5 | 15744199 | Raber | natural |
| 6 | 15679508 | Stevens | natural |
| 7 | 15730496 | Kharlamov | natural |
| | CreditScore | Age | Tenure | Balance | NumOfProducts | HasCrCard | IsActiveMember | EstimatedIncome | Spain | Germany | Female |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 660 | 38 | 5 | 146337.88 | 1 | 1 | 1 | 7674.34 | 0 | 1 | 1 |
| 2 | 761 | 33 | 4 | 137480.88 | 1 | 1 | 1 | 53765.71 | 0 | 0 | 1 |
| 5 | 519 | 44 | 6 | 0.00 | 2 | 1 | 0 | 161254.29 | 0 | 0 | 0 |
| 6 | 677 | 18 | 4 | 153212.86 | 1 | 1 | 1 | 22365.03 | 0 | 1 | 0 |
| 7 | 774 | 55 | 6 | 79447.49 | 1 | 1 | 1 | 188899.10 | 0 | 0 | 1 |
Before feeding the data to the model, we have to make sure that the features are in the same order they had in the training data set. To check this, recall the list of feature names we stored earlier:
output_path = "/home/fabio/Documents/data_science_project_1/data/"
X_train_names=list()
#the with statement closes the file automatically, so no explicit close() is needed
with open(output_path+"X_names.csv", 'r') as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        X_train_names.extend(row)
print(X_train_names)
print()
print(list(X_deploy.columns))
print()
if X_train_names == list(X_deploy.columns):
    print("They are identical")
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedIncome', 'Spain', 'Germany', 'Female']

['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedIncome', 'Spain', 'Germany', 'Female']

They are identical
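Had the names matched as a set but not in order, a one-line reindex would have fixed it. A defensive sketch:
#Defensive sketch: force the deployment features into the training order;
#this raises a KeyError if any training feature is missing
X_deploy = X_deploy[X_train_names]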
Now we will load the tuned model and make predictions on the new data.
tuned_model_path = "/home/fabio/Documents/data_science_project_1/results/"
#Load the tuned model; the with statement closes the file automatically
with open(tuned_model_path+"tuned_model.pickle", 'rb') as f:
    loaded_model = pickle.load(f)
#Make predictions
y_pred=loaded_model.predict(X_deploy)
#Add the predictions to X_deploy_ids as a new column; insert() avoids the
#chained assignment X_deploy_ids["Predictions"] = y_pred
X_deploy_ids.insert(loc=3, column="Predictions", value=y_pred, allow_duplicates=False)
X_deploy_ids.head()
| | CustomerId | Name | person | Predictions |
|---|---|---|---|---|
| 1 | 15625083 | Abramov | natural | 0 |
| 2 | 15712974 | Hogarth | natural | 0 |
| 5 | 15744199 | Raber | natural | 0 |
| 6 | 15679508 | Stevens | natural | 0 |
| 7 | 15730496 | Kharlamov | natural | 0 |
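If the tuned model is a scikit-learn classifier exposing predict_proba (an assumption; not every estimator does), the churn probability can be stored alongside the hard 0/1 prediction, which is often more useful for prioritising retention actions. A sketch:
#Sketch (assumes the model exposes predict_proba): probability of class 1
y_prob = loaded_model.predict_proba(X_deploy)[:, 1]
X_deploy_ids.insert(loc=4, column="ChurnProbability", value=y_prob.round(3))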
Now let's validate the model by comparing the predictions with the real outcomes:
project_data_path = "/home/fabio/Documents/data_science_project_1/data/"
#Load the real outcomes
real_outcomes = pd.read_csv(project_data_path+"0_deployment_data_real_target.csv")
real_outcomes = real_outcomes[real_outcomes["person"]=="natural"]
real_outcomes.head()
| | CustomerId | Name | Exited | person |
|---|---|---|---|---|
| 0 | 15625083 | Abramov | 1 | natural |
| 2 | 15712974 | Hogarth | 1 | natural |
| 5 | 15679508 | Stevens | 0 | natural |
| 6 | 15744199 | Raber | 1 | natural |
| 7 | 15730496 | Kharlamov | 1 | natural |
To validate the model's predictions, we will compute the percentage of correct ones. To this end, we first join the real outcomes with the predicted values, and then count the rows where they agree.
joined = X_deploy_ids.merge(real_outcomes, how='inner', on=["CustomerId","Name","person"])
joined.head()
| | CustomerId | Name | person | Predictions | Exited |
|---|---|---|---|---|---|
| 0 | 15625083 | Abramov | natural | 0 | 1 |
| 1 | 15712974 | Hogarth | natural | 0 | 1 |
| 2 | 15744199 | Raber | natural | 0 | 1 |
| 3 | 15679508 | Stevens | natural | 0 | 0 |
| 4 | 15730496 | Kharlamov | natural | 0 | 1 |
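Since an inner join silently drops rows that find no match, it is worth checking that no prediction was lost. A minimal sketch:
#Sanity check (sketch): every prediction should have found its real outcome
assert len(joined) == len(X_deploy_ids), "some predictions lack a real outcome"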
#compute the score: 1 where the prediction matches the real outcome, 0 elsewhere
score = (joined["Exited"] == joined["Predictions"]).astype(int)
accuracy = round( ( sum(score) / len(score) )*100, 1)
print("The accuracy of the predictions is: {a}".format(a=str(accuracy)+"%"))
The accuracy of the predictions is: 80.3%
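The same figure, together with a more informative error breakdown, can be obtained with scikit-learn's metrics. A sketch:
from sklearn.metrics import accuracy_score, confusion_matrix
#Equivalent accuracy, plus a confusion matrix showing how the errors
#split between false positives and false negatives
print(round(accuracy_score(joined["Exited"], joined["Predictions"])*100, 1))
print(confusion_matrix(joined["Exited"], joined["Predictions"]))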
Now that we have validated the model on new data, we can build a pipeline that runs the whole prediction-validation process the way it would run in a real-world project, where predictive models are deployed as "business as usual". To this end, we will first prepare scripts that perform the data preparation, the prediction step and the validation step separately. This will enable us to run the whole process from a terminal.
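As a preview, a driver like the following sketch could chain the three steps; the script names are hypothetical placeholders for the scripts we are about to write:
import subprocess

#Sketch of a driver with hypothetical script names: each step runs as a
#separate process, mirroring how the pipeline would be launched from a terminal
for script in ["prepare_data.py", "predict.py", "validate.py"]:
    subprocess.run(["python", script], check=True)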