In this exercise we will build models with three algorithms (k-nearest neighbors, logistic regression and decision tree) and compare them with the lift curve and the cumulative accuracy profile (CAP). Lift curves and CAPs measure how well a model discriminates events (e.g. "no" vs. "yes", "it happens" vs. "it doesn't happen"), based on the probabilities predicted by the model. So, in this case, the prediction is not the class each data point belongs to (e.g. "no" and "yes", "0" and "1"), but the probability of each data point belonging to each class.
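As a reminder of the difference, here is a minimal sketch (with made-up toy data, not the data of this exercise) of how a scikit-learn classifier returns class labels with predict and class probabilities with predict_proba:

from sklearn.linear_model import LogisticRegression

X_toy = [[0.0], [1.0], [2.0], [3.0]]   # made-up feature values
y_toy = [0, 0, 1, 1]                   # made-up binary target
toy_clf = LogisticRegression().fit(X_toy, y_toy)
print(toy_clf.predict(X_toy))          # predicted classes, e.g. [0 0 1 1]
print(toy_clf.predict_proba(X_toy))    # one row per point: [P(class 0), P(class 1)]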
We will use the train and test datasets we prepared earlier.
First, let's import the relevant modules and data:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import random
import csv
input_path = "/home/fabio/Documents/data_science_project_1/data/"
X_train = pd.read_csv(input_path+"X_train.csv", header=None)
print("X train:\n shape: {a}\n data:".format(a=X_train.shape))
display(X_train.head())
X_test = pd.read_csv(input_path+"X_test.csv", header=None)
print("X test:\n shape: {a}\n data:".format(a=X_test.shape))
display(X_test.head())
def load_y(path, file_name):
    print(file_name)
    dataset = pd.read_csv(path+file_name, header=None)
    dataset = np.array(dataset)
    print(dataset)
    dataset_shape = dataset.shape
    print(dataset_shape)
    print("Reshaped dataset:")
    # The target is stored as a single row: flatten it from shape (1, n) to (n,)
    dataset = np.reshape(dataset, newshape=dataset_shape[1])
    print(dataset)
    print(dataset.shape)
    return dataset
y_train = load_y(input_path,"y_train.csv")
print()
y_test = load_y(input_path,"y_test.csv")
X train:
 shape: (7500, 11)
 data:

| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 599.0 | 30.0 | 9.0 | 105443.68 | 1.0 | 1.0 | 1.0 | 121124.53 | 0.0 | 0.0 | 0.0 |
| 1 | 684.0 | 41.0 | 6.0 | 135203.81 | 2.0 | 1.0 | 1.0 | 121967.88 | 0.0 | 0.0 | 1.0 |
| 2 | 687.0 | 40.0 | 1.0 | 0.00 | 2.0 | 1.0 | 0.0 | 8207.36 | 1.0 | 0.0 | 1.0 |
| 3 | 850.0 | 38.0 | 3.0 | 0.00 | 2.0 | 0.0 | 1.0 | 179360.76 | 1.0 | 0.0 | 1.0 |
| 4 | 597.0 | 60.0 | 0.0 | 78539.84 | 1.0 | 0.0 | 1.0 | 48502.88 | 0.0 | 1.0 | 1.0 |
X test:
 shape: (2500, 11)
 data:

| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 546.0 | 25.0 | 7.0 | 127728.24 | 2.0 | 1.0 | 1.0 | 105279.74 | 1.0 | 0.0 | 0.0 |
| 1 | 850.0 | 35.0 | 9.0 | 0.00 | 2.0 | 0.0 | 0.0 | 25329.48 | 0.0 | 0.0 | 1.0 |
| 2 | 698.0 | 45.0 | 5.0 | 164450.94 | 1.0 | 1.0 | 0.0 | 141970.02 | 1.0 | 0.0 | 0.0 |
| 3 | 635.0 | 32.0 | 8.0 | 0.00 | 2.0 | 1.0 | 1.0 | 19367.98 | 1.0 | 0.0 | 1.0 |
| 4 | 496.0 | 41.0 | 1.0 | 176024.05 | 2.0 | 1.0 | 0.0 | 182337.98 | 0.0 | 0.0 | 1.0 |
y_train.csv
[[0 0 0 ... 0 1 0]]
(1, 7500)
Reshaped dataset:
[0 0 0 ... 0 1 0]
(7500,)

y_test.csv
[[0 0 1 ... 0 0 0]]
(1, 2500)
Reshaped dataset:
[0 0 1 ... 0 0 0]
(2500,)
Let's remember the names of the features:
for _ in ["X_names.csv", "y_names.csv"]:
    print(_)
    with open(input_path+_, 'r') as csvfile:
        for row in csv.reader(csvfile):
            print(row)
X_names.csv
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedIncome', 'Spain', 'Germany', 'Female']
y_names.csv
['Exited']
Now we will prepare a pipeline to train and test the three models. First, we will define a dictionary containing the names of the three algorithms and the corresponding Python commands. Then, we will define a function to compute the lift and CAP values, and another function to visualize them.
methods_dict={"method":["KNN","LogReg","Tree"],
"command":["KNeighborsClassifier()","LogisticRegression()",
"DecisionTreeClassifier(random_state=123,max_depth=5)"]}
# Define the function "lift"
# "predictions" is a DataFrame with two columns (named 0 and 1, in this order)
# "target" is a list with the real values of the target variable in the test dataset
def lift(predictions, target, quantiles):
    predictions["real"] = target
    # Random tie-breaker, so that equal probabilities are ordered reproducibly
    random.seed(123)
    predictions["aleatory"] = random.sample(range(len(predictions)), len(predictions))
    predictions = predictions.iloc[:, 1:4]
    # Sort by the predicted probability of the positive class, from highest to lowest
    predictions = predictions.sort_values(by=[1, "aleatory"], ascending=False)
    # Split the sorted population into the requested number of quantiles
    bins = pd.cut(np.array(range(len(predictions))), quantiles, right=True, labels=False)
    predictions["bins"] = bins
    results = predictions.groupby(by="bins", axis=0, as_index=False).sum()
    results["sample_size"] = predictions.groupby(by="bins", axis=0, as_index=False).count().iloc[:, 1]
    results["real_cumsum"] = np.cumsum(results["real"])
    results["real_cumsum_rate"] = results["real_cumsum"] / sum(target)
    # Number of events expected in each bin under the random model
    mean_rate = sum(target) / len(results)
    results["mean_rate"] = mean_rate
    results["lift"] = results["real"] / mean_rate
    mylist = []
    for i in range(len(results)):
        mylist.append(results["real_cumsum"][i] / (mean_rate * (i + 1)))
    results["lift_cum"] = mylist
    results = results.drop(columns=[1, "aleatory"])
    return results
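To get a feeling for what the function returns, here is a small, purely illustrative check with made-up probabilities and targets (the names toy_prob_yes, toy_pred and toy_target are not part of the exercise): 10 data points are split into 5 quantiles of 2 points each, and the output has one row per bin, with the bin number and the columns real, sample_size, real_cumsum, real_cumsum_rate, mean_rate, lift and lift_cum.

# Illustrative sanity check of "lift" (made-up numbers, not the exercise data)
toy_prob_yes = [0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.05]
toy_pred = pd.DataFrame({0: [1 - p for p in toy_prob_yes], 1: toy_prob_yes})
toy_target = [1, 1, 0, 1, 0, 0, 0, 1, 0, 0]
print(lift(predictions=toy_pred, target=toy_target, quantiles=5))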
# Define the function "plot_lift"
# "arg1" is a DataFrame created with the function "lift"
def plot_lift(arg1):
    fig, ax = plt.subplots(1, 2)
    positions = np.arange(len(arg1["lift_cum"])) + 1
    ax[0].set_xlim(right=max(arg1["lift_cum"]) + 1)
    ax[0].barh(y=positions, width=arg1["lift_cum"])
    ax[0].set_xlabel("Cumulated lift")
    ax[0].set_ylabel("Bins")
    ax[0].set_title("Lift")
    for y, x in enumerate(arg1["lift_cum"]):
        ax[0].text(x + 0.1, y + 1, str(round(x, 2)))
    ax[1].set_xlim(right=max(arg1["real_cumsum_rate"]) + 1)
    ax[1].barh(y=positions, width=arg1["real_cumsum_rate"])
    ax[1].set_xlabel("Cumulated captured target")
    ax[1].set_title("Captured target")
    for y, x in enumerate(arg1["real_cumsum_rate"]):
        ax[1].text(x + 0.1, y + 1, str(round(x, 2)))
    plt.show()
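The captured-target chart is the binned version of the CAP. If you prefer the classic CAP as a line against the diagonal of the random model, a minimal sketch (an optional helper, not used by the pipeline below, that assumes a results DataFrame produced by lift) could look like this:

# Optional helper (not used below): draw the CAP as a line, with the random model as reference
def plot_cap(results):
    fraction = (np.arange(len(results)) + 1) / len(results)   # cumulative share of population
    plt.plot(np.insert(fraction, 0, 0),
             np.insert(np.array(results["real_cumsum_rate"]), 0, 0),
             marker="o", label="model")
    plt.plot([0, 1], [0, 1], linestyle="--", label="random model")
    plt.xlabel("Fraction of population")
    plt.ylabel("Fraction of captured target")
    plt.title("Cumulative accuracy profile")
    plt.legend()
    plt.show()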
# Pipeline to train and test the three models
def train_test_models(definitions):
    for i in range(len(definitions["command"])):
        newStr = definitions["method"][i]
        print("Train/test with {foo}".format(foo=newStr))
        clf = eval(definitions["command"][i])
        clf.fit(X_train, y_train)
        pred = pd.DataFrame(clf.predict_proba(X_test))
        results = lift(predictions=pred, target=list(y_test), quantiles=10)
        plot_lift(results)
Now let's run the pipeline:
train_test_models(methods_dict)
Train/test with KNN
Train/test with LogReg
Train/test with Tree
Before the lift is calculated, the test dataset is ordered from the highest predicted probability to the lowest.
We are interested in the values of the first quantiles (the y axis of the charts, labelled "bins"). Quantile 1 contains the 10% of the test population with the highest predicted probabilities. In this quantile, the decision tree achieves a lift of 3.91 (that is to say, the model is 3.91 times better than the random model) and a captured target of 0.39 (meaning that 39% of the real events are concentrated in this quantile). These values are higher than the corresponding values obtained with the k-NN algorithm and the logistic regression. Moreover, if we compare the profiles of the two charts across the three methods, we can see that the decision tree tends to have higher values in the first quantiles. All this suggests that the decision tree is the best candidate classifier in this case.
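As a quick sanity check on these numbers: with 10 quantiles, a random model captures on average 10% of the events in each bin, so the cumulative lift of the first bin is simply the captured-target rate divided by 0.10.

# Lift of the first bin = captured target rate / expected rate under the random model
captured_rate_bin1 = 0.39   # share of real events captured in quantile 1 by the decision tree
random_rate_bin1 = 1 / 10   # a random model captures 1/10 of the events per bin
print(round(captured_rate_bin1 / random_rate_bin1, 2))   # 3.9, consistent (up to rounding) with the 3.91 above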