In this section we will build models with three algorithms: k-nearest neighbors, logistic regression, and decision tree. We will use the dataset we prepared in the previous steps of the project.
# Standard library
import copy
import csv
import pprint

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
First, let's import the datasets with the predictor features:
input_path = "/home/fabio/Documents/data_science_project_1/data/"
X_train = pd.read_csv(input_path+"X_train.csv", header=None)
print("X train:\n shape: {a}\n data:".format(a=X_train.shape))
display(X_train.head())
X_test = pd.read_csv(input_path+"X_test.csv", header=None)
print("X test:\n shape: {a}\n data:".format(a=X_test.shape))
display(X_test.head())
X train:
 shape: (7500, 11)
 data:

|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|----|
| 0 | 599.0 | 30.0 | 9.0 | 105443.68 | 1.0 | 1.0 | 1.0 | 121124.53 | 0.0 | 0.0 | 0.0 |
| 1 | 684.0 | 41.0 | 6.0 | 135203.81 | 2.0 | 1.0 | 1.0 | 121967.88 | 0.0 | 0.0 | 1.0 |
| 2 | 687.0 | 40.0 | 1.0 | 0.00 | 2.0 | 1.0 | 0.0 | 8207.36 | 1.0 | 0.0 | 1.0 |
| 3 | 850.0 | 38.0 | 3.0 | 0.00 | 2.0 | 0.0 | 1.0 | 179360.76 | 1.0 | 0.0 | 1.0 |
| 4 | 597.0 | 60.0 | 0.0 | 78539.84 | 1.0 | 0.0 | 1.0 | 48502.88 | 0.0 | 1.0 | 1.0 |
X test:
 shape: (2500, 11)
 data:

|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 |
|---|---|---|---|---|---|---|---|---|---|---|----|
| 0 | 546.0 | 25.0 | 7.0 | 127728.24 | 2.0 | 1.0 | 1.0 | 105279.74 | 1.0 | 0.0 | 0.0 |
| 1 | 850.0 | 35.0 | 9.0 | 0.00 | 2.0 | 0.0 | 0.0 | 25329.48 | 0.0 | 0.0 | 1.0 |
| 2 | 698.0 | 45.0 | 5.0 | 164450.94 | 1.0 | 1.0 | 0.0 | 141970.02 | 1.0 | 0.0 | 0.0 |
| 3 | 635.0 | 32.0 | 8.0 | 0.00 | 2.0 | 1.0 | 1.0 | 19367.98 | 1.0 | 0.0 | 1.0 |
| 4 | 496.0 | 41.0 | 1.0 | 176024.05 | 2.0 | 1.0 | 0.0 | 182337.98 | 0.0 | 0.0 | 1.0 |
Now let's import the datasets with the target feature:
def load_y(path, file_name):
    print(file_name)
    dataset = pd.read_csv(path + file_name, header=None)
    dataset = np.array(dataset)
    print(dataset)
    dataset_shape = dataset.shape
    print(dataset_shape)
    print("Reshaped dataset:")
    # Flatten the (1, n) array read from the CSV into a 1-D array of length n
    dataset = np.reshape(dataset, newshape=dataset_shape[1])
    print(dataset)
    print(dataset.shape)
    return dataset
y_train = load_y(input_path,"y_train.csv")
print()
y_test = load_y(input_path,"y_test.csv")
y_train.csv
[[0 0 0 ... 0 1 0]]
(1, 7500)
Reshaped dataset:
[0 0 0 ... 0 1 0]
(7500,)

y_test.csv
[[0 0 1 ... 0 0 0]]
(1, 2500)
Reshaped dataset:
[0 0 1 ... 0 0 0]
(2500,)
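As an aside, the reshape step can be written more compactly, since pandas and NumPy can flatten the (1, n) array in a single call. A minimal sketch (load_y_compact is a hypothetical name, not used below):

def load_y_compact(path, file_name):
    # .to_numpy().ravel() turns the (1, n) frame into a 1-D array of length n
    return pd.read_csv(path + file_name, header=None).to_numpy().ravel()

# same arrays as above, without the diagnostic prints
assert (load_y_compact(input_path, "y_train.csv") == y_train).all()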
Let's recall the names of the features:
for name_file in ["X_names.csv", "y_names.csv"]:
    print(name_file)
    # the with statement closes the file automatically
    with open(input_path + name_file, 'r') as csvfile:
        for row in csv.reader(csvfile):
            print(row)
X_names.csv
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedIncome', 'Spain', 'Germany', 'Female']
y_names.csv
['Exited']
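If we wanted the tables above to show these names instead of integer column indices, we could attach them to the DataFrames. A purely cosmetic sketch, assuming X_names.csv holds the single row shown above:

with open(input_path + "X_names.csv", 'r') as csvfile:
    X_names = next(csv.reader(csvfile))  # the one and only row
X_train.columns = X_names
X_test.columns = X_names
display(X_train.head())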
Now let's prepare a pipeline to train and test three models. First, we will define a dictionary containing the names of the three algorithms and the corresponding Python commands; we will update this dictionary with the accuracy values obtained from training and testing. Then we will define one function to train and test the models, and another to visually compare the performance of the three methods.
methods_dict = {"method": ["KNN", "LogReg", "Tree"],
                "command": ["KNeighborsClassifier()", "LogisticRegression()", "DecisionTreeClassifier(random_state=123)"],
                "train_accuracy": [],
                "test_accuracy": []}
def train_test_models(definitions):
    for i in range(len(definitions["command"])):
        method_name = definitions["method"][i]
        print("Train/test with {foo}".format(foo=method_name))
        # Build the classifier by evaluating its command string
        clf = eval(definitions["command"][i])
        clf.fit(X_train, y_train)
        train_score = round(clf.score(X_train, y_train), 5)
        y_pred = clf.predict(X_test)
        # Test accuracy: fraction of correct predictions
        test_score = round(sum(y_pred == y_test) / len(y_test), 5)
        definitions["train_accuracy"].append(train_score)
        definitions["test_accuracy"].append(test_score)
def see_accuracy(definitions):
    fig, ax = plt.subplots()
    ax.plot(definitions["train_accuracy"], color="b", linestyle="--", marker="o", label="Train accuracy")
    ax.plot(definitions["test_accuracy"], color="r", marker="v", label="Test accuracy")
    ax.set_xticks([0, 1, 2], minor=False)
    ax.set_xticklabels(definitions["method"])
    ax.set_xlabel("Method")
    ax.set_ylabel("Accuracy")
    ax.legend(loc='upper center')
    plt.show()
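One remark on the design before running it: eval() keeps the dictionary compact, but it executes arbitrary strings, which is fragile. An equally simple alternative is to store the estimator objects themselves; a sketch (methods_dict_alt is a hypothetical name, not used below):

methods_dict_alt = {"method": ["KNN", "LogReg", "Tree"],
                    "command": [KNeighborsClassifier(),
                                LogisticRegression(),
                                DecisionTreeClassifier(random_state=123)],
                    "train_accuracy": [],
                    "test_accuracy": []}
# train_test_models would then read clf = definitions["command"][i] directly,
# with no eval() call.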
train_test_models(methods_dict)
see_accuracy(methods_dict)
Train/test with KNN
Train/test with LogReg
Train/test with Tree
The result of the 'Tree' method indicates that the model is overfitted: its training accuracy is far higher than its test accuracy. This occurs because we implemented the algorithm with no arguments controlling the complexity of the model. Therefore, we will now introduce the argument "max_depth" to limit the depth of the tree, and set it equal to 5 (this choice is arbitrary; a more systematic way to pick the depth is sketched at the end of this section). First, we will create a copy of the dictionary we created earlier and update its elements. Then we will use the functions defined above to train and test the models and to display the results.
methods_dict_2 = copy.deepcopy(methods_dict)
# Splice max_depth=5 into the tree's command string
methods_dict_2["command"][2] = methods_dict_2["command"][2].replace(")", ",max_depth=5)")
methods_dict_2["train_accuracy"] = []
methods_dict_2["test_accuracy"] = []
pprint.pprint(methods_dict_2)
{'command': ['KNeighborsClassifier()',
             'LogisticRegression()',
             'DecisionTreeClassifier(random_state=123,max_depth=5)'],
 'method': ['KNN', 'LogReg', 'Tree'],
 'test_accuracy': [],
 'train_accuracy': []}
train_test_models(methods_dict_2)
see_accuracy(methods_dict_2)
Train/test with KNN
Train/test with LogReg
Train/test with Tree
As we can see, limiting the depth of the tree reduces overfitting: the accuracy on the training set drops, but the accuracy on the test set improves. We can see this more clearly by placing the two results side by side in a figure whose panels share the y-axis scale:
fig, ax = plt.subplots(1, 2, sharey=True, figsize=[9, 5])
ax[0].plot(methods_dict["train_accuracy"], color="b", linestyle="--", marker="o", label="Train accuracy")
ax[0].plot(methods_dict["test_accuracy"], color="r", marker="v", label="Test accuracy")
ax[0].set_title("'Overfitted Tree' method")
ax[0].set_xticks([0, 1, 2], minor=False)
ax[0].set_xticklabels(methods_dict["method"])
ax[0].set_xlabel("Method")
ax[0].set_ylabel("Accuracy")
ax[0].legend(loc='upper center')
ax[1].plot(methods_dict_2["train_accuracy"], color="b", linestyle="--", marker="o", label="Train accuracy")
ax[1].plot(methods_dict_2["test_accuracy"], color="r", marker="v", label="Test accuracy")
ax[1].set_title("'Pruned Tree' method")
ax[1].set_xticks([0, 1, 2], minor=False)
ax[1].set_xticklabels(methods_dict_2["method"])
ax[1].set_xlabel("Method")
ax[1].legend(loc='upper center')
plt.show()
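Finally, since max_depth=5 was arbitrary, a natural next step is to let the data suggest the depth. A minimal sketch, reusing the arrays loaded above (the range of depths is an assumption, not a tuned choice):

# Scan a range of depths and print train/test accuracy for each
for d in range(1, 16):
    tree = DecisionTreeClassifier(random_state=123, max_depth=d)
    tree.fit(X_train, y_train)
    print("max_depth={d}: train={tr}, test={te}".format(
        d=d,
        tr=round(tree.score(X_train, y_train), 5),
        te=round(tree.score(X_test, y_test), 5)))

Note that in a stricter setup the depth would be selected on a separate validation split, or via cross-validation, keeping the test set untouched until the final evaluation.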