import pandas as pd
import numpy as np
import csv
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt


input_path = "/home/fabio/Documents/data_science_project_1/data/"

X_train = pd.read_csv(input_path+"X_train.csv", header=None)
print("X train:\n shape: {a}\n data:".format(a=X_train.shape))
display(X_train.head())

X_test = pd.read_csv(input_path+"X_test.csv", header=None)
print("X test:\n shape: {a}\n data:".format(a=X_test.shape))
display(X_test.head())

def load_y(path,file_name):
    print(file_name)
    dataset = pd.read_csv(path+file_name, header=None)
    dataset = np.array(dataset)
    print(dataset)
    dataset_shape = dataset.shape
    print(dataset_shape)
    print("Reshaped dataset:")
    dataset = np.reshape(dataset, newshape=dataset_shape[1],)
    print(dataset)
    print(dataset.shape)
    return dataset

y_train = load_y(input_path,"y_train.csv")
print()
y_test = load_y(input_path,"y_test.csv")

X train:
 shape: (7500, 11)
 data:

X test:
 shape: (2500, 11)
 data:

y_train.csv
[[0 0 0 ... 0 1 0]]
(1, 7500)
Reshaped dataset:
[0 0 0 ... 0 1 0]
(7500,)

y_test.csv
[[0 0 1 ... 0 0 0]]
(1, 2500)
Reshaped dataset:
[0 0 1 ... 0 0 0]
(2500,)


for _ in ["X_names.csv", "y_names.csv"]:
    print(_)
    with open(input_path+_, 'r') as csvfile:
        [print(i) for i in (csv.reader(csvfile))]
        csvfile.close()

X_names.csv
['CreditScore', 'Age', 'Tenure', 'Balance', 'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedIncome', 'Spain', 'Germany', 'Female']
y_names.csv
['Exited']


import sklearn
print(sklearn.__version__)

0.23.2


highest = int( (X_train.shape[0]) / 500 )
print(highest)

15


train_accuracy=[]
test_accuracy=[]

for i in np.arange(1,highest+1):
    print("Train/test with max_depth="+str(i))
    clf=DecisionTreeClassifier(random_state=123,max_depth=i)
    clf.fit(X_train, y_train)
    train_score=round(clf.score(X_train, y_train),5)
    train_accuracy.append(train_score)
    y_pred=clf.predict(X_test)
    test_score=round((sum(y_pred==y_test) / len(y_test)),5)
    test_accuracy.append(test_score)

Train/test with max_depth=1
Train/test with max_depth=2
Train/test with max_depth=3
Train/test with max_depth=4
Train/test with max_depth=5
Train/test with max_depth=6
Train/test with max_depth=7
Train/test with max_depth=8
Train/test with max_depth=9
Train/test with max_depth=10
Train/test with max_depth=11
Train/test with max_depth=12
Train/test with max_depth=13
Train/test with max_depth=14
Train/test with max_depth=15


fig,ax=plt.subplots()
ax.plot(train_accuracy,label="Train accuracy",color="b",linestyle="--")
ax.plot(test_accuracy,label="Test accuracy",color="r",linestyle="--")
ax.set_xticks(np.arange(0,len(train_accuracy)))
ax.set_xticklabels(np.arange(0,len(train_accuracy))+1)
ax.set_xlabel("Tree max_depth")
ax.set_ylabel("Accuracy")
ax.legend(loc="upper center")

foo=test_accuracy.index(max(test_accuracy))
print("The max_depth with the highest test accuracy is {}".format(foo+1))

The max_depth with the highest test accuracy is 6

	0	1	2	3	4	5	6	7	8	9	10
0	599.0	30.0	9.0	105443.68	1.0	1.0	1.0	121124.53	0.0	0.0	0.0
1	684.0	41.0	6.0	135203.81	2.0	1.0	1.0	121967.88	0.0	0.0	1.0
2	687.0	40.0	1.0	0.00	2.0	1.0	0.0	8207.36	1.0	0.0	1.0
3	850.0	38.0	3.0	0.00	2.0	0.0	1.0	179360.76	1.0	0.0	1.0
4	597.0	60.0	0.0	78539.84	1.0	0.0	1.0	48502.88	0.0	1.0	1.0

	0	1	2	3	4	5	6	7	8	10
0	546.0	25.0	7.0	127728.24	2.0	1.0	1.0	105279.74	1.0	0.0
1	850.0	35.0	9.0	0.00	2.0	0.0	0.0	25329.48	0.0	1.0
2	698.0	45.0	5.0	164450.94	1.0	1.0	0.0	141970.02	1.0	0.0
3	635.0	32.0	8.0	0.00	2.0	1.0	1.0	19367.98	1.0	1.0
4	496.0	41.0	1.0	176024.05	2.0	1.0	0.0	182337.98	0.0	1.0