## Load libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import DictVectorizer
from sklearn import preprocessing
from sklearn import neighbors, tree, naive_bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split  # sklearn.cross_validation is deprecated
from sklearn.metrics import confusion_matrix, classification_report
data = pd.read_csv("loan.csv", low_memory=False)
# 5% of the data without replacement
data = data.sample(frac=0.05, replace=False, random_state=123)
data.shape
data.head(n=5)
data.columns
The loan_status column is the target!
pd.unique(data['loan_status'].values.ravel())
print("Amount of Classes: ", len(pd.unique(data['loan_status'].values.ravel())))
len(pd.unique(data['zip_code'].values.ravel())) # make sure this is not close to one level per row
len(pd.unique(data['url'].values.ravel())) # drop url
len(pd.unique(data['last_pymnt_d'].values.ravel()))
len(pd.unique(data['next_pymnt_d'].values.ravel()))
for col in data.select_dtypes(include=['object']).columns:
    print("Column {} has {} unique instances".format(col, len(data[col].unique())))
len(pd.unique(data['member_id'].values.ravel())) == data.shape[0]
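Rather than eyeballing columns one at a time, a small summary (just a convenience sketch over the same `data` frame) ranks every object column by its number of unique values, which makes the near-unique offenders easy to spot:
# sketch: rank object (string) columns by cardinality; near-unique columns
# such as ids, urls, or free text are poor candidates for dummy encoding
cardinality = data.select_dtypes(include=['object']).nunique().sort_values(ascending=False)
print(cardinality.head(10))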
# drop id/url/free-text/date columns and a few other fields we will not model on
data = data.drop(columns=['id', 'member_id', 'url', 'purpose', 'title', 'zip_code',
                          'emp_title', 'earliest_cr_line', 'term', 'sub_grade',
                          'last_pymnt_d', 'next_pymnt_d', 'last_credit_pull_d',
                          'issue_d', 'desc', 'addr_state'])
data.shape
# yay this is better
for col in data.select_dtypes(include=['object']).columns:
    print("Column {} has {} unique instances".format(col, len(data[col].unique())))
data['loan_amnt'].plot(kind="hist", bins=10)
data['grade'].value_counts().plot(kind='bar')
data['emp_length'].value_counts().plot(kind='bar')
data['loan_status'].value_counts().plot(kind='bar')
data.select_dtypes(include=[np.number]).columns
"There are {} numeric columns in the data set".format(len(data.select_dtypes(include=[np.number]).columns))
data.select_dtypes(include=['object']).columns
"There are {} character (object) columns in the data set (minus the target)".format(len(data.select_dtypes(include=['object']).columns) - 1)
X = data.drop("loan_status", axis=1, inplace = False)
y = data.loan_status
y.head()
def model_matrix(df, columns):
    # dummy-encode the given categorical columns and join the indicators back on
    dummified_cols = pd.get_dummies(df[columns])
    df = df.drop(columns, axis=1, inplace=False)
    df_new = df.join(dummified_cols)
    return df_new
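To see what model_matrix returns, here is a tiny illustrative example on a made-up frame (the toy values below are hypothetical and only for demonstration):
# sketch: dummy-encode a toy frame; each level of 'grade' becomes its own
# indicator column (grade_A, grade_B) while 'loan_amnt' passes through untouched
toy = pd.DataFrame({'grade': ['A', 'B', 'A'], 'loan_amnt': [1000, 2000, 1500]})
print(model_matrix(toy, ['grade']))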
X = model_matrix(X, ['grade', 'emp_length', 'home_ownership', 'verification_status',
'pymnt_plan', 'initial_list_status', 'application_type', 'verification_status_joint'])
# 'issue_d' 'desc' 'addr_state'
X.head()
X.shape
# impute missing values (NaN) with 0 for now
X2 = X.fillna(value = 0)
X2.head()
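A quick sanity check (a one-line sketch) confirms the fill really did remove every missing value before the features are scaled:
# sketch: count remaining missing values; this should print 0
print("Remaining NaNs:", X2.isnull().sum().sum())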
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# scale all numeric columns to the [0, 1] range
numeric_cols = ['loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
                'installment', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
                'mths_since_last_delinq', 'mths_since_last_record', 'open_acc',
                'pub_rec', 'revol_bal', 'revol_util', 'total_acc', 'out_prncp',
                'out_prncp_inv', 'total_pymnt', 'total_pymnt_inv', 'total_rec_prncp',
                'total_rec_int', 'total_rec_late_fee', 'recoveries',
                'collection_recovery_fee', 'last_pymnt_amnt',
                'collections_12_mths_ex_med', 'mths_since_last_major_derog',
                'policy_code', 'annual_inc_joint', 'dti_joint', 'acc_now_delinq',
                'tot_coll_amt', 'tot_cur_bal', 'open_acc_6m', 'open_il_6m',
                'open_il_12m', 'open_il_24m', 'mths_since_rcnt_il', 'total_bal_il',
                'il_util', 'open_rv_12m', 'open_rv_24m', 'max_bal_bc', 'all_util',
                'total_rev_hi_lim', 'inq_fi', 'total_cu_tl', 'inq_last_12m']
X2[numeric_cols] = scaler.fit_transform(X2[numeric_cols])
X2.head()
x_train, x_test, y_train, y_test = train_test_split(X2, y, test_size=.3, random_state=123)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)
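One design note before fitting: the MinMaxScaler above was fit on all of X2 prior to splitting, so the test rows influence the scaling ranges. A hedged alternative sketch, reusing the unscaled X, the numeric_cols list, and the same split settings (the xtr/xte/ytr/yte names are just placeholders), would fit the scaler on the training portion only:
# sketch: fit the scaler on the training split only, then apply it to the test split,
# so no information from the test rows leaks into the preprocessing
X_unscaled = X.fillna(value=0)
xtr, xte, ytr, yte = train_test_split(X_unscaled, y, test_size=.3, random_state=123)
xtr, xte = xtr.copy(), xte.copy()
split_scaler = MinMaxScaler().fit(xtr[numeric_cols])
xtr[numeric_cols] = split_scaler.transform(xtr[numeric_cols])
xte[numeric_cols] = split_scaler.transform(xte[numeric_cols])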
# start with n_neighbors equal to the number of target classes (10)
data_knn = KNeighborsClassifier(n_neighbors = 10, metric='euclidean')
data_knn
data_knn.fit(x_train, y_train)
data_knn.predict(x_test)
# classification accuracy (mean accuracy, not R-squared) on the training and test data
accuracy_train = data_knn.score(x_train, y_train)
accuracy_test = data_knn.score(x_test, y_test)
print('Training data accuracy:')
print(accuracy_train)
print('Test data accuracy:')
print(accuracy_test)
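n_neighbors=10 was only a starting point. A rough sketch for choosing k is to refit over a small grid and compare accuracies (a more careful version would cross-validate on the training data rather than peeking at the test set):
# sketch: try a few values of k and report train/test accuracy; purely illustrative
for k in [5, 10, 15, 25]:
    knn_k = KNeighborsClassifier(n_neighbors=k, metric='euclidean').fit(x_train, y_train)
    print("k = {:2d}  train acc = {:.3f}  test acc = {:.3f}".format(
        k, knn_k.score(x_train, y_train), knn_k.score(x_test, y_test)))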
# confusion matrix
from sklearn.metrics import confusion_matrix
knn_confusion_matrix = confusion_matrix(y_true = y_test, y_pred = data_knn.predict(x_test))
print("The Confusion matrix:\n", knn_confusion_matrix)
# visualize the confusion matrix
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
plt.matshow(knn_confusion_matrix, cmap = plt.cm.Blues)
plt.title("KNN Confusion Matrix\n")
#plt.xticks([0,1], ['No', 'Yes'])
#plt.yticks([0,1], ['No', 'Yes'])
plt.ylabel('True label')
plt.xlabel('Predicted label')
# use i/j for the cell indices so the loop does not shadow the target variable y
for i in range(knn_confusion_matrix.shape[0]):
    for j in range(knn_confusion_matrix.shape[1]):
        plt.text(j, i, '{}'.format(knn_confusion_matrix[i, j]),
                 horizontalalignment='center',
                 verticalalignment='center')
plt.show()
#Generate the classification report
from sklearn.metrics import classification_report
knn_classify_report = classification_report(y_true = y_test,
y_pred = data_knn.predict(x_test))
print(knn_classify_report)
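As the loan_status bar chart earlier suggests, the classes are far from balanced, so raw accuracy can look flattering. A quick baseline sketch with scikit-learn's DummyClassifier puts the KNN numbers in context:
# sketch: compare KNN against a majority-class baseline
from sklearn.dummy import DummyClassifier
baseline = DummyClassifier(strategy='most_frequent').fit(x_train, y_train)
print("Baseline (majority class) test accuracy:", baseline.score(x_test, y_test))
print("KNN test accuracy:", data_knn.score(x_test, y_test))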
fin.