Credit Card Default Prediction Using XGBoost in SageMaker

Importing Libraries & Datasets

# Upgrade the Pandas version
!pip3 install pandas --upgrade

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline


creditcard_df = pd.read_csv('UCI_Credit_Card.csv')
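
# Quick overview (optional check, not in the original notebook): column names, dtypes,
# non-null counts and summary statistics of the freshly loaded dataframe
creditcard_df.info()
creditcard_df.describe()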

creditcard_df['LIMIT_BAL'].max()
1000000.0


creditcard_df['LIMIT_BAL'].mean()
167484.32266666667

Data Visualisation

# Let's see if we have any missing data, luckily we don't!
sns.heatmap(creditcard_df.isnull());

# Let's drop the ID column
creditcard_df.drop(['ID'], axis = 1, inplace = True)

# Let's see how many customers could potentially default on their credit card payment! 
cc_default_df        = creditcard_df[creditcard_df['default.payment.next.month'] == 1]
cc_nodefault_df      = creditcard_df[creditcard_df['default.payment.next.month'] == 0]


# Count the number of customers who defaulted and who did not default
# It seems that we are dealing with an imbalanced dataset 

print("Total =", len(creditcard_df))

print("Number of customers who defaulted on their credit card payments =", len(cc_default_df))
print("Percentage of customers who defaulted on their credit card payments =", 1.*len(cc_default_df)/len(creditcard_df)*100.0, "%")
 
print("Number of customers who did not default on their credit card payments (paid their balance)=", len(cc_nodefault_df))
print("Percentage of customers who did not default on their credit card payments (paid their balance)=", 1.*len(cc_nodefault_df)/len(creditcard_df)*100.0, "%")

Total = 30000
Number of customers who defaulted on their credit card payments = 6636
Percentage of customers who defaulted on their credit card payments = 22.12 %
Number of customers who did not default on their credit card payments (paid their balance)= 23364
Percentage of customers who did not default on their credit card payments (paid their balance)= 77.88000000000001 %
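
# The same imbalance can be read directly off the target column
# (optional check, not in the original notebook): fraction of customers per class,
# where 0 = did not default and 1 = defaulted
creditcard_df['default.payment.next.month'].value_counts(normalize = True)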


# Plot the correlation matrix
correlations = creditcard_df.corr()
f, ax = plt.subplots(figsize = (20, 20))
sns.heatmap(correlations, annot = True);
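
# Optional check (not in the original notebook): the heatmap is easier to read if the
# correlations with the target are pulled out and sorted on their own
correlations['default.payment.next.month'].sort_values(ascending = False)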

plt.figure(figsize = [25, 12])
sns.countplot(x = 'AGE', hue = 'default.payment.next.month', data = creditcard_df);


plt.figure(figsize=[20,20])
plt.subplot(311)
sns.countplot(x = 'EDUCATION', hue = 'default.payment.next.month', data = creditcard_df)
plt.subplot(312)
sns.countplot(x = 'SEX', hue = 'default.payment.next.month', data = creditcard_df)
plt.subplot(313)
sns.countplot(x = 'MARRIAGE', hue = 'default.payment.next.month', data = creditcard_df)


# KDE (Kernel Density Estimate) is used for visualizing the Probability Density of a continuous variable.
# KDE describes the probability density at different values in a continuous variable. 

plt.figure(figsize=(12,7))

sns.distplot(cc_nodefault_df['LIMIT_BAL'], bins = 250, color = 'r')
sns.distplot(cc_default_df['LIMIT_BAL'], bins = 250, color = 'b')

plt.xlabel('Amount of given credit, LIMIT_BAL (NT dollar)')
#plt.xlim(0, 200000)


# KDE (Kernel Density Estimate) is used for visualizing the Probability Density of a continuous variable.
# KDE describes the probability density at different values in a continuous variable. 

plt.figure(figsize=(12,7))

sns.kdeplot(cc_nodefault_df['BILL_AMT1'], label = 'Customers who did not default (paid balance)', shade = True, color = 'r')
sns.kdeplot(cc_default_df['BILL_AMT1'], label = 'Customers who defaulted (did not pay balance)', shade = True, color = 'b')

plt.xlabel('Amount of bill statement in September, 2005 (NT dollar)')
#plt.xlim(0, 200000)


plt.figure(figsize=[10,20])
plt.subplot(211)
sns.boxplot(x = 'MARRIAGE', y = 'LIMIT_BAL', data = creditcard_df, showfliers = False);
plt.subplot(212)
sns.boxplot(x = 'MARRIAGE', y = 'LIMIT_BAL', data = creditcard_df);


plt.figure(figsize = [10, 20])
plt.subplot(211)
sns.boxplot(x = 'SEX', y = 'LIMIT_BAL', data = creditcard_df)

plt.subplot(212)
sns.boxplot(x = 'SEX', y = 'LIMIT_BAL', data = creditcard_df, showfliers = False)

Testing & Training Datasets

X_cat = creditcard_df[['SEX', 'EDUCATION', 'MARRIAGE']]
X_cat

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder()
X_cat = onehotencoder.fit_transform(X_cat).toarray()

X_cat.shape

(30000, 13)
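
# The 13 columns correspond to the distinct values the encoder found in SEX, EDUCATION and MARRIAGE.
# Optional sanity check (not in the original notebook): list those category values
print(onehotencoder.categories_)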

X_cat = pd.DataFrame(X_cat)

# note that we dropped the target 'default.payment.next.month'
X_numerical = creditcard_df[['LIMIT_BAL', 'AGE', 'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 
                'BILL_AMT1','BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6',
                'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']]
X_numerical

X = pd.concat([X_cat, X_numerical], axis = 1)

y = creditcard_df['default.payment.next.month']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
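# Note (optional tweak, not used in this run): since only ~22% of customers default,
# passing stratify = y keeps the class ratio identical across the train and test splits, e.g.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = 42)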

# Train an XGBoost classifier model, tuning its hyperparameters with a grid search
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [10, 20, 30],
    'n_estimators': [100, 200, 500]
}

xgb_model = xgb.XGBClassifier(use_label_encoder=False)

# Perform GridSearchCV
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, scoring='accuracy', cv=5, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best parameters from GridSearchCV
best_params = grid_search.best_params_
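# (Optional check, not in the original notebook) inspect the winning combination
# and its mean cross-validated accuracy
print(best_params)
print(grid_search.best_score_)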

model = xgb.XGBClassifier(**best_params, use_label_encoder=False)
model.fit(X_train, y_train)

from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)

from sklearn.metrics import confusion_matrix, classification_report

print("Accuracy {} %".format( 100 * accuracy_score(y_test, y_pred)))

Accuracy 81.55 %

# Model Performance on the Testing dataset
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot = True, fmt = 'd');
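
# Since the classes are imbalanced, a per-class breakdown is more informative than accuracy alone.
# Optional addition (not in the original notebook), using the classification_report imported above
print(classification_report(y_test, y_pred))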

Training in SageMaker

X = np.array(X)
y = np.array(y)

# reshaping the target array from (30000,) to (30000, 1)
y = y.reshape(-1,1)
y.shape

# splitting the data into training, testing and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size = 0.5)

# Convert the arrays into dataframes with the target variable as the first column, followed by the feature columns.
# This is because the SageMaker built-in XGBoost algorithm expects the data in this format.

train_data = pd.DataFrame({'Target': y_train[:,0]})
for i in range(X_train.shape[1]):
    train_data[i] = X_train[:,i]
    
val_data = pd.DataFrame({'Target': y_val[:,0]})
for i in range(X_val.shape[1]):
    val_data[i] = X_val[:,i]
    
# save train_data and validation_data as csv files.

train_data.to_csv('train.csv', header = False, index = False)
val_data.to_csv('validation.csv', header = False, index = False)

# Boto3 is the Amazon Web Services (AWS) Software Development Kit (SDK) for Python
# Boto3 allows Python developers to write software that makes use of services like Amazon S3 and Amazon EC2

import sagemaker
import boto3

# Create a sagemaker session
sagemaker_session = sagemaker.Session()


bucket = sagemaker_session.default_bucket()                    # Set a default S3 bucket
prefix = 'XGBoost-classifier'
key = 'XGBoost-classifier'
# IAM role: gives the training and hosting jobs access to the data

role = sagemaker.get_execution_role()

# read the data from csv file and then upload the data to s3 bucket
import os
with open('train.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'train', key)).upload_fileobj(f)

# Let's print out the training data location in s3
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, prefix, key)
print('uploaded training data location: {}'.format(s3_train_data))
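
# (Alternative sketch, not used in this run) the SageMaker SDK can perform the same upload in one call:
# s3_train_data = sagemaker_session.upload_data(path = 'train.csv', bucket = bucket, key_prefix = prefix + '/train')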

# read the data from csv file and then upload the data to s3 bucket

with open('validation.csv','rb') as f:
    # The following code uploads the data into S3 bucket to be accessed later for training

    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(prefix, 'validation', key)).upload_fileobj(f)
# Let's print out the validation data location in s3
s3_validation_data = 's3://{}/{}/validation/{}'.format(bucket, prefix, key)
print('uploaded validation data location: {}'.format(s3_validation_data))

# create output placeholder in S3 bucket to store the output
output_location = 's3://{}/{}/output'.format(bucket, prefix)
print('training artifacts will be uploaded to: {}'.format(output_location))

# get_image_uri is the legacy (SDK v1) helper; sagemaker.image_uris.retrieve is its SDK v2 replacement
from sagemaker.amazon.amazon_estimator import get_image_uri

# container = sagemaker.image_uris.retrieve("xgboost", region = 'us-east-1')
container = sagemaker.image_uris.retrieve("xgboost", boto3.Session().region_name, version='1.0-1')

# container = get_image_uri(boto3.Session().region_name, 'xgboost', '1.0-1') # legacy equivalent of the line above

# num_round: the number of boosting rounds to run during training.

# alpha: L1 regularization term on weights. Increasing this value makes the model more conservative.

# colsample_bytree: fraction of features that will be used to train each tree.

# eta: step-size shrinkage used in updates to prevent overfitting.
# After each boosting step, eta shrinks the feature weights to make the boosting process more conservative.



Xgboost_classifier = sagemaker.estimator.Estimator(container,
                                                   role, 
                                                   instance_count = 1, 
                                                   instance_type = 'ml.m4.xlarge',
                                                   output_path = output_location,
                                                   sagemaker_session = sagemaker_session)

# To improve the performance of the model, a hyperparameter tuning job needs to be run

Xgboost_classifier.set_hyperparameters(max_depth = 20,
                                       objective = 'multi:softmax',
                                       num_class = 2,
                                       eta = 0.1,
                                       num_round = 150)
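
# A more conservative variant using the regularization parameters described above
# (an illustrative sketch only; the training run below used the settings above):
# Xgboost_classifier.set_hyperparameters(max_depth = 10,
#                                        objective = 'multi:softmax',
#                                        num_class = 2,
#                                        eta = 0.1,
#                                        alpha = 1,
#                                        colsample_bytree = 0.8,
#                                        num_round = 150)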
train_input = sagemaker.session.TrainingInput(s3_data = s3_train_data, content_type='csv',s3_data_type = 'S3Prefix')
valid_input = sagemaker.session.TrainingInput(s3_data = s3_validation_data, content_type='csv',s3_data_type = 'S3Prefix')

data_channels = {'train': train_input,'validation': valid_input}


Xgboost_classifier.fit(data_channels)
2022-05-04 04:07:11 Starting - Starting the training job...
2022-05-04 04:07:38 Starting - Preparing the instances for trainingProfilerReport-1651637231: InProgress
.........
2022-05-04 04:09:10 Downloading - Downloading input data...
2022-05-04 04:09:30 Training - Downloading the training image......
2022-05-04 04:10:30 Training - Training image download completed. Training in progress.INFO:sagemaker-containers:Imported framework sagemaker_xgboost_container.training
INFO:sagemaker-containers:Failed to parse hyperparameter objective value multi:softmax to Json.
Returning the value itself
INFO:sagemaker-containers:No GPUs detected (normal if no gpus installed)
INFO:sagemaker_xgboost_container.training:Running XGBoost Sagemaker in algorithm mode
INFO:root:Determined delimiter of CSV input is ','
INFO:root:Determined delimiter of CSV input is ','
INFO:root:Determined delimiter of CSV input is ','
[04:10:29] 24000x33 matrix with 792000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,
INFO:root:Determined delimiter of CSV input is ','
[04:10:29] 3000x33 matrix with 99000 entries loaded from /opt/ml/input/data/validation?format=csv&label_column=0&delimiter=,
INFO:root:Single node training.
INFO:root:Train matrix has 24000 rows
INFO:root:Validation matrix has 3000 rows
[04:10:29] WARNING: /workspace/src/learner.cc:328: 
Parameters: { num_round } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0]#011train-merror:0.08154#011validation-merror:0.21300
[1]#011train-merror:0.06742#011validation-merror:0.20000
[2]#011train-merror:0.06158#011validation-merror:0.19633
[3]#011train-merror:0.05763#011validation-merror:0.19567
[4]#011train-merror:0.05508#011validation-merror:0.19200
[5]#011train-merror:0.05217#011validation-merror:0.19433
[6]#011train-merror:0.04875#011validation-merror:0.19233
[7]#011train-merror:0.04642#011validation-merror:0.19133
[8]#011train-merror:0.04304#011validation-merror:0.19133
[9]#011train-merror:0.03992#011validation-merror:0.18800
[10]#011train-merror:0.03733#011validation-merror:0.18800
[11]#011train-merror:0.03425#011validation-merror:0.18500
[12]#011train-merror:0.03225#011validation-merror:0.18600
[13]#011train-merror:0.02971#011validation-merror:0.18467
[14]#011train-merror:0.02783#011validation-merror:0.18700
[15]#011train-merror:0.02625#011validation-merror:0.18433
[16]#011train-merror:0.02417#011validation-merror:0.18433
[17]#011train-merror:0.02242#011validation-merror:0.18333
[18]#011train-merror:0.02013#011validation-merror:0.18100
[19]#011train-merror:0.01796#011validation-merror:0.18233
[20]#011train-merror:0.01625#011validation-merror:0.18100
[21]#011train-merror:0.01550#011validation-merror:0.18067
[22]#011train-merror:0.01362#011validation-merror:0.18133
[23]#011train-merror:0.01233#011validation-merror:0.18100
[24]#011train-merror:0.01087#011validation-merror:0.17867
[25]#011train-merror:0.01021#011validation-merror:0.17900
[26]#011train-merror:0.00937#011validation-merror:0.17800
[27]#011train-merror:0.00800#011validation-merror:0.17767
[28]#011train-merror:0.00737#011validation-merror:0.17733
[29]#011train-merror:0.00679#011validation-merror:0.17767
[30]#011train-merror:0.00625#011validation-merror:0.17800
[31]#011train-merror:0.00575#011validation-merror:0.17733
[32]#011train-merror:0.00562#011validation-merror:0.17700
[33]#011train-merror:0.00546#011validation-merror:0.17700
[34]#011train-merror:0.00517#011validation-merror:0.17600
[35]#011train-merror:0.00492#011validation-merror:0.17667
[36]#011train-merror:0.00475#011validation-merror:0.17733
[37]#011train-merror:0.00462#011validation-merror:0.17733
[38]#011train-merror:0.00450#011validation-merror:0.17700
[39]#011train-merror:0.00442#011validation-merror:0.17700
[40]#011train-merror:0.00438#011validation-merror:0.17767
[41]#011train-merror:0.00438#011validation-merror:0.17800
[42]#011train-merror:0.00433#011validation-merror:0.17733
[43]#011train-merror:0.00421#011validation-merror:0.17833
[44]#011train-merror:0.00417#011validation-merror:0.17700
[45]#011train-merror:0.00417#011validation-merror:0.17600
[46]#011train-merror:0.00417#011validation-merror:0.17700
[47]#011train-merror:0.00417#011validation-merror:0.17767
[48]#011train-merror:0.00379#011validation-merror:0.17800
[49]#011train-merror:0.00379#011validation-merror:0.17700
[50]#011train-merror:0.00371#011validation-merror:0.17667
[51]#011train-merror:0.00358#011validation-merror:0.17700
[52]#011train-merror:0.00354#011validation-merror:0.17633
[53]#011train-merror:0.00354#011validation-merror:0.17700
[54]#011train-merror:0.00354#011validation-merror:0.17633
[55]#011train-merror:0.00354#011validation-merror:0.17633
[56]#011train-merror:0.00346#011validation-merror:0.17667
[57]#011train-merror:0.00346#011validation-merror:0.17733
[58]#011train-merror:0.00342#011validation-merror:0.17700
[59]#011train-merror:0.00342#011validation-merror:0.17667
[60]#011train-merror:0.00342#011validation-merror:0.17767
[61]#011train-merror:0.00342#011validation-merror:0.17667
[62]#011train-merror:0.00342#011validation-merror:0.17700
[63]#011train-merror:0.00346#011validation-merror:0.17567
[64]#011train-merror:0.00342#011validation-merror:0.17600
[65]#011train-merror:0.00317#011validation-merror:0.17633
[66]#011train-merror:0.00317#011validation-merror:0.17633
[67]#011train-merror:0.00317#011validation-merror:0.17633
[68]#011train-merror:0.00317#011validation-merror:0.17633
[69]#011train-merror:0.00317#011validation-merror:0.17667
[70]#011train-merror:0.00317#011validation-merror:0.17667
[71]#011train-merror:0.00317#011validation-merror:0.17700
[72]#011train-merror:0.00317#011validation-merror:0.17633
[73]#011train-merror:0.00296#011validation-merror:0.17567
[74]#011train-merror:0.00300#011validation-merror:0.17567
[75]#011train-merror:0.00300#011validation-merror:0.17567
[76]#011train-merror:0.00258#011validation-merror:0.17667
[77]#011train-merror:0.00258#011validation-merror:0.17733
[78]#011train-merror:0.00258#011validation-merror:0.17667
[79]#011train-merror:0.00258#011validation-merror:0.17667
[80]#011train-merror:0.00254#011validation-merror:0.17633
[81]#011train-merror:0.00213#011validation-merror:0.17567
[82]#011train-merror:0.00208#011validation-merror:0.17500
[83]#011train-merror:0.00213#011validation-merror:0.17500
[84]#011train-merror:0.00213#011validation-merror:0.17500
[85]#011train-merror:0.00208#011validation-merror:0.17533
[86]#011train-merror:0.00208#011validation-merror:0.17467
[87]#011train-merror:0.00204#011validation-merror:0.17533
[88]#011train-merror:0.00204#011validation-merror:0.17500
[89]#011train-merror:0.00204#011validation-merror:0.17533
[90]#011train-merror:0.00187#011validation-merror:0.17567
[91]#011train-merror:0.00171#011validation-merror:0.17600
[92]#011train-merror:0.00167#011validation-merror:0.17600
[93]#011train-merror:0.00167#011validation-merror:0.17600
[94]#011train-merror:0.00167#011validation-merror:0.17600
[95]#011train-merror:0.00167#011validation-merror:0.17567
[96]#011train-merror:0.00167#011validation-merror:0.17567
[97]#011train-merror:0.00167#011validation-merror:0.17533
[98]#011train-merror:0.00158#011validation-merror:0.17567
[99]#011train-merror:0.00158#011validation-merror:0.17567
[100]#011train-merror:0.00158#011validation-merror:0.17533
[101]#011train-merror:0.00158#011validation-merror:0.17467
[102]#011train-merror:0.00158#011validation-merror:0.17500
[103]#011train-merror:0.00158#011validation-merror:0.17500
[104]#011train-merror:0.00158#011validation-merror:0.17600
[105]#011train-merror:0.00142#011validation-merror:0.17467
[106]#011train-merror:0.00137#011validation-merror:0.17500
[107]#011train-merror:0.00133#011validation-merror:0.17533
[108]#011train-merror:0.00133#011validation-merror:0.17433
[109]#011train-merror:0.00133#011validation-merror:0.17433
[110]#011train-merror:0.00133#011validation-merror:0.17433
[111]#011train-merror:0.00129#011validation-merror:0.17467
[112]#011train-merror:0.00121#011validation-merror:0.17367
[113]#011train-merror:0.00121#011validation-merror:0.17367
[114]#011train-merror:0.00121#011validation-merror:0.17467
[115]#011train-merror:0.00121#011validation-merror:0.17467
[116]#011train-merror:0.00121#011validation-merror:0.17500
[117]#011train-merror:0.00112#011validation-merror:0.17467
[118]#011train-merror:0.00112#011validation-merror:0.17400
[119]#011train-merror:0.00112#011validation-merror:0.17500
[120]#011train-merror:0.00112#011validation-merror:0.17433
[121]#011train-merror:0.00108#011validation-merror:0.17600
[122]#011train-merror:0.00108#011validation-merror:0.17667
[123]#011train-merror:0.00108#011validation-merror:0.17667
[124]#011train-merror:0.00108#011validation-merror:0.17633
[125]#011train-merror:0.00108#011validation-merror:0.17600
[126]#011train-merror:0.00108#011validation-merror:0.17567
[127]#011train-merror:0.00108#011validation-merror:0.17633
[128]#011train-merror:0.00108#011validation-merror:0.17700
[129]#011train-merror:0.00108#011validation-merror:0.17767
[130]#011train-merror:0.00104#011validation-merror:0.17733
[131]#011train-merror:0.00104#011validation-merror:0.17767
[132]#011train-merror:0.00104#011validation-merror:0.17767
[133]#011train-merror:0.00104#011validation-merror:0.17733
[134]#011train-merror:0.00104#011validation-merror:0.17733
[135]#011train-merror:0.00100#011validation-merror:0.17667
[136]#011train-merror:0.00100#011validation-merror:0.17600
[137]#011train-merror:0.00100#011validation-merror:0.17700
[138]#011train-merror:0.00100#011validation-merror:0.17667
[139]#011train-merror:0.00092#011validation-merror:0.17700
[140]#011train-merror:0.00092#011validation-merror:0.17733
[141]#011train-merror:0.00092#011validation-merror:0.17733
[142]#011train-merror:0.00092#011validation-merror:0.17733
[143]#011train-merror:0.00092#011validation-merror:0.17733
[144]#011train-merror:0.00092#011validation-merror:0.17633
[145]#011train-merror:0.00092#011validation-merror:0.17567
[146]#011train-merror:0.00092#011validation-merror:0.17500
[147]#011train-merror:0.00088#011validation-merror:0.17533
[148]#011train-merror:0.00083#011validation-merror:0.17467
[149]#011train-merror:0.00083#011validation-merror:0.17467

2022-05-04 04:11:42 Uploading - Uploading generated training model
2022-05-04 04:11:42 Completed - Training job completed
Training seconds: 163
Billable seconds: 163

Deploying the Model for Inference

predictor = Xgboost_classifier.deploy(initial_instance_count = 1, instance_type = "ml.m4.xlarge")

predictor.endpoint_name

# In SageMaker SDK v2 the CSV serializer lives in sagemaker.serializers
from sagemaker.serializers import CSVSerializer
predictor.serializer = CSVSerializer()

# custom helper to convert the predictions (returned as bytes) into a numpy array

def bytes_2_array(x):
    
    # cast the entire prediction to a string and split it on ','
    l = str(x).split(',')
    
    # the first element contains unwanted characters (b, ') so we remove them
    l[0] = l[0][2:]
    # likewise, remove the unwanted last character (')
    l[-1] = l[-1][:-1]
    
    # iterate through the list of strings and convert each one to float
    for i in range(len(l)):
        l[i] = float(l[i])
        
    # convert the list into an array
    l = np.array(l).astype('float32')
    
    # reshape the one-dimensional array into a two-dimensional array
    return l.reshape(-1,1)


predictions = predictor.predict(X_test)
predictions

predicted_values = bytes_2_array(predictions)
predicted_values

# evaluation metrics
from sklearn.metrics import precision_score, recall_score, accuracy_score

print("Precision = {}".format(precision_score(y_test, predicted_values, average='macro')))
print("Recall = {}".format(recall_score(y_test, predicted_values, average='macro')))
print("Accuracy = {}".format(accuracy_score(y_test, predicted_values)))

Precision = 0.7437012617876095
Recall = 0.6789832032095353
Accuracy = 0.8253333333333334
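
# For parity with the local evaluation earlier, the same confusion-matrix heatmap can be drawn
# for the endpoint's predictions (optional sketch, reusing y_test and predicted_values from above)
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predicted_values)
sns.heatmap(cm, annot = True, fmt = 'd');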


# Delete the endpoint when finished to avoid ongoing charges
predictor.delete_endpoint()