Chief Technology Operations Project - CTO_Bank Project¶
The CTO_Bank Project uses data sourced from a bank customer transactions dataset.
The aim of this project is to build a machine learning model that recommends products and services to a customer and predicts a bank customer's likelihood of purchasing the products or services offered by the bank (CTO_Bank). The drivers of this prediction will be identified through data exploration, for example how frequently a user transacts and the user's average spend amount.
The products and services offered by the bank include:
- Loans
- Car Insurance
- Medical Aid
- Phone Insurance
- Car Road Side Assistance
- Dental Benefits
... the actual products and services may vary from this list, since they are created dynamically in the user interface.
These services will be classified into three classes: class 1, class 2 and class 3.
At project completion, the model will be deployed to the web app. The model will recommend products and services to a user based on their class criteria: a user in the class 1 category will only be recommended products/services classified as class_1. On the admin panel, the model should also predict a user's expected salary based on their account balance and forecast the user's spending.
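As a sketch of how the class-based recommendation rule could work (the product catalogue and the recommend_for_user helper below are illustrative assumptions, since the real products are created dynamically in the UI):
# Illustrative sketch only: the real catalogue is created dynamically in the UI.
products = [
    {"name": "Loans", "class": "class_3"},
    {"name": "Car Insurance", "class": "class_2"},
    {"name": "Phone Insurance", "class": "class_1"},
]

def recommend_for_user(user_class):
    # Return only the products whose class matches the user's class.
    return [p["name"] for p in products if p["class"] == user_class]

print(recommend_for_user("class_1"))  # ['Phone Insurance']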
To view the project, visit: bongagprojects.co.za / bongagprojects.site:9091
Project development timeframe: 22 April 2024 - 30 April 2024
# Now using the new dataset csv file
import pandas as pd
pd.options.display.precision = 3
df = pd.read_csv("new_dataset.csv")
df.head(5)
|   | TransactionID | CustomerID | CustomerDOB | CustGender | CustLocation | CustAccountBalance | TransactionDate | TransactionTime | TransactionAmount (INR) | Age |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 1994-10-01 | F | JAMSHEDPUR | 17819.05 | 2016-02-08 | 1970-01-01 00:00:00.000143207 | 25.0 | 30.0 |
| 1 | T3 | C4417068 | 1996-11-26 | F | MUMBAI | 17874.44 | 2016-02-08 | 1970-01-01 00:00:00.000142712 | 459.0 | 28.0 |
| 2 | T5 | C9031234 | 1988-03-24 | F | NAVI MUMBAI | 6714.43 | 2016-02-08 | 1970-01-01 00:00:00.000181156 | 1762.5 | 36.0 |
| 3 | T7 | C7126560 | 1992-01-26 | F | MUMBAI | 973.46 | 2016-02-08 | 1970-01-01 00:00:00.000173806 | 566.0 | 32.0 |
| 4 | T8 | C1220223 | 1982-01-27 | M | MUMBAI | 95075.54 | 2016-02-08 | 1970-01-01 00:00:00.000170537 | 148.0 | 42.0 |
# View details about the data
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 950938 entries, 0 to 950937
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype
---  ------                   --------------   -----
 0   TransactionID            950938 non-null  object
 1   CustomerID               950938 non-null  object
 2   CustomerDOB              950938 non-null  object
 3   CustGender               950052 non-null  object
 4   CustLocation             950827 non-null  object
 5   CustAccountBalance       950938 non-null  float64
 6   TransactionDate          950938 non-null  object
 7   TransactionTime          950938 non-null  object
 8   TransactionAmount (INR)  950938 non-null  float64
 9   Age                      950938 non-null  float64
dtypes: float64(3), object(7)
memory usage: 72.6+ MB
# Check null or missing values
df.isna().sum()
TransactionID                0
CustomerID                   0
CustomerDOB                  0
CustGender                 886
CustLocation               111
CustAccountBalance           0
TransactionDate              0
TransactionTime              0
TransactionAmount (INR)      0
Age                          0
dtype: int64
# If CustGender is NaN, fill with "X" to indicate gender was not specified.
df['CustGender'] = df['CustGender'].fillna("X")
df.isna().sum()
TransactionID                0
CustomerID                   0
CustomerDOB                  0
CustGender                   0
CustLocation               111
CustAccountBalance           0
TransactionDate              0
TransactionTime              0
TransactionAmount (INR)      0
Age                          0
dtype: int64
# Drop rows where CustomerDOB, CustAccountBalance or CustLocation is NaN
df = df.dropna(subset=['CustomerDOB', 'CustAccountBalance', 'CustLocation'])
# Rename columns (copy to the new names, then drop the old ones)
df['Gender'] = df['CustGender']
df['Location'] = df['CustLocation']
df['AccountBalance'] = df['CustAccountBalance']
df['TransactionAmount'] = df['TransactionAmount (INR)']
df.drop(columns=['CustGender', 'CustAccountBalance', 'CustLocation', 'TransactionAmount (INR)'], inplace=True)
print("done")
dc = df.copy()
done
# Fix data types
df['AccountBalance'] = df['AccountBalance'].astype(int)
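# Note: TransactionTime appears to be an HHMMSS-style integer in the source data, so
# to_datetime below interprets it as nanoseconds since the epoch (hence the 1970
# timestamps in the previews); the column is dropped before modelling, so it is left as-is.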
df['TransactionTime'] = pd.to_datetime(df['TransactionTime'])
df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], format="%Y-%m-%d")
# df['TransactionDate'] = pd.to_datetime(df['TransactionDate'], format='%Y-%m-%d %H:%M:%S')
df['CustomerDOB'] = pd.to_datetime(df['CustomerDOB'], format="%Y-%m-%d")
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 950827 entries, 0 to 950937
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype
---  ------             --------------   -----
 0   TransactionID      950827 non-null  object
 1   CustomerID         950827 non-null  object
 2   CustomerDOB        950827 non-null  datetime64[ns]
 3   TransactionDate    950827 non-null  datetime64[ns]
 4   TransactionTime    950827 non-null  datetime64[ns]
 5   Age                950827 non-null  float64
 6   Gender             950827 non-null  object
 7   Location           950827 non-null  object
 8   AccountBalance     950827 non-null  int32
 9   TransactionAmount  950827 non-null  float64
dtypes: datetime64[ns](3), float64(2), int32(1), object(4)
memory usage: 76.2+ MB
# Check duplicates
print(f"Get the total number of duplicate rows is = {df.duplicated().sum()}")
Get the total number of duplicate rows is = 0
Feature Engineering¶
Add new features that give further insight into the data. These new features may help in constructing a reliable learning model.
# Create a new column to track how often a customer transacts (number of distinct transaction dates).
df['TransactionFrequency'] = df.groupby('CustomerID')['TransactionDate'].transform('nunique')
# A column to store a users average spending
df['AverageSpend'] = df.groupby('CustomerID')['TransactionAmount'].transform('mean')
# Total value of transactions per user.
df['total_transactions'] = df.groupby('CustomerID')['TransactionAmount'].transform('sum')
df.head(5)
|   | TransactionID | CustomerID | CustomerDOB | TransactionDate | TransactionTime | Age | Gender | Location | AccountBalance | TransactionAmount | TransactionFrequency | AverageSpend | total_transactions |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | T1 | C5841053 | 1994-10-01 | 2016-02-08 | 1970-01-01 00:00:00.000143207 | 30.0 | F | JAMSHEDPUR | 17819 | 25.0 | 1 | 25.00 | 25.0 |
| 1 | T3 | C4417068 | 1996-11-26 | 2016-02-08 | 1970-01-01 00:00:00.000142712 | 28.0 | F | MUMBAI | 17874 | 459.0 | 1 | 459.00 | 459.0 |
| 2 | T5 | C9031234 | 1988-03-24 | 2016-02-08 | 1970-01-01 00:00:00.000181156 | 36.0 | F | NAVI MUMBAI | 6714 | 1762.5 | 1 | 1762.50 | 1762.5 |
| 3 | T7 | C7126560 | 1992-01-26 | 2016-02-08 | 1970-01-01 00:00:00.000173806 | 32.0 | F | MUMBAI | 973 | 566.0 | 4 | 205.25 | 821.0 |
| 4 | T8 | C1220223 | 1982-01-27 | 2016-02-08 | 1970-01-01 00:00:00.000170537 | 42.0 | M | MUMBAI | 95075 | 148.0 | 1 | 148.00 | 148.0 |
# Determine when most transactions happen.
pd.DataFrame(df.TransactionDate.describe())
|   | TransactionDate |
|---|---|
| count | 950827 |
| mean | 2016-07-25 13:27:43.862301696 |
| min | 2016-01-08 00:00:00 |
| 25% | 2016-06-09 00:00:00 |
| 50% | 2016-08-20 00:00:00 |
| 75% | 2016-09-09 00:00:00 |
| max | 2016-12-09 00:00:00 |
The max (2016-12-09) is only the latest transaction date in the data; when most transactions actually happen is explored below.
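To find the busiest day directly, counting transactions per date is a quick check (a sketch that was not in the original notebook):
# Count transactions per date; the top entry is the day with the most transactions.
df['TransactionDate'].value_counts().head()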
df['TransactionFrequency'].max()
6
# Create a dataframe that holds the number of transactions per day
daily_transaction_count = df.groupby('TransactionDate')['TransactionAmount'].count().reset_index()
# Create a dataframe that holds the total monthly transaction value
df.sort_values('TransactionDate', inplace=True)
df_monthly_transaction_value = df.resample('M', on='TransactionDate')['TransactionAmount'].sum().reset_index()
# View the dataframe
df_monthly_transaction_value.columns = ['TransactionDate', 'TransactionAmount']
df_monthly_transaction_value.head(5)
|   | TransactionDate | TransactionAmount |
|---|---|---|
| 0 | 2016-01-31 | 5.292e+07 |
| 1 | 2016-02-29 | 5.524e+07 |
| 2 | 2016-03-31 | 6.336e+07 |
| 3 | 2016-04-30 | 6.745e+07 |
| 4 | 2016-05-31 | 5.661e+07 |
Data Visualisations¶
# Import necessary libraries and classes
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as pyoff
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
pyoff.init_notebook_mode()
# Box plot of average spend by gender
sns.boxplot(x = "Gender", y = "AverageSpend", data = df )
<Axes: xlabel='Gender', ylabel='AverageSpend'>
# Eliminate unrealistic ages: keep only records where 13 < age <= 90.
df = df[(df['Age'] > 13) & (df['Age'] <= 90)]
# Drop rows where gender is 'T' (unknown); 'X' is kept and means gender was not disclosed.
df = df[df['Gender'] != 'T']
# Tag each record with an age group (segment_age_groups is defined further below).
df['age_group'] = df['Age'].apply(segment_age_groups)
sns.boxplot(x = "Gender", y = "AverageSpend", data = df )
<Axes: xlabel='Gender', ylabel='AverageSpend'>
Visualize value of transactions over time (all ages)¶
sns.lineplot(x = 'TransactionDate', y = 'TransactionAmount', data = df, color = 'b')
plt.title('transactions vs date')
plt.xlabel('Date')
plt.ylabel('Amount')
plt.show()
# Plot the distribution of transactions over the days of the month
plot_data = [
    go.Histogram(
        x = df['TransactionDate'].dt.day
    )]
plot_layout = go.Layout(title='Transactions by Day of Month')
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Most transactions occur at the beginning of the month. This may be because most clients get paid around the 25th - 30th/31st.
# Get the total value of transactions by month
df['Month'] = df['TransactionDate'].dt.month
monthly_transactions = df.groupby('Month')['TransactionAmount'].sum().reset_index()
plt.figure(figsize=(10,6))
plt.bar(monthly_transactions['Month'], monthly_transactions['TransactionAmount'])
plt.xlabel('Month')
plt.ylabel('Total Value of Transactions')
plt.title('Total Value of Transactions by Month')
plt.show()
# Most transactions happened in August.
# Encode string (object) discrete features
le = LabelEncoder()
df['Location'] = le.fit_transform(df['Location'])
df['Gender'] = le.fit_transform(df['Gender'])
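One caveat with reusing a single LabelEncoder: each fit_transform call overwrites the fitted mapping, so the Location encoding can no longer be inverted after Gender is encoded. An alternative sketch (the encoders dict is an assumption, not part of the original code) that keeps one fitted encoder per column:
# Alternative: keep a separate fitted encoder per column so inverse_transform stays available.
encoders = {}
for col in ['Location', 'Gender']:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])
# e.g. encoders['Gender'].inverse_transform([0, 1]) recovers the original labels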
# Get discrete and continuous columns
def extract_feature_columns(dataframe):
    cont_col = []
    disc_col = []
    for e in dataframe.columns:
        if ('AccountBalance' in e) or ('Age' in e) or ('TransactionAmount' in e):
            cont_col.append(e)
        elif ('TransactionFrequency' in e) or ('Gender' in e) or ('Location' in e):
            disc_col.append(e)
        # ID columns are identifiers, so treat them as discrete
        elif ('TransactionID' in e) or ('CustomerID' in e):
            disc_col.append(e)
        # everything else defaults to continuous
        else:
            cont_col.append(e)
    return cont_col, disc_col
cont_col, disc_col = extract_feature_columns(df)
print(f"Discrete columns are {disc_col}")
print(f"Continous columns are {cont_col}")
Discrete columns are ['TransactionID', 'CustomerID', 'Gender', 'Location', 'TransactionFrequency'] Continous columns are ['CustomerDOB', 'TransactionDate', 'TransactionTime', 'Age', 'AccountBalance', 'TransactionAmount', 'AverageSpend', 'total_transactions', 'age_group', 'Month']
# Drop TransactionID, TransactionTime, CustomerID, CustomerDOB
df.drop(['TransactionID', 'TransactionTime', 'CustomerID', 'CustomerDOB'], axis = 1, inplace = True)
# preview dataframe with changes
df.head(5)
|   | TransactionDate | Age | Gender | Location | AccountBalance | TransactionAmount | TransactionFrequency | AverageSpend | total_transactions | age_group | Month |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 85565 | 2016-01-08 | 35.0 | 1 | 4723 | 26101 | 500.00 | 1 | 500.00 | 500.00 | adult | 1 |
| 100459 | 2016-01-08 | 36.0 | 1 | 1690 | 348 | 60.00 | 1 | 60.00 | 60.00 | senior | 1 |
| 100460 | 2016-01-08 | 33.0 | 0 | 2761 | 2540 | 101.15 | 1 | 101.15 | 101.15 | adult | 1 |
| 100461 | 2016-01-08 | 35.0 | 1 | 6138 | 43992 | 730.00 | 2 | 472.50 | 945.00 | adult | 1 |
| 100462 | 2016-01-08 | 33.0 | 1 | 1690 | 5593 | 380.00 | 1 | 380.00 | 380.00 | adult | 1 |
# Segment users into age groups based on their age (this is the segment_age_groups used above)
def segment_age_groups(row):
if row >= 36 and row < 60:
return "senior"
elif row >= 18 and row <= 35:
return "adult"
elif row >= 12 and row <= 17:
return "teenager"
elif row >= 60:
return "pensioner"
df['age_group'] = df['Age'].apply(segment_age_groups)
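An equivalent, more vectorised way to build the same groups is pd.cut with matching bin edges (an alternative sketch, not what this notebook uses):
# Alternative: pd.cut with bin edges matching segment_age_groups.
bins = [11, 17, 35, 59, float("inf")]
labels = ['teenager', 'adult', 'senior', 'pensioner']
df['age_group'] = pd.cut(df['Age'], bins=bins, labels=labels).astype(str)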
# Visualize total transaction values by individual age
age_groups = df.groupby('Age')['TransactionAmount'].sum().reset_index()
plt.figure(figsize=(10, 6))
plt.bar(age_groups['Age'], age_groups['TransactionAmount'])
plt.xlabel('Age')
plt.ylabel('Total Value of Transactions')
plt.title('Transaction Value by Age')
plt.show()
# Visualize total transaction values by age group
age_groups = df.groupby('age_group')['TransactionAmount'].sum().reset_index()
plt.figure(figsize=(10, 6))
plt.bar(age_groups['age_group'], age_groups['TransactionAmount'])
plt.xlabel('Age group')
plt.ylabel('Total Value of Transactions')
plt.title('Transaction Value by Age Group')
plt.show()
# The plot above shows which age groups account for the most transaction value.
df['AverageSpend'].describe()
count    8.934e+05
mean     1.305e+03
std      5.179e+03
min      0.000e+00
25%      1.857e+02
50%      4.650e+02
75%      1.100e+03
max      1.560e+06
Name: AverageSpend, dtype: float64
df.groupby('age_group')['age_group'].count()
age_group
adult       413418
senior      479838
teenager        97
Name: age_group, dtype: int64
dc = df.copy()
# dc['age_group'] = le.fit_transform(dc['age_group'])
dc.drop(['age_group'], axis = 1, inplace = True)
plt.figure(figsize = (12, 8))
sns.heatmap(dc.corr(numeric_only=True), annot = True, linewidths = 3)  # numeric_only skips the datetime column
plt.title('Correlation Heatmap')
Text(0.5, 1.0, 'Correlation Heatmap')
# Group transactions into classes; the thresholds 465 and 1100 are the 50th and 75th
# percentiles of AverageSpend (see the describe() output above).
def classify_transaction(row):
    if row >= 1100:
        return "class_3"
    elif row >= 465 and row < 1100:
        return "class_2"
    elif row >= 0 and row < 465:
        return "class_1"
df['Class'] = df['AverageSpend'].apply(classify_transaction)
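Since the hard-coded thresholds match the 50th and 75th percentiles of AverageSpend, the same boundaries can be derived from the quantiles directly, so they track the data rather than being fixed constants (a sketch, not what the notebook uses):
# Derive the class boundaries from the AverageSpend quantiles (50% and 75%).
q50, q75 = df['AverageSpend'].quantile([0.50, 0.75])
df['Class'] = pd.cut(df['AverageSpend'],
                     bins=[0, q50, q75, float("inf")],
                     labels=['class_1', 'class_2', 'class_3'],
                     right=False).astype(str)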
df.head(5)
|   | TransactionDate | Age | Gender | Location | AccountBalance | TransactionAmount | TransactionFrequency | AverageSpend | total_transactions | age_group | Month | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 85565 | 2016-01-08 | 35.0 | 1 | 4723 | 26101 | 500.00 | 1 | 500.00 | 500.00 | adult | 1 | class_2 |
| 100459 | 2016-01-08 | 36.0 | 1 | 1690 | 348 | 60.00 | 1 | 60.00 | 60.00 | senior | 1 | class_1 |
| 100460 | 2016-01-08 | 33.0 | 0 | 2761 | 2540 | 101.15 | 1 | 101.15 | 101.15 | adult | 1 | class_1 |
| 100461 | 2016-01-08 | 35.0 | 1 | 6138 | 43992 | 730.00 | 2 | 472.50 | 945.00 | adult | 1 | class_2 |
| 100462 | 2016-01-08 | 33.0 | 1 | 1690 | 5593 | 380.00 | 1 | 380.00 | 380.00 | adult | 1 | class_1 |
df['Class'] = le.fit_transform(df['Class'])
df['age_group'] = le.fit_transform(df['age_group'])
dt = df.copy()
dt.head(15)
|   | TransactionDate | Age | Gender | Location | AccountBalance | TransactionAmount | TransactionFrequency | AverageSpend | total_transactions | age_group | Month | Class |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 85565 | 2016-01-08 | 35.0 | 1 | 4723 | 26101 | 500.00 | 1 | 500.00 | 500.00 | 0 | 1 | 1 |
| 100459 | 2016-01-08 | 36.0 | 1 | 1690 | 348 | 60.00 | 1 | 60.00 | 60.00 | 1 | 1 | 0 |
| 100460 | 2016-01-08 | 33.0 | 0 | 2761 | 2540 | 101.15 | 1 | 101.15 | 101.15 | 0 | 1 | 0 |
| 100461 | 2016-01-08 | 35.0 | 1 | 6138 | 43992 | 730.00 | 2 | 472.50 | 945.00 | 0 | 1 | 1 |
| 100462 | 2016-01-08 | 33.0 | 1 | 1690 | 5593 | 380.00 | 1 | 380.00 | 380.00 | 0 | 1 | 0 |
| 100463 | 2016-01-08 | 31.0 | 1 | 1690 | 8647 | 50.00 | 1 | 50.00 | 50.00 | 0 | 1 | 0 |
| 100464 | 2016-01-08 | 28.0 | 1 | 3222 | 98 | 310.00 | 1 | 310.00 | 310.00 | 0 | 1 | 0 |
| 100466 | 2016-01-08 | 30.0 | 1 | 624 | 22283 | 45.00 | 2 | 75.00 | 150.00 | 0 | 1 | 0 |
| 100467 | 2016-01-08 | 34.0 | 1 | 1690 | 374 | 765.00 | 1 | 765.00 | 765.00 | 0 | 1 | 1 |
| 100468 | 2016-01-08 | 30.0 | 1 | 6034 | 10165 | 50.00 | 3 | 87.00 | 261.00 | 0 | 1 | 0 |
| 100469 | 2016-01-08 | 45.0 | 1 | 4803 | 106080 | 1008.50 | 1 | 1008.50 | 1008.50 | 1 | 1 | 1 |
| 100470 | 2016-01-08 | 46.0 | 1 | 5184 | 91067 | 2226.00 | 1 | 2226.00 | 2226.00 | 1 | 1 | 2 |
| 100471 | 2016-01-08 | 38.0 | 1 | 4315 | 574138 | 500.00 | 1 | 500.00 | 500.00 | 1 | 1 | 1 |
| 100472 | 2016-01-08 | 42.0 | 1 | 624 | 713 | 1286.00 | 2 | 993.00 | 1986.00 | 1 | 1 | 1 |
| 100473 | 2016-01-08 | 33.0 | 1 | 2528 | 6247 | 225.00 | 1 | 225.00 | 225.00 | 0 | 1 | 0 |
# Export this dataframe to data_for_lr_model.csv - it will be used to train a Random Forest Classifier model
dt.to_csv("data_for_lr_model.csv", index=False)  # index=False avoids writing an extra unnamed index column
# Random Forest Classifier Modelling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pickle
df = pd.read_csv('data_for_lr_model.csv')
# feature columns
X = df[['Age', 'Gender', 'Location', 'AccountBalance',
'TransactionAmount', 'TransactionFrequency', 'AverageSpend',
'total_transactions', 'age_group']]
y = df['Class'] # target column
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# random forest classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model on the training data
model.fit(X_train, y_train)
# Predict on the test data
y_pred = model.predict(X_test)
# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))
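# Save the trained model to disk for later use in the web app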
with open('RandomForestClassifier.pkl', 'wb') as f:
pickle.dump(model, f)
Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     89478
           1       1.00      1.00      1.00     44581
           2       1.00      1.00      1.00     44612

    accuracy                           1.00    178671
   macro avg       1.00      1.00      1.00    178671
weighted avg       1.00      1.00      1.00    178671

Confusion Matrix:
[[89478     0     0]
 [    0 44581     0]
 [    0     0 44612]]

Note: the perfect accuracy is expected rather than impressive here, because Class is derived directly from AverageSpend, which is also one of the model's input features, so the classifier only has to recover the class thresholds.
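As a usage sketch for the deployment step, the pickled model can be loaded in the web app and queried with the same nine feature columns (the feature values below are made up purely for illustration):
# Load the saved model and predict the class for one hypothetical user.
with open('RandomForestClassifier.pkl', 'rb') as f:
    clf = pickle.load(f)

sample_user = pd.DataFrame([{
    'Age': 30, 'Gender': 1, 'Location': 1690, 'AccountBalance': 25000,
    'TransactionAmount': 450.0, 'TransactionFrequency': 2,
    'AverageSpend': 450.0, 'total_transactions': 900.0, 'age_group': 0,
}])
print(clf.predict(sample_user))  # encoded Class label, e.g. 0 for class_1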