!pip install scikit-learn-extra

Requirement already satisfied: scikit-learn-extra in /usr/local/lib/python3.11/dist-packages (0.3.0)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.26.4)
Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.15.3)
Requirement already satisfied: scikit-learn>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.7.0)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.5.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (3.6.0)

import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
import warnings
warnings.filterwarnings("ignore")

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

data = pd.read_excel('/content/drive/MyDrive/MIT - Data Analytics/Elective Project/AllLife Bank/Credit Card Customer Data.xlsx')

data.head()

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Sl_No                660 non-null    int64
 1   Customer Key         660 non-null    int64
 2   Avg_Credit_Limit     660 non-null    int64
 3   Total_Credit_Cards   660 non-null    int64
 4   Total_visits_bank    660 non-null    int64
 5   Total_visits_online  660 non-null    int64
 6   Total_calls_made     660 non-null    int64
dtypes: int64(7)
memory usage: 36.2 KB

data.nunique()

duplicate_keys = data['Customer Key'].duplicated(keep=False)

data[duplicate_keys]

data = data.drop_duplicates(subset='Customer Key', keep=False)

data.drop(columns = ['Sl_No', 'Customer Key'], inplace = True)

data[data.duplicated()]

data = data[~data.duplicated()]
data.shape

(639, 5)

data.describe().T

for col in data.columns:
    print(col)
    print('Skew :', round(data[col].skew(), 2))

    plt.figure(figsize = (15, 4))
    plt.subplot(1, 2, 1)

    data[col].hist()

    plt.ylabel('count')

    plt.subplot(1, 2, 2)

    sns.boxplot(x = data[col])

    plt.show()

Avg_Credit_Limit
Skew : 2.2

Total_Credit_Cards
Skew : 0.16

Total_visits_bank
Skew : 0.15

Total_visits_online
Skew : 2.22

Total_calls_made
Skew : 0.65

plt.figure(figsize = (8, 8))

sns.heatmap(data.corr(), annot = True, fmt = '0.2f')

plt.show()

scaler = StandardScaler()
data_scaled = StandardScaler().fit_transform(data)

from sklearn.decomposition import PCA

n = data.shape[1]

pca = PCA(n_components=n)

principal_components = pca.fit_transform(data_scaled)

data_pca = pd.DataFrame(principal_components, columns = data.columns)

data_copy = data_pca.copy(deep = True)

#1
sse = {}

# 2
for k in range(1, 10):
    kmeans = KMeans(n_clusters = k, max_iter = 1000, random_state = 1).fit(data_pca)
    sse[k] = kmeans.inertia_

# 3
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()), 'bx-')
plt.xlabel("Number of cluster")
plt.ylabel("SSE")

plt.show()

kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=1)

kmeans.fit(data_pca)

data_copy['Labels'] = kmeans.labels_
data['Labels'] = kmeans.labels_

data.Labels.value_counts()

mean = data.groupby('Labels').mean()
median = data.groupby('Labels').median()

df_kmeans = pd.concat([mean, median], axis = 0)
df_kmeans.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_kmeans.T

data_copy.boxplot(by = 'Labels', layout = (1, 5), figsize = (20, 7))

plt.show()

gmm = GaussianMixture(n_components=3, random_state=1)

gmm.fit(data_pca)

data_copy['GmmLabels'] = gmm.predict(data_pca)
data['GmmLabels'] = gmm.predict(data_pca)

data.GmmLabels.value_counts()

original_features = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank", "Total_visits_online", "Total_calls_made"]

mean = data.groupby('GmmLabels').mean()
median = data.groupby('GmmLabels').median()

df_gmm = pd.concat([mean, median], axis = 0)
df_gmm.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']
df_gmm[original_features].T

features_with_lables = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank", "Total_visits_online", "Total_calls_made", "GmmLabels"]

data_copy[features_with_lables].boxplot(by = 'GmmLabels', layout = (1, 5),figsize = (20, 7))

plt.show()

(data['Labels'] == data['GmmLabels']).value_counts()

!pip install scikit-learn-extra

Requirement already satisfied: scikit-learn-extra in /usr/local/lib/python3.11/dist-packages (0.3.0)
Requirement already satisfied: numpy>=1.13.3 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.26.4)
Requirement already satisfied: scipy>=0.19.1 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.15.3)
Requirement already satisfied: scikit-learn>=0.23.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn-extra) (1.7.0)
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (1.5.1)
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11/dist-packages (from scikit-learn>=0.23.0->scikit-learn-extra) (3.6.0)

from sklearn_extra.cluster import KMedoids

kmedo = KMedoids(n_clusters=3, random_state=1)
kmedo.fit(data_pca)

data_copy['kmedoLabels'] = kmedo.predict(data_pca)

data['kmedoLabels'] = kmedo.predict(data_pca)

data.kmedoLabels.value_counts()

mean = data.groupby('kmedoLabels').mean()

median = data.groupby('kmedoLabels').median()

df_kmedoids = pd.concat([mean, median], axis = 0)

df_kmedoids.index = ['group_0 Mean', 'group_1 Mean', 'group_2 Mean', 'group_0 Median', 'group_1 Median', 'group_2 Median']

df_kmedoids[original_features].T

features_with_lables = ["Avg_Credit_Limit", "Total_Credit_Cards", "Total_visits_bank", "Total_visits_online", "Total_calls_made", "kmedoLabels"]

data_copy[features_with_lables].boxplot(by = 'kmedoLabels', layout = (1, 5), figsize = (20, 7))

plt.show()

comparison = pd.concat([df_kmedoids, df_kmeans], axis = 1)[original_features]

comparison

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
0	1	87073	100000	2	1	1	0
1	2	38414	50000	3	0	10	9
2	3	17341	50000	7	1	3	4
3	4	40496	30000	5	1	1	4
4	5	47437	100000	6	0	12	3

	Sl_No	Customer Key	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
4	5	47437	100000	6	0	12	3
48	49	37252	6000	4	0	2	8
104	105	97935	17000	2	1	2	10
332	333	47437	17000	7	3	1	0
391	392	96929	13000	4	5	0	0
398	399	96929	67000	6	2	2	2
411	412	50706	44000	4	5	0	2
432	433	37252	59000	6	2	1	2
541	542	50706	60000	7	5	2	2
632	633	97935	187000	7	1	7	0

	Avg_Credit_Limit	Total_Credit_Cards	Total_visits_bank	Total_visits_online	Total_calls_made
162	8000	2	0	3	4
175	6000	1	0	2	5
215	8000	4	0	4	7
295	10000	6	4	2	3
324	9000	4	5	0	4
361	18000	6	3	1	4
378	12000	6	5	2	1
385	8000	7	4	2	0
395	5000	4	5	0	1
455	47000	6	2	0	4
497	52000	4	2	1	2

	count	mean	std	min	25%	50%	75%	max
Avg_Credit_Limit	639.0	34532.081377	37450.554493	3000.0	11000.0	18000.0	48000.0	200000.0
Total_Credit_Cards	639.0	4.699531	2.180100	1.0	3.0	5.0	6.0	10.0
Total_visits_bank	639.0	2.397496	1.620324	0.0	1.0	2.0	4.0	5.0
Total_visits_online	639.0	2.619718	2.942125	0.0	1.0	2.0	4.0	15.0
Total_calls_made	639.0	3.600939	2.870573	0.0	1.0	3.0	5.0	10.0

	count
Labels
0	372
2	219
1	48

Unsupervised Learning Project: AllLife Bank Customer Segmentation¶

Context¶

Objective¶

About the data¶

Importing libraries and overview of the dataset¶

Loading the data¶

Data Overview¶

Data Preprocessing and Exploratory Data Analysis¶

Check the summary Statistics¶

Scaling the data¶

Applying PCA on scaled data¶

K-Means¶

Create the cluster profiles using the summary statistics and box plots for each label¶

Gaussian Mixture Model¶

Create the cluster profiles using the summary statistics and box plots for each label¶

Compare the clusters from both algorithms - K-means and Gaussian Mixture Model¶

K-Medoids¶

Create cluster profiles using the summary statistics and box plots for each label¶

Compare the clusters from K-Means and K-Medoids¶

Conclusions and Business Recommendations¶

	group_0 Mean	group_1 Mean	group_2 Mean	group_0 Median	group_1 Median	group_2 Median
Avg_Credit_Limit	33922.043011	140937.500000	12246.575342	31500.0	145500.0	12000.0
Total_Credit_Cards	5.516129	8.833333	2.406393	6.0	9.0	2.0
Total_visits_bank	3.481183	0.604167	0.949772	3.0	1.0	1.0
Total_visits_online	0.981183	10.958333	3.575342	1.0	11.0	4.0
Total_calls_made	2.002688	1.062500	6.872146	2.0	1.0	7.0

	group_0 Mean	group_1 Mean	group_2 Mean	group_0 Median	group_1 Median	group_2 Median
Avg_Credit_Limit	12222.727273	84939.393939	28449.477352	12000.0	68000.0	20000.0
Total_Credit_Cards	2.418182	7.037879	5.372822	2.0	7.0	5.0
Total_visits_bank	0.954545	1.704545	3.822300	1.0	2.0	4.0
Total_visits_online	3.568182	4.583333	0.989547	4.0	2.0	1.0
Total_calls_made	6.859091	1.962121	1.857143	7.0	2.0	2.0

	Avg_Credit_Limit	Avg_Credit_Limit	Total_Credit_Cards	Total_Credit_Cards	Total_visits_bank	Total_visits_bank	Total_visits_online	Total_visits_online	Total_calls_made	Total_calls_made
group_0 Mean	12222.727273	33922.043011	2.418182	5.516129	0.954545	3.481183	3.568182	0.981183	6.859091	2.002688
group_1 Mean	84939.393939	140937.500000	7.037879	8.833333	1.704545	0.604167	4.583333	10.958333	1.962121	1.062500
group_2 Mean	28449.477352	12246.575342	5.372822	2.406393	3.822300	0.949772	0.989547	3.575342	1.857143	6.872146
group_0 Median	12000.000000	31500.000000	2.000000	6.000000	1.000000	3.000000	4.000000	1.000000	7.000000	2.000000
group_1 Median	68000.000000	145500.000000	7.000000	9.000000	2.000000	1.000000	2.000000	11.000000	2.000000	1.000000
group_2 Median	20000.000000	12000.000000	5.000000	2.000000	4.000000	1.000000	1.000000	4.000000	2.000000	7.000000

	count
kmedoLabels
2	287
0	220
1	132