In [None]:
import os
import pandas as pd
from tqdm import tqdm
import datetime 
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import ADASYN
from sklearn.metrics import recall_score, precision_score, f1_score, make_scorer, accuracy_score
import xgboost as xgb
from sklearn.metrics import ConfusionMatrixDisplay
import time
import lime
import shap
from lime.lime_tabular import LimeTabularExplainer

In [None]:
def training(model, parameters=None, target_name=None, model_name=None,
             cm_path='CM_GB.png'):
    """Grid-search ``model``, refit the winner on ADASYN-resampled data and evaluate it.

    The function reads notebook-level globals that must exist before the call:
    ``X_val_encoded``/``y_val`` (grid-search data), ``X_train_encoded``/``y_train``
    (final fit), ``X_test``/``y_test`` (evaluation) and the fitted transformer ``ct``.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    parameters : dict
        Hyper-parameter grid handed to ``GridSearchCV``.
    target_name : str, optional
        Not used by the body; kept so existing calls keep working.
    model_name : str, optional
        Not used by the body; kept so existing calls keep working.
    cm_path : str, default 'CM_GB.png'
        File the confusion-matrix figure is saved to. The default is the
        previously hard-coded name, so every call that omits it overwrites
        the same file.

    Returns
    -------
    estimator
        Best estimator of the grid search, refitted on the resampled training data.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    resampler = ADASYN(random_state=0, n_neighbors=4)
    recall = make_scorer(recall_score)

    grid_obj = GridSearchCV(model,
                            parameters,
                            n_jobs=-1,
                            cv=skf,
                            return_train_score=True,
                            scoring=recall)

    # Hyper-parameters are selected by cross-validated recall on the validation split.
    grid_obj = grid_obj.fit(X_val_encoded, y_val)
    model = grid_obj.best_estimator_

    # Oversample the minority class of the training split only, then refit
    # the selected configuration on the balanced data.
    X_res, y_res = resampler.fit_resample(X_train_encoded, y_train)
    model.fit(X_res, y_res)

    # Evaluate on the untouched test split.
    y_pred = model.predict(ct.transform(X_test))
    p_score = precision_score(y_test, y_pred, pos_label=1)
    r_score = recall_score(y_test, y_pred, pos_label=1)
    accuracy = accuracy_score(y_test, y_pred)

    print('accuracy:', accuracy)
    print('recall:', r_score)
    print('precision:', p_score)

    plt.rcParams.update({'font.size': 14})
    # Draw on an explicit axes: without ``ax`` the display opens its own
    # figure, leaving a blank 6x6 figure behind and saving a default-sized one.
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred,
                                            colorbar=False, cmap='Oranges',
                                            ax=ax)
    ax.set_title('Model Prediction', fontsize=12)
    ax.set_xlabel('Predicted Label', fontsize=12)
    ax.set_ylabel('True Label', fontsize=12)
    fig.tight_layout()
    fig.savefig(cm_path, dpi=300)

    return model

## Data Audit

- VOC Data

In [None]:
# Root folder of the input files, given relative to the working directory.
DATA_PATH = '../data'

In [None]:
# Narrow DATA_PATH to the 'Test' sub-folder. The guard keeps this cell
# idempotent: re-running it must not produce '.../Test/Test'.
if os.path.basename(os.path.normpath(DATA_PATH)) != 'Test':
    DATA_PATH = os.path.join(DATA_PATH, 'Test')
# os.listdir returns entries in arbitrary order; sorting makes the later
# concatenation (and everything that depends on row order) reproducible.
files_list = sorted(os.listdir(DATA_PATH))
print(f'There are {len(files_list)} files in Test')

In [None]:
# One DataFrame per file of the Test folder, keyed by file name.
df = {
    file: pd.read_csv(os.path.join(DATA_PATH, file))
    for file in tqdm(files_list)
}

In [None]:
# Verify that every loaded file exposes exactly the same set of columns.
features = []
ref = []
diff = {}
for key, value in df.items():
    columns = set(value.columns)
    if not ref:
        # First file seen: its columns become the reference set.
        ref = columns
        continue

    diff = ref.symmetric_difference(columns)
    if diff:
        # `raise 'text'` is itself a TypeError ("exceptions must derive from
        # BaseException"), so raise a real exception that names the culprit.
        raise ValueError(
            f'Columns are different in {key}: {sorted(map(str, diff))}')

print(f'List of features: {ref}')
print(f'Number of features: {len(ref)}')

The features are consistent across all files.

In [None]:
# Stack all per-file frames in a single call. Concatenating inside a loop
# re-copies the accumulated frame on every iteration and starts from an empty
# frame. The fallback keeps the "no files" case an empty DataFrame.
df_voc = pd.concat(list(df.values())) if df else pd.DataFrame()

In [None]:
df_voc.head(5)

In [None]:
df_voc.shape

In [None]:
null_percentage = pd.DataFrame(
    (100 * df_voc.isna().sum() / df_voc.shape[0]).sort_values(ascending=False),
     columns=['Percentage'])

display(null_percentage)
num_null = len(null_percentage[null_percentage.Percentage != 0])
print(f'Number of features with null values: {num_null}')

In [None]:
dup = df_voc.shape[0] - df_voc.drop_duplicates().shape[0]
print(f'There are {dup} duplicate rows')

- Valuable Data

In [None]:
df_valuable = pd.read_excel("../data/VOC Valuable_.xlsx")

In [None]:
df_valuable.shape

In [None]:
null_percentage = pd.DataFrame(
    (100 * df_valuable.isna().sum() / df_valuable.shape[0]).sort_values(ascending=False),
     columns=['Percentage'])

display(null_percentage)
num_null = len(null_percentage[null_percentage.Percentage != 0])
print(f'Number of features with null values: {num_null}')

In [None]:
dup = df_valuable.shape[0] - df_valuable.drop_duplicates().shape[0]
print(f'There are {dup} duplicate rows')

## Data Prep

In [None]:
df_voc['Date'] = pd.to_datetime(df_voc['Minute(Contact Date)']).dt.date

In [None]:
df_valuable['Date'] = pd.to_datetime(df_valuable['Day(Contact Date)']).dt.date

In [None]:
df_valuable.rename(columns = {'VOC? Yes=1 No-0' : 'VOC',
                              'VOC Valuable=1 Not Valuable=0':'Valuable'},
                   inplace = True)

In [None]:
check = df_valuable['Ticket ID'].tolist()

In [None]:
df_test =  pd.merge(df_voc[df_voc['Ticket ID'].isin(check)][['Analytic Genre Tags', 'Content Category Tags',
                                                             'Content Type Tags', 'Dev. Type ID Tags', 'Dev. Type Name Tags',
                                                             'Error Tags', 'Hashtags', 'MoP Tags', 'Primary Hashtag',
                                                             'Show Tags', 'Title ID Tags', 'Ticket ID', 'Date']],
                    df_valuable[['Ticket ID', 'Date', 'VOC', 'Valuable']],
                    how = 'left', on = ['Ticket ID', 'Date'])

In [None]:
df_test.shape

In [None]:
null_percentage = pd.DataFrame(
    (100 * df_test.isna().sum() / df_test.shape[0]).sort_values(ascending=False),
     columns=['Percentage'])

display(null_percentage)
num_null = len(null_percentage[null_percentage.Percentage != 0])
print(f'Number of features with null values: {num_null}')

In [None]:
dup = df_test.shape[0] - df_test.drop_duplicates().shape[0]
print(f'There are {dup} duplicate rows')

In [None]:
df_test = df_test.drop_duplicates()

## Data Visualization

In [None]:
# Cast every column (labels included) to str. Missing values become the
# literal string 'nan', which later cells test for explicitly.
df_test = df_test.astype(str)

*Analytic Genre Tags*

In [None]:
test = df_test.groupby(['Analytic Genre Tags']).aggregate(
count = ('Analytic Genre Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Analytic Genre Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Analytic Genre Tags")
plt.show()

*Content Category Tags*

In [None]:
test = df_test.groupby(['Content Category Tags']).aggregate(
count = ('Content Category Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Content Category Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Content Category Tags")
plt.show()

*Content Type Tags*

In [None]:
test = df_test.groupby(['Content Type Tags']).aggregate(
count = ('Content Type Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Content Type Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Content Type Tags")
plt.show()

*Dev. Type ID Tags*

In [None]:
test = df_test.groupby(['Dev. Type ID Tags']).aggregate(
count = ('Dev. Type ID Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Dev. Type ID Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Dev. Type ID Tags")
plt.show()

*Dev. Type Name Tags*

In [None]:
test = df_test.groupby(['Dev. Type Name Tags']).aggregate(
count = ('Dev. Type Name Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
test.shape

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Dev. Type Name Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Dev. Type Name Tags")
plt.show()

*Error Tags*

In [None]:
test = df_test.groupby(['Error Tags']).aggregate(
count = ('Error Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Error Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Error Tags")
plt.show()

In [None]:
df_test.loc[df_test["Error Tags"] == "nan", "Error Tags"] = "no tags"

*Hashtags*

In [None]:
test = df_test.groupby(['Hashtags']).aggregate(
count = ('Hashtags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Hashtags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Hashtags")
plt.show()

*MoP Tags*

In [None]:
test = df_test.groupby(['MoP Tags']).aggregate(
count = ('MoP Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['MoP Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("MoP Tags")
plt.show()

In [None]:
df_test.loc[df_test["MoP Tags"] == "nan", "MoP Tags"] = "no tags"

*Primary Hashtag*

In [None]:
test = df_test.groupby(['Primary Hashtag']).aggregate(
count = ('Primary Hashtag', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Primary Hashtag'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Primary Hashtag")
plt.show()

*Show Tags*

In [None]:
test = df_test.groupby(['Show Tags']).aggregate(
count = ('Show Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Show Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Show Tags")
plt.show()

*Title ID Tags*

In [None]:
test = df_test.groupby(['Title ID Tags']).aggregate(
count = ('Title ID Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Title ID Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Title ID Tags")
plt.show()

*VOC*

In [None]:
test = df_test.groupby(['VOC']).aggregate(
count = ('VOC', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['VOC'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Voice of Customer")
plt.show()

*Valuable*

In [None]:
test = df_test.groupby(['Valuable']).aggregate(
count = ('Valuable', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Valuable'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Valuable VOC")
plt.show()

# Cramer's V - VOC

In [None]:
# Candidate predictors: every column except the two labels, the two merge
# keys and 'Primary Hashtag'.
features = df_test.drop(['VOC', 'Valuable', 'Date', 'Ticket ID', 'Primary Hashtag'], axis=1).columns.tolist()

In [None]:
# Result table: one row per feature (chi-square statistic, p-value, Cramér's V).
VOC_cramer = pd.DataFrame(columns = ['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
# Association between the VOC label and every candidate feature.
cramer_rows = []
for feature in features:

    test_crosstabs = pd.crosstab(index=df_test.VOC, columns=df_test[feature])

    # Chi-square test of independence on the contingency table
    X2 = stats.chi2_contingency(test_crosstabs)
    chi_stat = X2[0]
    pvalue = X2[1]

    # Cramér's V = sqrt(chi2 / (N * (min(rows, cols) - 1))), with N the
    # number of observations in the table.
    N = test_crosstabs.to_numpy().sum()
    minimum_dimension = min(test_crosstabs.shape) - 1
    if minimum_dimension > 0:
        v = np.sqrt((chi_stat / N) / minimum_dimension)
    else:
        # Only one row or one column: no association can be measured and the
        # formula would divide by zero.
        v = np.nan

    cramer_rows.append([feature, chi_stat, pvalue, v])

# Build the table in one go, so re-running the cell does not append a second
# copy of every row.
VOC_cramer = pd.DataFrame(cramer_rows,
                          columns=['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
VOC_cramer.sort_values('cramer', ascending = False)

# Cramer's V - Valuable

In [None]:
# Result table: one row per feature (chi-square statistic, p-value, Cramér's V).
VOC_valuable = pd.DataFrame(columns = ['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
# Association between the Valuable label and every candidate feature.
cramer_rows = []
for feature in features:

    test_crosstabs = pd.crosstab(index=df_test.Valuable, columns=df_test[feature])

    # Chi-square test of independence on the contingency table
    X2 = stats.chi2_contingency(test_crosstabs)
    chi_stat = X2[0]
    pvalue = X2[1]

    # Cramér's V = sqrt(chi2 / (N * (min(rows, cols) - 1))), with N the
    # number of observations in the table.
    N = test_crosstabs.to_numpy().sum()
    minimum_dimension = min(test_crosstabs.shape) - 1
    if minimum_dimension > 0:
        v = np.sqrt((chi_stat / N) / minimum_dimension)
    else:
        # Only one row or one column: no association can be measured and the
        # formula would divide by zero.
        v = np.nan

    cramer_rows.append([feature, chi_stat, pvalue, v])

# Build the table in one go, so re-running the cell does not append a second
# copy of every row.
VOC_valuable = pd.DataFrame(cramer_rows,
                            columns=['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
VOC_valuable.sort_values('cramer', ascending = False)

# Model - VOC

In [None]:
# The labels were stringified by the earlier astype(str); turn them back into
# integers before modelling.
df_test[['VOC', 'Valuable']] = df_test[['VOC', 'Valuable']].astype(int)

In [None]:
# Explicit copy: the loop below writes into df_dummy column by column, which
# on a bare df_test[features] selection triggers SettingWithCopyWarning.
df_dummy = df_test[features].copy()

categorical_names = {}       # feature -> array of original category labels
label_encoders_dict = {}     # feature -> fitted LabelEncoder
label_encoders_mapping = {}  # feature -> {original label: integer code}
categorical_cols = features
for feature in categorical_cols:
    label_encoder = LabelEncoder()
    df_dummy[feature] = label_encoder.fit_transform(df_dummy[feature])
    categorical_names[feature] = label_encoder.classes_
    integer_mapping = {l: i for i, l in enumerate(label_encoder.classes_)}
    label_encoders_mapping[feature] = integer_mapping
    label_encoders_dict[feature] = label_encoder

X = df_dummy.copy()
y = df_test['VOC']

# One-hot encode every (label-encoded) column. The positions are derived from
# X instead of being hard-coded as 0..9.
# NOTE(review): both encoders are fitted on all rows before the train/test
# split, so categories that occur only in the test rows are known to them.
ct = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                   list(range(X.shape[1])))],
    remainder='passthrough')
ct.fit(X)

# 80 / 20 stratified split into train and test ...
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=143,
                                                    stratify=y)
# ... then 15 % of the encoded training rows are set aside as validation data.
X_train_encoded = ct.transform(X_train)
X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(X_train_encoded,
                                                 y_train,
                                                 test_size=0.15,
                                                 random_state=143,
                                                 stratify=y_train)

In [None]:
# Random Forest
# Random Forest
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and scikit-learn 1.3 removed it,
# which makes every fit that uses it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'min_samples_split': [2, 3, 4, 5],
              'n_estimators': [50, 75, 100, 150],
              'max_features': ['sqrt', 'log2'],
              }


model_rf = RandomForestClassifier(random_state=123)
model_rf = training(parameters=parameters, target_name='VOC',
                    model_name='RandomForestClassifier', model=model_rf)
model_rf

In [None]:
# XGBoost
# XGBoost, linear booster only.
parameters = dict(
    booster=['gblinear'],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    n_estimators=[5, 10, 15, 20, 30, 50, 70, 100, 150],
)

model_xgb = training(
    model=xgb.XGBClassifier(random_state=123),
    parameters=parameters,
    target_name='VOC',
    model_name='XGB',
)
model_xgb

In [None]:
# Gradient Boosting
# Gradient Boosting
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and scikit-learn 1.3 removed it,
# which makes every fit that uses it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'n_estimators': [50, 75, 100, 150],
              'learning_rate': [0.05, 0.1, 0.2],
              'max_features': ['sqrt', 'log2'],
              }

model_gb = GradientBoostingClassifier(random_state=123)
model_gb = training(parameters=parameters, target_name='VOC',
                    model_name='GradientBoostingClassifier', model=model_gb)

model_gb

Explain Best Model

- LIME

In [None]:
# LIME perturbs rows in the label-encoded feature space; the wrapper one-hot
# encodes them with `ct` before asking the model for class probabilities.
predict_fn = lambda x: model_gb.predict_proba(ct.transform(x)).astype(float)

# Every column of X_train is a label-encoded categorical. Derive the positions
# from the data: the former hard-coded list 0..10 named one column too many.
categorical_idx = list(range(X_train.shape[1]))
# LIME looks category labels up by column position, not by column name, so
# re-key the name-indexed dictionary.
categorical_names_by_idx = {idx: list(categorical_names[name])
                            for idx, name in enumerate(X_train.columns)}

explainer_lime = LimeTabularExplainer(X_train.values, class_names=['non - VOC', 'VOC'],
                                      feature_names=features,
                                      categorical_features=categorical_idx,
                                      categorical_names=categorical_names_by_idx, kernel_width=3,
                                      random_state=143,
                                      discretize_continuous=True)

# Explain a handful of test rows.
indices = X_test.index.tolist()[5:10]
for i in indices:
    print(i)
    # Pass a plain array: LIME indexes the row by integer position.
    exp = explainer_lime.explain_instance(X_test.loc[i].to_numpy(), predict_fn,
                                          num_features=X_train_encoded.shape[1])
    exp.show_in_notebook(show_all=False)
    

- Feature Importance

In [None]:
# Output column names of the fitted column transformer.
features_new = ct.get_feature_names_out()

In [None]:
# transform feature names to original
# get index of categorical columns
# Map the transformer's output column names back to readable
# '<original column>=<original category>' labels.

# Positions of the one-hot ('cat__') columns in the transformer output; None elsewhere.
result_cat = np.char.find([str(i) for i in ct.get_feature_names_out()], 'cat__', start=0)
idx_cat = [idx if val != -1 else None for idx, val in enumerate(result_cat)]
# Positions of the passed-through ('remainder__') columns; None elsewhere.
result_numeric = np.char.find([str(i) for i in ct.get_feature_names_out()], 'remainder__', start=0)
idx_numeric = [idx if val != -1 else None for idx, val in enumerate(result_numeric)]

categorical_cols = ['Analytic Genre Tags', 'Content Category Tags', 'Content Type Tags',
                    'Dev. Type ID Tags', 'Dev. Type Name Tags', 'Error Tags', 'Hashtags',
                    'MoP Tags', 'Show Tags', 'Title ID Tags']
numeric_cols = []

# Names the transformer actually emits, as sets: each candidate name is then
# checked directly instead of being compared against every output column.
cat_names = {features_new[i] for i in idx_cat if i is not None}
numeric_names = {features_new[i] for i in idx_numeric if i is not None}

prefix = 'cat__'
new_cols_dict = {}
for col in categorical_cols:
    for key, label in label_encoders_mapping[col].items():
        # One-hot columns are named '<prefix><column>_<integer code>'.
        encoded_name = prefix + str(col) + '_' + str(label)
        if encoded_name in cat_names:
            new_cols_dict[encoded_name] = col + '=' + key

prefix = 'remainder__'
for col in numeric_cols:
    encoded_name = prefix + str(col)
    if encoded_name in numeric_names:
        new_cols_dict[encoded_name] = col

In [None]:
# Readable name of every one-hot column, in the transformer's output order.
cols_name = [new_cols_dict[feature] for feature in features_new]
# Importances of the fitted gradient-boosting model. Read once; the former
# indentation re-assigned them on every pass of the naming loop.
coefs = model_gb.feature_importances_
abs_coefs = np.abs(coefs)
top_predictor = cols_name[np.argmax(abs_coefs)]

# Positions of the 20 largest importances, smallest first. Bars and tick
# labels are both taken from this one ordering so they cannot get out of step.
top_idx = np.argsort(abs_coefs)[-20:]

coefs_count = len(abs_coefs)
fig, ax = plt.subplots(figsize=(10,15))
bars = ax.barh(np.arange(coefs_count)[-20:], abs_coefs[top_idx], color='orange')
ax.set_yticks(np.arange(coefs_count)[-20:])
ax.set_yticklabels(np.array(cols_name)[top_idx], fontsize=14)

ax.bar_label(bars, label_type='center', fmt='%.2f')
ax.set_title('Important Features with weights', fontsize=16)
plt.xticks(fontsize=14)
plt.show()


- SHAP

In [None]:
# One-hot encoded test rows with readable column names.
X_encoded = ct.transform(X_test)
df_encoded_test = pd.DataFrame(X_encoded.toarray())
df_encoded_test.columns = cols_name

explainer = shap.TreeExplainer(model_gb,
                               feature_names=cols_name)
shap_values = explainer.shap_values(df_encoded_test)
# Summarise the effect of every feature on the model output.
plt.figure(figsize = (10, 7))
# show=False keeps the figure open: otherwise summary_plot displays it
# immediately and the tight_layout() below acts on a new, empty figure.
shap.summary_plot(shap_values = shap_values,
                  features = df_encoded_test,
                  show = False)
plt.tight_layout()
plt.show()
# Model - Valuable

In [None]:
# Explicit copy: the loop below writes into df_dummy column by column, which
# on a bare df_test[features] selection triggers SettingWithCopyWarning.
df_dummy = df_test[features].copy()

categorical_names = {}       # feature -> array of original category labels
label_encoders_dict = {}     # feature -> fitted LabelEncoder
label_encoders_mapping = {}  # feature -> {original label: integer code}
categorical_cols = features
for feature in categorical_cols:
    label_encoder = LabelEncoder()
    df_dummy[feature] = label_encoder.fit_transform(df_dummy[feature])
    categorical_names[feature] = label_encoder.classes_
    integer_mapping = {l: i for i, l in enumerate(label_encoder.classes_)}
    label_encoders_mapping[feature] = integer_mapping
    label_encoders_dict[feature] = label_encoder

X = df_dummy.copy()
y = df_test['Valuable']

# One-hot encode every (label-encoded) column. The positions are derived from
# X instead of being hard-coded as 0..9.
# NOTE(review): both encoders are fitted on all rows before the train/test
# split, so categories that occur only in the test rows are known to them.
ct = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'),
                   list(range(X.shape[1])))],
    remainder='passthrough')
ct.fit(X)

# 80 / 20 stratified split into train and test ...
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=143,
                                                    stratify=y)
# ... then 15 % of the encoded training rows are set aside as validation data.
X_train_encoded = ct.transform(X_train)
X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(X_train_encoded,
                                                 y_train,
                                                 test_size=0.15,
                                                 random_state=143,
                                                 stratify=y_train)

In [None]:
# Random Forest
# Random Forest
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and scikit-learn 1.3 removed it,
# which makes every fit that uses it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'min_samples_split': [2, 3, 4, 5],
              'n_estimators': [50, 75, 100, 150],
              'max_features': ['sqrt', 'log2'],
              }


model_rf = RandomForestClassifier(random_state=123)
model_rf = training(parameters=parameters, target_name='Valuable',
                    model_name='RandomForestClassifier', model=model_rf)
model_rf

In [None]:
# XGBoost
# XGBoost, linear booster only.
parameters = dict(
    booster=['gblinear'],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    n_estimators=[5, 10, 15, 20, 30, 50, 70, 100, 150],
)

model_xgb = training(
    model=xgb.XGBClassifier(random_state=123),
    parameters=parameters,
    target_name='Valuable',
    model_name='XGB',
)
model_xgb

In [None]:
# Gradient Boosting
# Gradient Boosting
# 'auto' is left out of max_features: for classifiers it was an alias of
# 'sqrt' (so it only duplicated grid points) and scikit-learn 1.3 removed it,
# which makes every fit that uses it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'n_estimators': [50, 75, 100, 150],
              'learning_rate': [0.05, 0.1, 0.2],
              'max_features': ['sqrt', 'log2'],
              }

model_gb = GradientBoostingClassifier(random_state=123)
model_gb = training(parameters=parameters, target_name='Valuable',
                    model_name='GradientBoostingClassifier', model=model_gb)

model_gb

Explain Best Model

- LIME

In [None]:
# LIME perturbs rows in the label-encoded feature space; the wrapper one-hot
# encodes them with `ct` before asking the model for class probabilities.
predict_fn = lambda x: model_rf.predict_proba(ct.transform(x)).astype(float)

# Every column of X_train is a label-encoded categorical. Derive the positions
# from the data: the former hard-coded list 0..10 named one column too many.
categorical_idx = list(range(X_train.shape[1]))
# LIME looks category labels up by column position, not by column name, so
# re-key the name-indexed dictionary.
categorical_names_by_idx = {idx: list(categorical_names[name])
                            for idx, name in enumerate(X_train.columns)}

explainer_lime = LimeTabularExplainer(X_train.values, class_names=['non - Valuable', 'Valuable'],
                                      feature_names=features,
                                      categorical_features=categorical_idx,
                                      categorical_names=categorical_names_by_idx, kernel_width=3,
                                      random_state=143,
                                      discretize_continuous=True)

# Explain a handful of test rows.
indices = X_test.index.tolist()[5:10]
for i in indices:
    print(i)
    # Pass a plain array: LIME indexes the row by integer position.
    exp = explainer_lime.explain_instance(X_test.loc[i].to_numpy(), predict_fn,
                                          num_features=X_train_encoded.shape[1])
    exp.show_in_notebook(show_all=False)
    

- Feature Importance

In [None]:
# Output column names of the fitted column transformer.
features_new = ct.get_feature_names_out()

In [None]:
# transform feature names to original
# get index of categorical columns
# Map the transformer's output column names back to readable
# '<original column>=<original category>' labels.

# Positions of the one-hot ('cat__') columns in the transformer output; None elsewhere.
result_cat = np.char.find([str(i) for i in ct.get_feature_names_out()], 'cat__', start=0)
idx_cat = [idx if val != -1 else None for idx, val in enumerate(result_cat)]
# Positions of the passed-through ('remainder__') columns; None elsewhere.
result_numeric = np.char.find([str(i) for i in ct.get_feature_names_out()], 'remainder__', start=0)
idx_numeric = [idx if val != -1 else None for idx, val in enumerate(result_numeric)]

categorical_cols = ['Analytic Genre Tags', 'Content Category Tags', 'Content Type Tags',
                    'Dev. Type ID Tags', 'Dev. Type Name Tags', 'Error Tags', 'Hashtags',
                    'MoP Tags', 'Show Tags', 'Title ID Tags']
numeric_cols = []

# Names the transformer actually emits, as sets: each candidate name is then
# checked directly instead of being compared against every output column.
cat_names = {features_new[i] for i in idx_cat if i is not None}
numeric_names = {features_new[i] for i in idx_numeric if i is not None}

prefix = 'cat__'
new_cols_dict = {}
for col in categorical_cols:
    for key, label in label_encoders_mapping[col].items():
        # One-hot columns are named '<prefix><column>_<integer code>'.
        encoded_name = prefix + str(col) + '_' + str(label)
        if encoded_name in cat_names:
            new_cols_dict[encoded_name] = col + '=' + key

prefix = 'remainder__'
for col in numeric_cols:
    encoded_name = prefix + str(col)
    if encoded_name in numeric_names:
        new_cols_dict[encoded_name] = col

In [None]:
# Readable name of every one-hot column, in the transformer's output order.
cols_name = [new_cols_dict[feature] for feature in features_new]
# Importances of the fitted random forest. Read once; the former indentation
# re-assigned them on every pass of the naming loop.
coefs = model_rf.feature_importances_
abs_coefs = np.abs(coefs)
top_predictor = cols_name[np.argmax(abs_coefs)]

# Positions of the 20 largest importances, smallest first. Bars and tick
# labels are both taken from this one ordering so they cannot get out of step.
top_idx = np.argsort(abs_coefs)[-20:]

coefs_count = len(abs_coefs)
fig, ax = plt.subplots(figsize=(10,15))
bars = ax.barh(np.arange(coefs_count)[-20:], abs_coefs[top_idx], color='orange')
ax.set_yticks(np.arange(coefs_count)[-20:])
ax.set_yticklabels(np.array(cols_name)[top_idx], fontsize=14)

ax.bar_label(bars, label_type='center', fmt='%.2f')
ax.set_title('Important Features with weights', fontsize=16)
plt.xticks(fontsize=14)
plt.show()


- SHAP

In [None]:
# One-hot encoded test rows with readable column names.
X_encoded = ct.transform(X_test)
df_encoded_test = pd.DataFrame(X_encoded.toarray())
df_encoded_test.columns = cols_name

explainer = shap.TreeExplainer(model_rf,
                               feature_names=cols_name)
shap_values = explainer.shap_values(df_encoded_test)
# Summarise the effect of every feature on class 1. Depending on the shap
# version a random forest yields one array per class (a list) or one
# (rows, features, classes) array; pick the class-1 slice in either case.
# NOTE(review): confirm against the installed shap version.
if isinstance(shap_values, list):
    shap_values_class1 = shap_values[1]
elif np.ndim(shap_values) == 3:
    shap_values_class1 = shap_values[:, :, 1]
else:
    shap_values_class1 = shap_values
plt.figure(figsize = (10, 7))
# show=False keeps the figure open: otherwise summary_plot displays it
# immediately and the tight_layout() below acts on a new, empty figure.
shap.summary_plot(shap_values = shap_values_class1,
                  features = df_encoded_test,
                  show = False)
plt.tight_layout()
plt.show()

## ONLY TICKET DATA ##

## Data Prep

In [None]:
# Ticket-only view: the tag columns plus the ticket identifier and contact date.
df_test = df_voc.loc[:, [
    'Analytic Genre Tags', 'Content Category Tags',
    'Content Type Tags', 'Dev. Type ID Tags', 'Dev. Type Name Tags',
    'Error Tags', 'Hashtags', 'MoP Tags', 'Primary Hashtag',
    'Show Tags', 'Title ID Tags', 'Ticket ID', 'Date',
]]

In [None]:
df_test.shape

In [None]:
null_percentage = pd.DataFrame(
    (100 * df_test.isna().sum() / df_test.shape[0]).sort_values(ascending=False),
     columns=['Percentage'])

display(null_percentage)
num_null = len(null_percentage[null_percentage.Percentage != 0])
print(f'Number of features with null values: {num_null}')

In [None]:
dup = df_test.shape[0] - df_test.drop_duplicates().shape[0]
print(f'There are {dup} duplicate rows')

In [None]:
df_test = df_test.drop_duplicates()

## Data Visualization

In [None]:
# Cast every column to str. Missing values become the literal string 'nan',
# which later cells test for explicitly.
df_test = df_test.astype(str)

*Analytic Genre Tags*

In [None]:
test = df_test.groupby(['Analytic Genre Tags']).aggregate(
count = ('Analytic Genre Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Analytic Genre Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Analytic Genre Tags")
plt.show()

*Content Category Tags*

In [None]:
test = df_test.groupby(['Content Category Tags']).aggregate(
count = ('Content Category Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Content Category Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Content Category Tags")
plt.show()

*Content Type Tags*

In [None]:
test = df_test.groupby(['Content Type Tags']).aggregate(
count = ('Content Type Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Content Type Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Content Type Tags")
plt.show()

*Dev. Type ID Tags*

In [None]:
test = df_test.groupby(['Dev. Type ID Tags']).aggregate(
count = ('Dev. Type ID Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
test.shape

In [None]:
test = test.tail(25)

fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Dev. Type ID Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Dev. Type ID Tags")
plt.show()

*Dev. Type Name Tags*

In [None]:
test = df_test.groupby(['Dev. Type Name Tags']).aggregate(
count = ('Dev. Type Name Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
test.shape

In [None]:
test = test.tail(25)

fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Dev. Type Name Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Dev. Type Name Tags")
plt.show()

*Error Tags*

In [None]:
test = df_test.groupby(['Error Tags']).aggregate(
count = ('Error Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
test.shape

In [None]:
test = test.tail(25)

fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Error Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Error Tags")
plt.show()

In [None]:
df_test.loc[df_test["Error Tags"] == "nan", "Error Tags"] = "no tags"

*Hashtags*

In [None]:
test = df_test.groupby(['Hashtags']).aggregate(
count = ('Hashtags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
test.shape

In [None]:
test = test.tail(25)

fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['Hashtags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Hashtags")
plt.show()

*MoP Tags*

In [None]:
test = df_test.groupby(['MoP Tags']).aggregate(
count = ('MoP Tags', 'count')).reset_index().sort_values(['count'], ascending = True)

test['%_count'] = ((test['count']/test['count'].sum())*100).round(4)

In [None]:
fig = plt.figure(figsize = (10, 7))
 
# creating the bar plot
plt.barh(test['MoP Tags'], 
         test['%_count'],
         color = 'orange')
 
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("MoP Tags")
plt.show()

In [None]:
df_test.loc[df_test["MoP Tags"] == "nan", "MoP Tags"] = "no tags"

*Primary Hashtag*

In [None]:
# Ticket count per distinct 'Primary Hashtag' value, least frequent first.
test = (
    df_test.groupby(['Primary Hashtag'])
    .agg(count=('Primary Hashtag', 'count'))
    .reset_index()
    .sort_values(by=['count'], ascending=True)
)

# Share of the total count, in percent, rounded to 4 decimals.
test['%_count'] = (test['count'] / test['count'].sum() * 100).round(4)

In [None]:
fig = plt.figure(figsize=(10, 7))

# Horizontal bar chart: one bar per primary hashtag, bar length = share in percent.
plt.barh(y=test['Primary Hashtag'], width=test['%_count'], color='orange')
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Primary Hashtag")
plt.show()

*Show Tags*

In [None]:
# Ticket count per distinct 'Show Tags' value, least frequent first.
test = (
    df_test.groupby(['Show Tags'])
    .agg(count=('Show Tags', 'count'))
    .reset_index()
    .sort_values(by=['count'], ascending=True)
)

# Share of the total count, in percent, rounded to 4 decimals.
test['%_count'] = (test['count'] / test['count'].sum() * 100).round(4)

In [None]:
test.shape

In [None]:
# The summary is sorted by ascending count, so the tail holds the 25 most
# frequent values.
test = test.tail(25)

fig = plt.figure(figsize=(10, 7))

# Horizontal bar chart: one bar per 'Show Tags' value, bar length = share in percent.
plt.barh(y=test['Show Tags'], width=test['%_count'], color='orange')
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Show Tags")
plt.show()

*Title ID Tags*

In [None]:
# Ticket count per distinct 'Title ID Tags' value, least frequent first.
test = (
    df_test.groupby(['Title ID Tags'])
    .agg(count=('Title ID Tags', 'count'))
    .reset_index()
    .sort_values(by=['count'], ascending=True)
)

# Share of the total count, in percent, rounded to 4 decimals.
test['%_count'] = (test['count'] / test['count'].sum() * 100).round(4)

In [None]:
fig = plt.figure(figsize=(10, 7))

# Horizontal bar chart: one bar per 'Title ID Tags' value, bar length = share in percent.
plt.barh(y=test['Title ID Tags'], width=test['%_count'], color='orange')
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Title ID Tags")
plt.show()

*VOC*

In [None]:
# Voice-of-Customer flag: tickets whose primary hashtag is "#product-feedback"
# get '1'; every row that still has no VOC value afterwards gets '0'.
df_test.loc[df_test['Primary Hashtag'] == "#product-feedback", 'VOC'] = '1'
# Assign the filled column back rather than calling fillna(inplace=True) on the
# df_test['VOC'] selection: the chained in-place form emits a warning in
# pandas 2.x and silently leaves df_test unchanged under Copy-on-Write.
df_test['VOC'] = df_test['VOC'].fillna('0')

In [None]:
# Ticket count per VOC flag value, least frequent first.
test = (
    df_test.groupby(['VOC'])
    .agg(count=('VOC', 'count'))
    .reset_index()
    .sort_values(by=['count'], ascending=True)
)

# Share of the total count, in percent, rounded to 4 decimals.
test['%_count'] = (test['count'] / test['count'].sum() * 100).round(4)

In [None]:
fig = plt.figure(figsize=(10, 7))

# Horizontal bar chart: one bar per VOC flag value, bar length = share in percent.
plt.barh(y=test['VOC'], width=test['%_count'], color='orange')
plt.xlabel("Number of Tickets (in %)")
plt.ylabel("Voice of Customer")
plt.show()

# Cramer's V - VOC

In [None]:
# Candidate predictors: every column except the two target flags ('VOC',
# 'Valuable') and 'Primary Hashtag', the column the VOC flag is derived from.
features = df_test.drop(['VOC', 'Valuable', 'Primary Hashtag'], axis=1).columns.tolist()

In [None]:
# Empty result table: one row per feature with its chi-square statistic,
# p-value and Cramér's V against VOC.
VOC_cramer = pd.DataFrame(columns = ['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
# Association between VOC and each candidate feature: chi-square test of
# independence plus Cramér's V as the effect size.
# Rows are collected in a list and the table is built once, so re-running this
# cell always yields exactly one row per feature.
cramer_rows = []
for feature in features:

    test_crosstabs = pd.crosstab(index=df_test.VOC, columns=df_test[feature])

    # Chi-square value (first two entries of the result: statistic, p-value)
    X2 = stats.chi2_contingency(test_crosstabs)
    chi_stat = X2[0]
    pvalue = X2[1]

    # Cramer's V value: sqrt(chi2 / (N * (min(rows, cols) - 1))).
    # N is the number of observations actually tabulated, which can be smaller
    # than len(df_test) because crosstab leaves out rows with missing values.
    N = test_crosstabs.to_numpy().sum()
    minimum_dimension = min(test_crosstabs.shape) - 1
    if minimum_dimension > 0:
        v = np.sqrt((chi_stat / N) / minimum_dimension)
    else:
        # A feature (or target) with a single level has no measurable association.
        v = np.nan

    cramer_rows.append([feature, chi_stat, pvalue, v])

VOC_cramer = pd.DataFrame(cramer_rows, columns=['feature', 'chi2', 'pvalue', 'cramer'])

In [None]:
VOC_cramer.sort_values('cramer', ascending = False)

# Model - VOC

In [None]:
# Cast both target columns to int (VOC was created above as the strings '1'/'0').
df_test[['VOC', 'Valuable']] = df_test[['VOC', 'Valuable']].astype(int)

In [None]:
# Work on an explicit copy so the label encoding below never writes into a
# slice of df_test (avoids pandas' SettingWithCopyWarning).
df_dummy = df_test[features].copy()

categorical_names = {}       # feature -> array of original class labels
label_encoders_dict = {}     # feature -> fitted LabelEncoder
label_encoders_mapping = {}  # feature -> {original label: integer code}
categorical_cols = features
for feature in categorical_cols:
    label_encoder = LabelEncoder()
    df_dummy[feature] = label_encoder.fit_transform(df_dummy[feature])
    categorical_names[feature] = label_encoder.classes_
    integer_mapping = {label: code for code, label in enumerate(label_encoder.classes_)}
    label_encoders_mapping[feature] = integer_mapping
    label_encoders_dict[feature] = label_encoder

X = df_dummy.copy()
y = df_test['VOC']

# One-hot encode every label-encoded feature column. The positions are derived
# from X instead of being hard-coded to 0-7, so any further feature columns are
# one-hot encoded as well rather than passed through as raw integer codes.
onehot_positions = list(range(X.shape[1]))
ct = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), onehot_positions)],
    remainder='passthrough')
# NOTE(review): the encoder vocabulary is learned from the full X (train and
# test rows alike) - confirm this is intended.
ct.fit(X)


# Split the data in two: train (80%) and test (20%), stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=143,
                                                    stratify=y)
# Encode the training part, then carve a 15% validation split out of it.
X_train_encoded = ct.transform(X_train)
X_train_encoded, X_val_encoded, y_train, y_val = train_test_split(X_train_encoded,
                                                 y_train,
                                                 test_size=0.15,
                                                 random_state=143,
                                                 stratify=y_train)

In [None]:
def training(model, parameters=None, target_name=None, model_name=None):
    """Tune ``model`` by grid search, refit it on oversampled data and evaluate it.

    The function works on notebook-level globals: ``X_val_encoded``/``y_val``,
    ``X_train_encoded``/``y_train``, ``X_test``/``y_test`` and the fitted
    column transformer ``ct``.

    Steps:
      1. Grid-search ``parameters`` with 5-fold stratified cross-validation on
         the validation split, scored by recall.
      2. Oversample the training split with ADASYN and refit the best
         estimator on the resampled data.
      3. Print accuracy, recall and precision on the test split and save a
         confusion-matrix figure.

    NOTE(review): a function with the same name is defined earlier in this
    notebook; running this cell replaces that definition.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    parameters : dict, optional
        Parameter grid handed to ``GridSearchCV``.
    target_name : str, optional
        Not used by the function; kept so existing calls keep working.
    model_name : str, optional
        Used to name the saved figure, ``CM_<model_name>.png``. When omitted
        the figure is written to ``CM_GB.png``.

    Returns
    -------
    estimator
        The best estimator found by the grid search, refitted on the
        resampled training data.
    """
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    resampler = ADASYN(random_state=0, n_neighbors=4)
    recall = make_scorer(recall_score)

    grid_obj = GridSearchCV(model,
                            parameters,
                            n_jobs=-1,
                            cv=skf,
                            scoring=recall)
    grid_obj.fit(X_val_encoded, y_val)
    model = grid_obj.best_estimator_

    # resample train data
    X_res, y_res = resampler.fit_resample(X_train_encoded, y_train)

    # fit best model on resampled train data
    model.fit(X_res, y_res)

    # test trained model
    y_pred = model.predict(ct.transform(X_test))
    p_score = precision_score(y_test, y_pred, pos_label=1)
    r_score = recall_score(y_test, y_pred, pos_label=1)
    accuracy = accuracy_score(y_test, y_pred)

    print('accuracy:', accuracy)
    print('recall:', r_score)
    print('precision:', p_score)

    # Font size is set before the axes are created so it applies to the plot.
    plt.rcParams.update({'font.size': 14})
    # Draw on an explicit axes: without ax=..., from_predictions opens its own
    # figure and a separately created figure would stay empty.
    fig, ax = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay.from_predictions(y_test, y_pred,
                                            colorbar=False, cmap='Oranges',
                                            ax=ax)
    ax.set_title('Model Prediction', fontsize=12)
    ax.set_xlabel('Predicted Label', fontsize=12)
    ax.set_ylabel('True Label', fontsize=12)
    fig.tight_layout()

    # One file per model, so successive runs do not overwrite each other.
    cm_file = 'CM_{}.png'.format(model_name) if model_name else 'CM_GB.png'
    fig.savefig(cm_file, dpi=300)

    return model

In [None]:
# Random Forest
# Random Forest search space.
# 'auto' is left out of max_features: for RandomForestClassifier it was an
# alias of 'sqrt' (so it only duplicated grid points) and scikit-learn 1.3
# removed it, which makes every fit using it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'min_samples_split': [2, 3, 4, 5],
              'n_estimators': [50, 75, 100, 150],
              'max_features': ['sqrt', 'log2'],
              }


model_rf = RandomForestClassifier(random_state=123)
model_rf = training(parameters=parameters, target_name='VOC',
                    model_name='RandomForestClassifier', model=model_rf)
model_rf

In [None]:
# XGBoost
# XGBoost search space: linear booster only, varying the learning rate and the
# number of boosting rounds.
parameters = dict(
    booster=['gblinear'],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.5],
    n_estimators=[5, 10, 15, 20, 30, 50, 70, 100, 150],
)

model_xgb = xgb.XGBClassifier(random_state=123)
model_xgb = training(model_xgb, parameters, target_name='VOC', model_name='XGB')
model_xgb

In [None]:
# Gradient Boosting
# Gradient Boosting search space.
# 'auto' is left out of max_features: for GradientBoostingClassifier it meant
# sqrt(n_features), i.e. the same as 'sqrt', and scikit-learn 1.3 removed it,
# which makes every fit using it fail.
parameters = {'max_depth': [5, 7, 8, 9, 10, 12],
              'n_estimators': [50, 75, 100, 150],
              'learning_rate': [0.05, 0.1, 0.2],
              'max_features': ['sqrt', 'log2'],
              }

model_gb = GradientBoostingClassifier(random_state=123)
model_gb = training(parameters=parameters, target_name='VOC',
                    model_name='GradientBoostingClassifier', model=model_gb)

model_gb

# Explain Best Model

- LIME

In [None]:
# LIME perturbs rows in the label-encoded space, so every batch is one-hot
# encoded with ct before it is scored by the model.
predict_fn = lambda x: model_xgb.predict_proba(ct.transform(x)).astype(float)

# Every column of X_train is a label-encoded categorical feature.
categorical_idx = list(range(X_train.shape[1]))
# LimeTabularExplainer looks category names up by column position, so the
# name-keyed dict is re-keyed by index; with string keys the lookup never
# matches and explanations show integer codes instead of the labels.
categorical_names_by_idx = {idx: categorical_names[col]
                            for idx, col in enumerate(X_train.columns)}

explainer_lime = LimeTabularExplainer(X_train.values, class_names=['non - VOC', 'VOC'],
                                     feature_names=features,
                                     categorical_features=categorical_idx,
                                     categorical_names=categorical_names_by_idx, kernel_width=3,
                                     random_state=143,
                                     discretize_continuous=True)

# Explain five test tickets (positions 5-9 of the test split).
indices = X_test.index.tolist()[5:10]
for i in indices:
    print(i)
    # Pass a plain 1-D array: LIME indexes the row by position, which is not
    # reliable on a label-indexed pandas Series.
    exp = explainer_lime.explain_instance(X_test.loc[i].values, predict_fn,
                                          num_features=X_train.shape[1])
    exp.show_in_notebook(show_all=False)
    

- Feature Importance

In [None]:
# Output column names of the fitted ColumnTransformer, one per encoded column.
features_new = ct.get_feature_names_out()

In [None]:
# transform feature names to original
# get index of categorical columns
# Translate the transformer's output names back to readable labels.
# Positions of the one-hot ('cat__') and passthrough ('remainder__') columns;
# entries are None where the name does not contain the prefix.
result_cat = np.char.find([str(i) for i in ct.get_feature_names_out()], 'cat__', start=0)
idx_cat = [idx if val != -1 else None for idx, val in enumerate(result_cat)]
result_numeric = np.char.find([str(i) for i in ct.get_feature_names_out()], 'remainder__', start=0)
idx_numeric = [idx if val != -1 else None for idx, val in enumerate(result_numeric)]

categorical_cols = ['Analytic Genre Tags', 'Content Category Tags', 'Content Type Tags',
                    'Dev. Type ID Tags', 'Dev. Type Name Tags', 'Error Tags', 'Hashtags',
                    'MoP Tags', 'Show Tags', 'Title ID Tags', 'Ticket ID']
numeric_cols = []

# Build each expected encoded name directly and test membership in a set,
# instead of scanning every encoded column for every (column, label) pair.
encoded_names = set(features_new)
new_cols_dict = {}

# One-hot columns are named 'cat__<column>_<integer code>'; map them to
# '<column>=<original label>'.
prefix = 'cat__'
for col in categorical_cols:
    for key, label in label_encoders_mapping[col].items():
        encoded = prefix + str(col) + '_' + str(label)
        if encoded in encoded_names:
            # str(key): original labels are not guaranteed to be strings
            new_cols_dict[encoded] = col + '=' + str(key)

# Passthrough columns are named 'remainder__<column>'; map them to '<column>'.
prefix = 'remainder__'
for col in numeric_cols:
    encoded = prefix + str(col)
    if encoded in encoded_names:
        new_cols_dict[encoded] = col

In [None]:
# Readable label for every encoded column; fall back to the encoded name when
# no mapping exists so an unmapped column cannot raise a KeyError.
cols_name = [new_cols_dict.get(feature, feature) for feature in features_new]

# Importances are read once, outside any loop, and ranked by absolute value.
coefs = model_xgb.feature_importances_
abs_coefs = np.abs(coefs)
top_predictor = cols_name[np.argmax(abs_coefs)]

# Indices of the 20 largest importances, in ascending order, so the strongest
# feature ends up at the top of the horizontal bar chart.
top_n = 20
top_idx = np.argsort(abs_coefs)[-top_n:]
bar_positions = np.arange(len(top_idx))

coefs_count = len(abs_coefs)
fig, ax = plt.subplots(figsize=(10, 15))
bars = ax.barh(bar_positions, abs_coefs[top_idx], color='orange')
ax.set_yticks(bar_positions)
ax.set_yticklabels(np.array(cols_name)[top_idx], fontsize=14)

ax.bar_label(bars, label_type='center', fmt='%.2f')
ax.set_title('Important Features with weights', fontsize=16)
plt.xticks(fontsize=14)
plt.show()


- SHAP

In [None]:
# Dense, labelled copy of the encoded test split for SHAP.
X_encoded = ct.transform(X_test)
df_encoded_test = pd.DataFrame(X_encoded.toarray(), columns=cols_name)

explainer = shap.TreeExplainer(model_gb,
                               feature_names=cols_name)
shap_values = explainer.shap_values(df_encoded_test)

# Summarize the effect of every feature on the model output.
# plot_size=None keeps the figure size chosen here (the default 'auto' would
# resize it), and show=False keeps the figure open so tight_layout() is
# applied to the SHAP plot before it is displayed.
plt.figure(figsize=(10, 7))
shap.summary_plot(shap_values=shap_values,
                  features=df_encoded_test,
                  plot_size=None,
                  show=False)
plt.tight_layout()
plt.show()