Random Forest

Implementing a Random Forest Classifier model, following the Decision Tree conclusions and the best modelling methods described on the previous tab.

The Random Forest Classifier builds multiple decision trees, each from a randomly selected subset of the training set. To make a prediction, the model collects the “votes” of the individual decision trees and uses them to decide the final prediction. We can see each small tree as an expert on its own subset.

Code

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, recall_score
from sklearn.metrics import confusion_matrix
from tabnanny import check
from nltk.tokenize import TweetTokenizer
import re
import pandas as pd
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.style
matplotlib.style.use('seaborn-pastel')
from nltk.tokenize import RegexpTokenizer
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.probability import FreqDist
from nltk.stem import PorterStemmer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

Code

# Load the survey extract. low_memory=False lets pandas infer each column's
# dtype from the whole file instead of chunk by chunk, which is the remedy the
# DtypeWarning for the mixed-type column suggests.
df = pd.read_csv('data/endireh_ev.csv', encoding='latin1', low_memory=False)
print(df.shape)

# Drop constant columns (exactly one distinct value). Indexing with .loc
# returns a new frame, so the result has to be assigned back to take effect.
df = df.loc[:, df.nunique() != 1]

# Treat infinities as missing, then drop every column that still contains a
# missing value. replace()/dropna() also return new frames, hence the
# assignment.
# NOTE(review): previously these results were discarded, so no column was ever
# removed. With the assignment in place, any column with at least one NaN is
# dropped -- confirm this is the intended cleaning before re-running the models.
df = df.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
print(df.shape)

df.head(5)

C:\Users\valer\AppData\Local\Temp\ipykernel_11604\887091855.py:1: DtypeWarning: Columns (188) have mixed types. Specify dtype option on import or set low_memory=False.
  df = pd.read_csv('data/endireh_ev.csv', encoding='latin1')

(73500, 354)

(73500, 354)

	P12_1	P12_2	P12_3	P12_4	P12_5	P12_6	P12_7	P12_8	P12_9	P12_10	...	P4_13_4	P4_13_5	P4_13_6	P4_13_7	FAC_VIV_y.1	FAC_MUJ_y.1	ESTRATO_y.1	UPM_DIS_y.1	EST_DIS_y.1	label
0	2	1	2	3	1	3	3	8.0	8.0	3.0	...	1.0	NaN	NaN	NaN	113	113	4	1	3	1.0
1	1	1	3	3	3	1	3	1.0	2.0	4.0	...	5.0	NaN	NaN	NaN	113	113	4	1	3	0.0
2	2	1	1	3	3	3	1	3.0	8.0	1.0	...	2.0	NaN	NaN	NaN	78	155	2	2	1	0.0
3	1	1	3	1	1	1	1	1.0	2.0	3.0	...	1.0	NaN	NaN	NaN	78	78	2	2	1	0.0
4	1	1	4	3	3	3	3	8.0	8.0	3.0	...	5.0	NaN	NaN	NaN	78	78	2	2	1	0.0

5 rows × 354 columns

Code

# Check the balance of the classes.
def check_balance(df, label_col="label"):
    """Print how many rows belong to each class.

    Args:
        df: Data frame that contains the class column.
        label_col: Name of the class column. Defaults to ``"label"``, so
            existing calls keep their behaviour.

    Returns:
        None. The value counts are written to standard output.
    """
    print(df[label_col].value_counts())
    
# Print the number of rows per class of the loaded data frame.
check_balance(df)

0.0    54940
1.0    18560
Name: label, dtype: int64

Code

from sklearn.feature_selection import VarianceThreshold

# Target vector and feature matrix.
y = df["label"]
X = df.drop("label", axis=1)

# Keep numeric features only. The object-dtype columns are looked up on X
# itself, so the target column can never end up in the drop list.
categorical = X.select_dtypes(include=['object']).columns.tolist()
X = X.drop(categorical, axis=1)
# Replace the remaining missing values with 0.
X = X.fillna(0)

# Standardise every feature and keep the column names.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the transformation. Tree-based models are
# presumably insensitive to this, but fit on the training split only if these
# features are reused for scale-sensitive models.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.to_numpy())
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

# NOTE(review): after standardisation every non-constant column has unit
# variance, so this threshold can only remove constant columns (the two shapes
# printed below come out identical). Apply the selector to the unscaled data if
# low-variance features are meant to be filtered.
selector = VarianceThreshold(threshold=.001)
X_vr = selector.fit_transform(X_scaled)

print(X_scaled.shape)
print(X_vr.shape)

# A fixed seed makes the split (and every metric computed from it)
# reproducible; stratifying on y keeps the class ratio of the unbalanced label
# identical in the training and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_vr, y, random_state=0, stratify=y
)

(73500, 343)
(73500, 343)

Code

from sklearn.ensemble import RandomForestClassifier
# Baseline forest limited to trees of depth 2; random_state=0 fixes the seed.
model = RandomForestClassifier(max_depth=2, random_state=0)

Starting to test the model with a small max_depth (2)

Code

# Train the forest on the training split.
model.fit(X_train, y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Code

# Predict the labels of the test split with the fitted model.
y_pred = model.predict(X_test)

Code

# Build the classification report as a data frame and print it.
from sklearn.metrics import classification_report, confusion_matrix


def print_report(y_test, y_pred):
    """Print the classification report as a table.

    Args:
        y_test: True labels.
        y_pred: Labels predicted by the model.

    Returns:
        None. The table is written to standard output.
    """
    report = classification_report(y_test, y_pred, output_dict=True)
    # Transpose so that each class / average becomes a row and each metric
    # a column.
    report_table = pd.DataFrame(report).T
    print(report_table)


print_report(y_test, y_pred)

              precision    recall  f1-score       support
0.0            0.935919  0.994538  0.964339  13731.000000
1.0            0.980180  0.798665  0.880161   4644.000000
accuracy       0.945034  0.945034  0.945034      0.945034
macro avg      0.958050  0.896601  0.922250  18375.000000
weighted avg   0.947106  0.945034  0.943064  18375.000000

Code

# Train one forest per max_depth value from 2 through 8 and record the
# accuracy each one reaches on the test split.
j = [*range(2, 9)]
accuracy = []
for i in j:
    # fit() returns the fitted estimator itself, so the calls can be chained.
    model = RandomForestClassifier(random_state=0, max_depth=i).fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    accuracy.append(acc)
    print(acc)

0.9450340136054421

0.9703401360544218

0.9947755102040816

0.9988027210884354

0.9997278911564625

0.9998367346938776

1.0

Even with a small number of layers, the accuracy improved in comparison with the single decision tree implementation.

Code

# Plot the recorded test accuracy against the max_depth values.
plt.subplots(nrows=1, figsize=(10, 10))
plt.plot(j, accuracy, marker="o", color="red", label="Test score")

plt.title("Learning Curve")
plt.xlabel("Number of layers in Decision Tree (max_depth)", fontsize=16)
plt.ylabel("Accuracy", fontsize=16)
plt.legend(loc="best")
plt.show()

From the plot we can see that the max depth parameter behaves similarly to the single decision tree we trained before, so we are keeping the value of 7 for our best model.

Code

# Refit the forest with max_depth=7, predict the test split and print the
# per-class metrics.
model = RandomForestClassifier(random_state=0, max_depth=7).fit(X_train, y_train)
y_pred = model.predict(X_test)
print_report(y_test, y_pred)

              precision    recall  f1-score       support
0.0            1.000000  0.999782  0.999891  13731.000000
1.0            0.999354  1.000000  0.999677   4644.000000
accuracy       0.999837  0.999837  0.999837      0.999837
macro avg      0.999677  0.999891  0.999784  18375.000000
weighted avg   0.999837  0.999837  0.999837  18375.000000

Code

# plot_confusion_matrix was removed in scikit-learn 1.2 and was not used in
# this cell; importing it raises ImportError on current releases. Import the
# names the cell actually uses instead.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix


def plot_cm(cm):
    """Draw a confusion matrix on a 10x8 inch figure and show it.

    Args:
        cm: Confusion matrix to display, e.g. the array returned by
            ``sklearn.metrics.confusion_matrix``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    disp = ConfusionMatrixDisplay(confusion_matrix=cm)

    fig, ax = plt.subplots(figsize=(10, 8))
    # NOTE(review): disp.plot() below is asked for vertical x tick labels,
    # which presumably overrides this 45-degree rotation -- confirm and keep
    # only one of the two settings.
    ax.tick_params(axis='x', labelrotation=45)
    disp.plot(ax=ax, xticks_rotation='vertical')
    plt.show()


# Confusion matrix of the test-split predictions.
cm_test = confusion_matrix(y_test, y_pred)
plot_cm(cm_test)

After looking at the results we can conclude that the Random Forest Classifier outperformed a single Decision Tree after training on our data, obtaining an overall accuracy of 99.98%.

Even though the accuracy is close to 100%, we should evaluate the performance of the model carefully, because it might be overfitting. Looking closely at the results, especially the confusion matrix, we can observe that the model misclassifies a few negative samples as positive (false positives), which is why the recall for the negative class is not 100%.

Analyzing social data is not easy, especially on sensitive topics like violence. Training on our data produces a model that is not always capable of detecting when a woman has not experienced emotional violence; this was unexpected behavior, since we have an unbalanced set with more data points labeled “0”, so we would have expected the model to overfit to that class.

As next steps, we would bring back the data we dropped to test the model and see how it behaves. It would also be interesting to tune the model so that it is not biased towards a specific class, or to understand whether certain features are actually causing this behavior.