```python
import matplotlib.pyplot as plt
import string
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.sentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
```
## Naive Bayes
```python
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, cohen_kappa_score, f1_score, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.naive_bayes import MultinomialNB

nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
```
[nltk_data] Downloading package vader_lexicon to
[nltk_data] /Users/zhu/nltk_data...
[nltk_data] Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/zhu/nltk_data...
[nltk_data] Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zhu/nltk_data...
[nltk_data] Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/zhu/nltk_data...
[nltk_data] Package omw-1.4 is already up-to-date!
True
```python
# RELOAD FILE
df=pd.read_csv('cleaned_tweets.csv')
print(df.shape)

# CONVERT FROM STRING LABELS TO INTEGERS
y1=[]; #y1=[]; y2=[]
labels=[]
for label in df["label"]:
    if label not in labels:
        labels.append(label)
        print("index =",len(labels)-1,": label =",label)
    for i in range(0,len(labels)):
        if(label==labels[i]):
            y1.append(i)
y1=np.array(y1)

# CONVERT DF TO LIST OF STRINGS
corpus=df["tweets"].to_list()

print("number of text chunks = ",len(corpus))
print(corpus[0:3])
```
(10029, 3)
index = 0 : label = 0
index = 1 : label = 1
number of text chunks = 10029
[' no sé si das más pena tú o tu terrible ortografía\x85 pinche naca', ' cuando todo se va al infierno la gente que está a tu lado sin vacilar es tu familia', ' en 1800 nace natturner el esclavo rebelde del sur de estadosunidos ']
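As an aside, the index-assignment loop above mirrors scikit-learn's `LabelEncoder`. A minimal sketch of the equivalence (`LabelEncoder` is an addition here, not used elsewhere in this notebook; `df` and `y1` are the objects built above):

```python
from sklearn.preprocessing import LabelEncoder

# LabelEncoder assigns integer indices by sorted class value, whereas the
# loop above indexes by order of first appearance; with labels 0 and 1
# (and 0 appearing first, per the printed output) the two mappings agree.
le = LabelEncoder()
y_alt = le.fit_transform(df["label"])
print(le.classes_)           # [0 1]
print((y_alt == y1).all())   # True for this labeling
```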
```python
# INITIALIZE COUNT VECTORIZER
# min_df = 0.01 means "ignore terms that appear in less than 1% of the documents"
# min_df = 5 means "ignore terms that appear in less than 5 documents"
vectorizer=CountVectorizer(min_df=0.001)

# RUN COUNT VECTORIZER ON OUR CORPUS
Xs = vectorizer.fit_transform(corpus)
X=np.array(Xs.todense())

# CONVERT TO ONE-HOT VECTORS
maxs=np.max(X,axis=0)
X=np.ceil(X/maxs)

# DOUBLE CHECK
print(X.shape,y1.shape)
```
(10029, 1113) (10029,)
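A quick way to see what the `min_df` cutoff is doing is to compare vocabulary sizes at a few settings. This sketch reuses the `corpus` list built above; the counts it prints are data-dependent, apart from the 1113 terms at `min_df=0.001` confirmed by `X.shape`:

```python
from sklearn.feature_extraction.text import CountVectorizer

# Fractional min_df is a share of documents; an integer is an absolute count.
for cutoff in [1, 5, 0.001, 0.01]:
    vec = CountVectorizer(min_df=cutoff)
    vec.fit(corpus)
    print("min_df =", cutoff, "-> vocabulary size:", len(vec.vocabulary_))
```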
```python
num_rows_keep=250
index=np.sort(np.random.choice(X.shape[0], num_rows_keep, replace=False))
# print(y1[index])
# print(index)
tmp1=X[index, :]
# print(tmp1.shape,tmp1.dtype,tmp1[:,].shape)

# COMPUTE DISTANCE MATRIX
dij=[]

# LOOP OVER ROWS
for i in range(0,tmp1.shape[0]):
    tmp2=[]
    # LOOP OVER COLUMNS
    for j in range(0,tmp1.shape[0]):
        # EXTRACT VECTORS
        vi=tmp1[i,:]
        vj=tmp1[j,:]
        #print(vi.shape,vj.shape)

        # COMPUTE DISTANCES
        # NOTE: computed before the zero-vector check below, which is what
        # triggers the RuntimeWarning in the output when a row is all zeros
        dist=np.dot(vi, vj)/(np.linalg.norm(vi)*np.linalg.norm(vj)) #cosine sim
        #dist=np.linalg.norm(vi-vj) #euclidean

        # BUILD DISTANCE MATRIX
        if(i==j or np.max(vi)==0 or np.max(vj)==0):
            tmp2.append(0)
        else:
            tmp2.append(dist) #print(dij)
    dij.append(tmp2) # raise
dij=np.array(dij)

# normalize
# dij=(dij-np.min(dij))/(np.max(dij)-np.min(dij))
# lower triangle of an array
# dij=np.sort(dij,axis=0)
# dij=np.sort(dij,axis=1)
# dij=np.tril(dij, k=-1)

import seaborn as sns
# sns.heatmap(np.exp(dij), annot=False) #, linewidths=.05)
heatmap = sns.heatmap(dij, annot=False) #, linewidths=.05)
heatmap.figure.savefig("heatmap.png")
print(dij.shape)
print(dij)
```
/var/folders/kw/y1wfcqhd7nz_cc11ydrrc7cr0000gn/T/ipykernel_89460/1203232167.py:23: RuntimeWarning: invalid value encountered in double_scalars
dist=np.dot(vi, vj)/(np.linalg.norm(vi)*np.linalg.norm(vj)) #cosine sim
(250, 250)
[[0. 0. 0. ... 0. 0. 0. ]
[0. 0. 0.11111111 ... 0. 0.28867513 0.16666667]
[0. 0.11111111 0. ... 0.40201513 0.38490018 0. ]
...
[0. 0. 0.40201513 ... 0. 0.43519414 0. ]
[0. 0.28867513 0.38490018 ... 0.43519414 0. 0. ]
[0. 0.16666667 0. ... 0. 0. 0. ]]
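The nested loop above is quadratic in pure Python, and because `dist` is computed before the zero-vector check it emits the RuntimeWarning shown in the output. A vectorized sketch using scikit-learn's `cosine_similarity` (which leaves all-zero rows at similarity 0 rather than NaN) produces the same matrix up to the diagonal convention:

```python
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Pairwise cosine similarity over the 250 sampled rows in one call.
sim = cosine_similarity(tmp1)

# Match the loop's convention of setting d(i, i) = 0 on the diagonal.
np.fill_diagonal(sim, 0)
print(sim.shape)  # (250, 250)
```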
```python
from sklearn.decomposition import PCA

# COMPUTE PCA WITH 10 COMPONENTS
pca = PCA(n_components=10)
pca.fit(X)
print(pca.explained_variance_ratio_)
print(pca.singular_values_)

# GET PRINCIPAL COMPONENT PROJECTIONS
principal_components = pca.fit_transform(X)
df2 = pd.DataFrame(data = principal_components) #, columns = ['PC1','PC2','PC3','PC4','PC5']
df3 = pd.concat([df2, df['label']], axis=1)

# FIRST TWO COMPONENTS
sns.scatterplot(data=df2, x=0, y=1, hue=df["label"])
plt.show()

# 3D PLOT
ax = plt.axes(projection='3d')
ax.scatter3D(df2[0], df2[1], df2[2], c=y1)
plt.show()

# PAIRPLOT
tweet_pairplot = sns.pairplot(data=df3, hue="label")
tweet_pairplot.figure.savefig("tweet_pairplot.png")
```
[0.05152696 0.03284952 0.02594211 0.02397741 0.02249751 0.02129862
0.01943791 0.01824908 0.01569624 0.01521384]
[62.62769714 50.00502732 44.43772559 42.72187939 41.3824672 40.26473864
38.46572407 37.27088324 34.56583461 34.03051514]
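The ratios printed above sum to only about 0.25, i.e. ten components capture roughly a quarter of the variance in this 1113-dimensional one-hot space. A cumulative sum (a small addition, reusing the fitted `pca` object) makes that explicit:

```python
import numpy as np

# Running total of explained variance over the first k components.
cumvar = np.cumsum(pca.explained_variance_ratio_)
print(cumvar)  # last entry ~0.247 for the ratios printed above
```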
```python
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.metrics import accuracy_score

# SPLIT THE DATASET INTO TRAIN AND TEST
df_train, df_test = train_test_split(df, test_size=0.20)
```
```python
# INITIALIZE COUNT VECTORIZER
# min_df = 0.01 means "ignore terms that appear in less than 1% of the documents"
# min_df = 5 means "ignore terms that appear in less than 5 documents"
vectorizer=CountVectorizer(min_df=0.001)

# RUN COUNT VECTORIZER ON OUR CORPUS
Xs = vectorizer.fit_transform(corpus)
X=np.array(Xs.todense())

# CONVERT TO ONE-HOT VECTORS
maxs=np.max(X,axis=0)
X=np.ceil(X/maxs)

# DOUBLE CHECK
print(X.shape,y1.shape)
print("DATA POINT-0:",X[0,0:10],"y1 =",y1[0])
```
(10029, 1113) (10029,)
DATA POINT-0: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.] y1 = 0
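As an aside, the max-normalize-and-ceil step is just presence/absence binarization, and `CountVectorizer` can do it directly via its `binary=True` option. A sketch of the shortcut (an alternative, not what the notebook actually runs):

```python
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

# binary=True emits 0/1 term-presence indicators directly, replacing
# the np.ceil(X/maxs) step above.
vec = CountVectorizer(min_df=0.001, binary=True)
Xb = np.array(vec.fit_transform(corpus).todense())
print(Xb.shape)  # same (10029, 1113) matrix as X above
```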
```python
print(y1[300:400])
```
[0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 1 0 1 1 0 0 0 0
0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
```python
from sklearn.model_selection import train_test_split

test_ratio=0.2

# SPLIT ARRAYS OR MATRICES INTO RANDOM TRAIN AND TEST SUBSETS
x_train, x_test, y_train, y_test = train_test_split(X, y1, test_size=test_ratio, random_state=0)
y_train=y_train.flatten()
y_test=y_test.flatten()

print("x_train.shape :",x_train.shape)
print("y_train.shape :",y_train.shape)
print("X_test.shape :",x_test.shape)
print("y_test.shape :",y_test.shape)
```
x_train.shape : (8023, 1113)
y_train.shape : (8023,)
X_test.shape : (2006, 1113)
y_test.shape : (2006,)
```python
# CHECK TO MAKE SURE IT WAS RANDOMIZED
print(y_train[0:100])
```
[0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 0 1
1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 0 1 0 0 1 1 1 0 0 1
1 0 1 0 0 0 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 1 0 0 1 0]
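`StratifiedKFold` was imported at the top but never used. If the 0/1 class balance should be preserved in each split, a stratified version of the split above would look like this sketch (reusing `X` and `y1` as built above):

```python
from sklearn.model_selection import StratifiedKFold, train_test_split

# Single stratified split: both subsets keep the full data's label proportions.
x_tr, x_te, y_tr, y_te = train_test_split(
    X, y1, test_size=0.2, stratify=y1, random_state=0)

# Or 5-fold stratified cross-validation over the same arrays.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for train_idx, test_idx in skf.split(X, y1):
    pass  # fit and score a model on X[train_idx] / X[test_idx] here
```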
```python
def report(y,ypred):
    # ACCURACY COMPUTE
    print("Accuracy:",accuracy_score(y, ypred)*100)
    print("Number of mislabeled points out of a total %d points = %d"
          % (y.shape[0], (y != ypred).sum()))

def print_model_summary():
    # LABEL PREDICTIONS FOR TRAINING AND TEST SET
    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)

    print("ACCURACY CALCULATION\n")
    print("TRAINING SET:")
    report(y_train,yp_train)
    print("\nTEST SET (UNTRAINED DATA):")
    report(y_test,yp_test)

    print("\nCHECK FIRST 20 PREDICTIONS")
    print("TRAINING SET:")
    print(y_train[0:20])
    print(yp_train[0:20])
    print("ERRORS:",yp_train[0:20]-y_train[0:20])
    print("\nTEST SET (UNTRAINED DATA):")
    print(y_test[0:20])
    print(yp_test[0:20])
    print("ERRORS:",yp_test[0:20]-y_test[0:20])
```
```python
from sklearn.naive_bayes import MultinomialNB

# INITIALIZE MODEL
model = MultinomialNB()

# TRAIN MODEL
model.fit(x_train,y_train)

# PRINT REPORT USING UTILITY FUNCTION ABOVE
print_model_summary()
```
ACCURACY CALCULATION
TRAINING SET:
Accuracy: 82.53770410071046
Number of mislabeled points out of a total 8023 points = 1401
TEST SET (UNTRAINED DATA):
Accuracy: 77.96610169491525
Number of mislabeled points out of a total 2006 points = 442
CHECK FIRST 20 PREDICTIONS
TRAINING SET:
[0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1]
[1 0 0 1 0 0 1 1 1 1 1 1 0 1 1 0 0 0 0 1]
ERRORS: [ 1 0 -1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0]
TEST SET (UNTRAINED DATA):
[0 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0]
[0 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0 0 0 0 0]
ERRORS: [ 0 0 0 0 0 0 0 1 0 0 0 0 0 -1 1 0 -1 0 -1 0]
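`cohen_kappa_score`, `f1_score`, and `classification_report` were imported at the top but never called. A sketch applying them to the trained Naive Bayes model's test predictions rounds out the accuracy numbers above:

```python
from sklearn.metrics import classification_report, cohen_kappa_score, f1_score

# Per-class precision/recall/F1 plus chance-corrected agreement on the test set.
yp_test = model.predict(x_test)
print(classification_report(y_test, yp_test))
print("macro F1     :", f1_score(y_test, yp_test, average="macro"))
print("Cohen's kappa:", cohen_kappa_score(y_test, yp_test))
```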
Below is the KNN classification method.

## Classification model-2: KNN

## Hyper-parameter tuning
```python
from sklearn.neighbors import KNeighborsClassifier

# HYPER-PARAMETER SEARCH FOR OPTIMAL NUMBER OF NEIGHBORS
num_neighbors=[]
train_accuracy=[]
test_accuracy=[]

# LOOP OVER HYPER-PARAM
for i in range(1,40):
    # INITIALIZE MODEL
    model = KNeighborsClassifier(n_neighbors=i)

    # TRAIN MODEL
    model.fit(x_train,y_train)

    # LABEL PREDICTIONS FOR TRAINING AND TEST SET
    yp_train = model.predict(x_train)
    yp_test = model.predict(x_test)

    print("n_neighbors =",i)
    acc1=accuracy_score(y_train, yp_train)*100
    acc2=accuracy_score(y_test, yp_test)*100

    num_neighbors.append(i)
    train_accuracy.append(acc1)
    test_accuracy.append(acc2)
    print(" train accuracy:",acc1)
    print(" test accuracy:",acc2)
```
n_neighbors = 1
train accuracy: 99.10258008226349
test accuracy: 68.79361914257228
n_neighbors = 2
train accuracy: 86.0650629440359
test accuracy: 66.6999002991027
n_neighbors = 3
train accuracy: 84.45718559142466
test accuracy: 67.84646061814557
n_neighbors = 4
train accuracy: 81.78985416926336
test accuracy: 68.39481555333998
n_neighbors = 5
train accuracy: 79.43412688520503
test accuracy: 68.19541375872383
n_neighbors = 6
train accuracy: 79.6834101956874
test accuracy: 69.0927218344965
n_neighbors = 7
train accuracy: 77.6268228842079
test accuracy: 69.54137587238284
n_neighbors = 8
train accuracy: 77.61435871868379
test accuracy: 69.39182452642075
n_neighbors = 9
train accuracy: 76.20590801445843
test accuracy: 68.39481555333998
n_neighbors = 10
train accuracy: 76.71693880094728
test accuracy: 68.74376869391826
n_neighbors = 11
train accuracy: 74.69774398604014
test accuracy: 68.64406779661016
n_neighbors = 12
train accuracy: 75.28355976567369
test accuracy: 69.69092721834497
n_neighbors = 13
train accuracy: 74.09946404088247
test accuracy: 68.6939182452642
n_neighbors = 14
train accuracy: 75.03427645519133
test accuracy: 69.0927218344965
n_neighbors = 15
train accuracy: 74.0620715443101
test accuracy: 68.49451645064806
n_neighbors = 16
train accuracy: 74.5357098342266
test accuracy: 69.79062811565304
n_neighbors = 17
train accuracy: 73.23943661971832
test accuracy: 68.9431704885344
n_neighbors = 18
train accuracy: 74.33628318584071
test accuracy: 69.89032901296112
n_neighbors = 19
train accuracy: 73.15218746104948
test accuracy: 68.29511465603191
n_neighbors = 20
train accuracy: 73.65075408201422
test accuracy: 69.59122632103688
n_neighbors = 21
train accuracy: 72.71594166770534
test accuracy: 68.09571286141576
n_neighbors = 22
train accuracy: 73.5011840957248
test accuracy: 69.69092721834497
n_neighbors = 23
train accuracy: 72.65362084008476
test accuracy: 68.54436689930209
n_neighbors = 24
train accuracy: 73.23943661971832
test accuracy: 69.740777666999
n_neighbors = 25
train accuracy: 72.67854917113299
test accuracy: 68.99302093718843
n_neighbors = 26
train accuracy: 73.88757322697246
test accuracy: 69.59122632103688
n_neighbors = 27
train accuracy: 72.77826249532595
test accuracy: 68.44466600199401
n_neighbors = 28
train accuracy: 73.60089741991773
test accuracy: 68.99302093718843
n_neighbors = 29
train accuracy: 72.57883584694005
test accuracy: 68.19541375872383
n_neighbors = 30
train accuracy: 73.05247413685653
test accuracy: 69.0927218344965
n_neighbors = 31
train accuracy: 72.15505421912003
test accuracy: 68.34496510468594
n_neighbors = 32
train accuracy: 72.7533341642777
test accuracy: 68.19541375872383
n_neighbors = 33
train accuracy: 72.0678050604512
test accuracy: 68.19541375872383
n_neighbors = 34
train accuracy: 72.45419419169886
test accuracy: 69.0927218344965
n_neighbors = 35
train accuracy: 71.61909510158296
test accuracy: 67.79661016949152
n_neighbors = 36
train accuracy: 72.47912252274709
test accuracy: 68.9431704885344
n_neighbors = 37
train accuracy: 71.1703851427147
test accuracy: 68.34496510468594
n_neighbors = 38
train accuracy: 71.93069923968591
test accuracy: 68.39481555333998
n_neighbors = 39
train accuracy: 70.77153184594292
test accuracy: 68.04586241276172
```python
# CONVERGENCE PLOT
plt.plot(num_neighbors,train_accuracy,linewidth=2, color='k')
plt.plot(num_neighbors,test_accuracy,linewidth=2, color='b')
plt.xlabel("Number of neighbors in KNN")
plt.ylabel("Training (black) and test (blue) accuracy")
plt.savefig("convergence_plot.png")
```
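Rather than reading the optimum off the plot, the sweep's best k by raw test accuracy can be pulled from the lists directly; by the printed results that is n_neighbors = 18 (~69.9%), while n_neighbors = 30, used below, sits on the same flat stretch of the test curve:

```python
import numpy as np

# Index of the highest test accuracy recorded during the sweep.
best = int(np.argmax(test_accuracy))
print("best n_neighbors =", num_neighbors[best],
      "test accuracy =", test_accuracy[best])
```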
```python
# RETRAIN WITH OPTIMAL HYPER-PARAMETERS
# INITIALIZE MODEL
model = KNeighborsClassifier(n_neighbors=30)

# TRAIN MODEL
model.fit(x_train,y_train)

# PRINT REPORT USING UTILITY FUNCTION ABOVE
print_model_summary()
```
ACCURACY CALCULATION
TRAINING SET:
Accuracy: 73.05247413685653
Number of mislabeled points out of a total 8023 points = 2162
TEST SET (UNTRAINED DATA):
Accuracy: 69.0927218344965
Number of mislabeled points out of a total 2006 points = 620
CHECK FIRST 20 PREDICTIONS
TRAINING SET:
[0 0 1 1 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 1]
[1 1 1 1 0 0 1 1 1 0 1 1 0 1 1 1 0 0 0 1]
ERRORS: [1 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0]
TEST SET (UNTRAINED DATA):
[0 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0]
[1 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 0]
ERRORS: [ 1 0 0 -1 -1 0 0 1 0 0 0 0 0 0 1 0 0 1 -1 0]