This project performs sentiment analysis to classify a Reddit comment's political affiliation into one of four classes: Left, Center, Right, and Alt. The data was scraped with the Pushshift API. The table below lists the comment count of each data file and the sources of the comments:
The raw comments, as given, are not in a form amenable to feature extraction for classification – there is too much 'noise'. The first step is therefore to clean the comment text, which includes part-of-speech tagging, lemmatization, and sentence segmentation.
import os
import json
import re
import spacy
import html
import string
import csv
import time
import numpy as np
from print_schema import print_schema
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
sentencizer = nlp.add_pipe("sentencizer")
def clean_comment(comment):
    ''' This function pre-processes a single comment

    Parameters:
        comment : string, the body of a comment
    Returns:
        modComm : string, the modified comment
    '''
    modComm = comment
    # replace newlines, tabs, and carriage returns with a single space
    modComm = re.sub(r"[\n\t\r]+", " ", modComm)
    # unescape HTML entities
    modComm = html.unescape(modComm)
    # remove URLs
    modComm = re.sub(r"(http|www)\S+", "", modComm)
    # remove duplicate spaces
    modComm = re.sub(' +', ' ', modComm)

    # get a spaCy document for modComm and use it to build a string:
    #   - insert "\n" between sentences
    #   - separate tokens with spaces
    #   - write "/POS" after each token
    utt = nlp(modComm)
    text = ""
    for sent in utt.sents:
        for i, token in enumerate(sent):
            # Replace the token with its lemma, e.g. words/NNS becomes word/NNS.
            # If the lemma begins with a dash ('-') but the token does not
            # (e.g. -PRON- for "I"), keep the original token instead.
            first = token.lemma_
            if token.lemma_.startswith("-") and not token.text.startswith("-"):
                first = token.text
            # Retain the case of the original token when performing this
            # replacement. We make two distinctions here: if the original token
            # is entirely uppercase, then so is the lemma; otherwise, keep the
            # lemma in lowercase.
            first = first.lower()
            if token.text.isupper():
                first = first.upper()
            second = token.tag_
            text += f"{first}/{second}"
            if i < len(sent) - 1:
                text += " "
        text += "\n"
    modComm = text
    return modComm
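A quick sanity check of the cleaner on a made-up comment. This is only a sketch of the expected output shape; the exact lemmas and tags depend on the spaCy model version:

# Example input; the URL should be stripped and each sentence ends with "\n".
sample = "The dogs were barking loudly. See https://example.com for details."
print(clean_comment(sample))
# Roughly (tags may vary with the model version):
# the/DT dog/NNS be/VBD bark/VBG loudly/RB ./.
# see/VB for/IN detail/NNS ./.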
allOutput = []
for subdir, dirs, files in os.walk("data"):
    for file in files:
        fullFile = os.path.join(subdir, file)
        print("Processing " + fullFile)
        data = json.load(open(fullFile))
        # process each line (each element is a JSON-encoded comment)
        for line in data:
            j = json.loads(line)
            # if the comment is deleted or removed, treat it as an empty string
            new_body = j["body"]
            if j["body"] == "[deleted]" or j["body"] == "[removed]":
                new_body = ""
            # clean each comment
            new_body = clean_comment(new_body)
            # append to the final output; the file name is the class label
            new_output = {
                "id": j["id"],
                "body": new_body,
                "cat": file
            }
            allOutput.append(new_output)
print_schema(allOutput)
For classifying political opinions, I want to extract features that are relevant to bias detection. Several of these features involve counting tokens based on their tags. For example, counting the number of adverbs in a comment involves counting the number of tokens that have been tagged as RB, RBR, or RBS.
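For instance, counting adverbs in a preprocessed comment reduces to scanning the /TAG suffixes. A minimal sketch over the word/TAG format produced above:

# Count adverbs (RB, RBR, RBS) in a preprocessed comment.
tagged = "she/PRP spoke/VBD very/RB quickly/RB ./.\n"
tokens = tagged.replace("\n", " ").split()
n_adverbs = sum(1 for t in tokens if t.rsplit("/", 1)[-1] in {"RB", "RBR", "RBS"})
print(n_adverbs)  # 2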
The features also include norm sets. Lexical norms are aggregate subjective scores assigned to words by a large group of individuals; each type of norm maps a word to a numerical value. The Bristol, Gilhooly, and Logie set covers age-of-acquisition, imagery, concreteness, familiarity, and ambiguity measures for 1,944 words of varying length and frequency of occurrence. Similarly, the Warriner norm set collects affective norms of valence (the pleasantness of a stimulus), arousal (the intensity of emotion provoked by a stimulus), and dominance (the degree of control exerted by a stimulus) for 13,915 English words (lemmas).
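A sketch of how one norm feature is computed, assuming NORMS_W (defined and populated below) maps a word to its (V, A, D) values as strings:

# Mean valence of the words in a comment that appear in the Warriner norms.
words = ["happy", "storm", "table"]
vals = [float(NORMS_W[w][0]) for w in words if w in NORMS_W]
mean_valence = np.mean(vals) if vals else 0.0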
# wordlists.
FIRST_PERSON_PRONOUNS = {
    'i', 'me', 'my', 'mine', 'we', 'us', 'our', 'ours'}
SECOND_PERSON_PRONOUNS = {
    'you', 'your', 'yours', 'u', 'ur', 'urs'}
THIRD_PERSON_PRONOUNS = {
    'he', 'him', 'his', 'she', 'her', 'hers', 'it', 'its', 'they', 'them',
    'their', 'theirs'}
FUTURE_TENSE = {'\'ll', 'will', 'gonna'}
SLANG = {
    'smh', 'fwb', 'lmfao', 'lmao', 'lms', 'tbh', 'rofl', 'wtf', 'bff',
    'wyd', 'lylc', 'brb', 'atm', 'imao', 'sml', 'btw', 'bw', 'imho', 'fyi',
    'ppl', 'sob', 'ttyl', 'imo', 'ltr', 'thx', 'kk', 'omg', 'omfg', 'ttys',
    'afn', 'bbs', 'cya', 'ez', 'f2f', 'gtr', 'ic', 'jk', 'k', 'ly', 'ya',
    'nm', 'np', 'plz', 'ru', 'so', 'tc', 'tmi', 'ym', 'ur', 'u', 'sol', 'fml'}
NORMS_BG = {}
NORMS_W = {}
def isMultiplePuncToken(token):
    ''' Helper function to check whether a word is multi-character punctuation

    Parameters:
        token : string, a word
    Returns:
        boolean : whether the word consists entirely of punctuation characters
    '''
    if len(token) <= 1:
        return False
    for i in token:
        if i not in string.punctuation:
            return False
    return True
def extract(comment):
    ''' This function extracts features from a single comment

    Parameters:
        comment : string, the body of a comment (after preprocessing)
    Returns:
        feats : numpy array, a 29-length vector of floating point features
    '''
    # Feature 1 relies on capitalization, so extract it before lowercasing.
    # Be careful to lowercase only the word, never the tag ("Dog/NN" -> "dog/NN").
    feats = np.zeros((1, 29))
    # a list of all word/tag tokens: replace "\n" with a space, then split
    # (the trailing "" left by the final "\n" is dropped by [:-1])
    word_list = comment.replace("\n", " ").split(" ")[:-1]
    # convert to a dictionary for efficient looping
    # format: { 0: ('dog', 'NN'), ... }
    word_dict = {i: (t.rsplit("/", 1)[0], t.rsplit("/", 1)[1])
                 for i, t in enumerate(word_list) if len(t.rsplit("/", 1)) == 2}
    # start extracting features
    for key in word_dict:
        word, tag = word_dict[key][0], word_dict[key][1]
        # 1. Number of tokens in uppercase (>= 3 letters long)
        if word.isupper() and len(word) >= 3:
            feats[0][0] += 1
        # lowercase the word for the remaining features
        word = word.lower()
        # 2. Number of first-person pronouns
        if word in FIRST_PERSON_PRONOUNS:
            feats[0][1] += 1
        # 3. Number of second-person pronouns
        if word in SECOND_PERSON_PRONOUNS:
            feats[0][2] += 1
        # 4. Number of third-person pronouns
        if word in THIRD_PERSON_PRONOUNS:
            feats[0][3] += 1
        # 5. Number of coordinating conjunctions
        if tag == 'CC':
            feats[0][4] += 1
        # 6. Number of past-tense verbs
        if tag == 'VBD':
            feats[0][5] += 1
        # 7. Number of future-tense verbs ('ll, will, gonna)
        if word in FUTURE_TENSE:
            feats[0][6] += 1
        # 8. Number of commas
        if "," in word:
            feats[0][7] += 1
        # 9. Number of multi-character punctuation tokens
        if isMultiplePuncToken(word):
            feats[0][8] += 1
        # 10. Number of common nouns
        if tag == "NN" or tag == "NNS":
            feats[0][9] += 1
        # 11. Number of proper nouns
        if tag == "NNP" or tag == "NNPS":
            feats[0][10] += 1
        # 12. Number of adverbs
        if tag in ["RB", "RBR", "RBS"]:
            feats[0][11] += 1
        # 13. Number of wh- words
        if tag in ['WDT', 'WP', 'WP$', 'WRB']:
            feats[0][12] += 1
        # 14. Number of slang acronyms
        if word in SLANG:
            feats[0][13] += 1
    # 7. (continued) "going to VB" constructions also count as future tense
    feats[0][6] += len(re.compile(r"go/VBG to/TO [\w]+/VB").findall(comment))
    # 15. Average length of sentences, in tokens
    # (clean_comment ends every sentence with "\n"; guard against zero sentences)
    n_sentences = comment.count("\n")
    if n_sentences > 0:
        feats[0][14] = len(word_list) / n_sentences
    # 16. Average length of tokens, excluding punctuation-only tokens, in characters
    f16_i = 0
    f16_t = 0
    for key in word_dict:
        word = word_dict[key][0].lower()
        if word not in string.punctuation and not isMultiplePuncToken(word):
            f16_i += 1
            f16_t += len(word)
    if f16_i > 0:
        feats[0][15] = f16_t / f16_i
    # 17. Number of sentences
    feats[0][16] = n_sentences
    # Norm-feature collectors: append each word's norms to its own list so
    # the mean and standard deviation can be computed afterwards.
    AoA_ls = []
    IMG_ls = []
    FAM_ls = []
    V_ls = []
    D_ls = []
    A_ls = []
    for key in word_dict:
        word = word_dict[key][0].lower()
        if word != "":
            # Bristol, Gilhooly, and Logie norms
            if word in NORMS_BG:
                AoA_ls.append(int(NORMS_BG[word][0]))
                IMG_ls.append(int(NORMS_BG[word][1]))
                FAM_ls.append(int(NORMS_BG[word][2]))
            # Warriner norms
            if word in NORMS_W:
                V_ls.append(float(NORMS_W[word][0]))
                A_ls.append(float(NORMS_W[word][1]))
                D_ls.append(float(NORMS_W[word][2]))
    if len(AoA_ls) > 0:
        # 18. Average of AoA (100-700) from the Bristol, Gilhooly, and Logie norms
        feats[0][17] = np.mean(AoA_ls)
        # 19. Average of IMG from the Bristol, Gilhooly, and Logie norms
        feats[0][18] = np.mean(IMG_ls)
        # 20. Average of FAM from the Bristol, Gilhooly, and Logie norms
        feats[0][19] = np.mean(FAM_ls)
        # 21. Standard deviation of AoA
        feats[0][20] = np.std(AoA_ls)
        # 22. Standard deviation of IMG
        feats[0][21] = np.std(IMG_ls)
        # 23. Standard deviation of FAM
        feats[0][22] = np.std(FAM_ls)
    if len(V_ls) > 0:
        # 24. Average of V.Mean.Sum from the Warriner norms
        feats[0][23] = np.mean(V_ls)
        # 25. Average of A.Mean.Sum from the Warriner norms
        feats[0][24] = np.mean(A_ls)
        # 26. Average of D.Mean.Sum from the Warriner norms
        feats[0][25] = np.mean(D_ls)
        # 27. Standard deviation of V.Mean.Sum
        feats[0][26] = np.std(V_ls)
        # 28. Standard deviation of A.Mean.Sum
        feats[0][27] = np.std(A_ls)
        # 29. Standard deviation of D.Mean.Sum
        feats[0][28] = np.std(D_ls)
    return feats
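A quick check of the extractor on a tiny hand-tagged comment. Note the norm features stay at zero here because NORMS_BG and NORMS_W are only populated below:

v = extract("i/PRP will/MD go/VB home/NN ./.\n")
print(v.shape)           # (1, 29)
print(v[0][1], v[0][6])  # 1.0 first-person pronoun, 1.0 future-tense verb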
data = allOutput
feats = np.zeros((len(data), 30))

## fill in the containers for the norm features
# Load Bristol, Gilhooly, and Logie norms:
bg_file_path = 'Wordlists/BristolNorms+GilhoolyLogie.csv'
with open(bg_file_path, "r") as bg_file:
    reader = csv.reader(bg_file)
    for i, line in enumerate(reader):
        if i > 0:  # skip the header row
            # dict structure: { word: (AoA, IMG, FAM), ... }
            NORMS_BG[line[1]] = (line[3], line[4], line[5])
# Load Warriner norms:
w_file_path = "Wordlists/Ratings_Warriner_et_al.csv"
with open(w_file_path, "r") as w_file:
    reader = csv.reader(w_file)
    for i, line in enumerate(reader):
        if i > 0:  # skip the header row
            # dict structure: { word: (V, A, D), ... }
            NORMS_W[line[1]] = (line[2], line[5], line[8])

## extract features
class_map = {"Left": 0, "Center": 1, "Right": 2, "Alt": 3}
loop_starts_time = time.time()
for i in range(feats.shape[0]):
    body = data[i]['body']
    # the label goes in the last column; assign it before the empty-body check
    # so that skipped rows still carry the correct class
    feats[i][-1] = class_map[data[i]['cat']]
    # don't feed in any empty string (usually a deleted/removed comment);
    # its feature values stay zero
    if body == "":
        continue
    # call extract() on each data point to fill in the 29 features
    feats[i][:-1] = extract(body)
    if i % 30000 == 0:
        print(i)
now = time.time()
print("It has been {0} seconds since the loop started".format(now - loop_starts_time))
feats.shape
# np.savez_compressed("feats.npz", feats)
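If the compressed archive is saved, it can be reloaded in a later session; np.savez_compressed stores an unnamed array under the default key "arr_0":

# feats = np.load("feats.npz")["arr_0"]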
In this section, I build classification models with several machine learning techniques, compare their performance, and pick the best one for hyper-parameter tuning and feature selection. The tested models are as follows:
ML models:
- SGDClassifier (a linear SVM trained with stochastic gradient descent)
- GaussianNB
- RandomForestClassifier
- AdaBoostClassifier
Neural Network model:
- MLPClassifier
From the experiments below, we find that MLP is the best model: it reaches a 0.44 overall accuracy across the four political groups, with "Left" having the best per-class accuracy (0.68) and "Alt" the worst (~0.32). The gap between these accuracies may be caused by: 1. "Alt" has lower support than "Left"; 2. the language used by "Alt" is more diverse than that of "Left", which makes its sentiment harder to capture.
Another finding is that after feature selection and hyper-parameter tuning of the MLP model, overall performance did not improve noticeably. I believe the neural model had already reached its top performance given the existing combination of features; natural language processing is inherently complex, and a "perfect model" is hard to obtain.
import argparse
import os
from scipy import stats
from scipy.stats import ttest_rel
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
X = feats[..., :-1]  # input (29 features)
y = feats[..., -1]   # label (last column)
(X_train, X_test, y_train, y_test) = train_test_split(X, y, test_size=0.20)
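The discussion above mentions feature selection; here is a minimal sketch of that step using SelectKBest and f_classif (imported above), with k=10 as an arbitrary illustrative choice:

# Keep the k features with the highest ANOVA F-scores.
selector = SelectKBest(f_classif, k=10)
X_train_sel = selector.fit_transform(X_train, y_train)
X_test_sel = selector.transform(X_test)
print(selector.get_support(indices=True))  # indices of the selected features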
def accuracy(C):
    ''' Compute accuracy given a NumPy confusion matrix C. Returns a float. '''
    total = np.sum(C)
    if total == 0:
        return 0.0
    # correct predictions lie on the diagonal
    return np.trace(C) / total

def recall(C):
    ''' Compute per-class recall given a NumPy confusion matrix C.
    Returns an array of floating point values. '''
    TP = np.diag(C)
    FN = np.sum(C, axis=1) - TP  # row sums minus the diagonal
    return TP / (TP + FN)

def precision(C):
    ''' Compute per-class precision given a NumPy confusion matrix C.
    Returns an array of floating point values. '''
    TP = np.diag(C)
    FP = np.sum(C, axis=0) - TP  # column sums minus the diagonal
    return TP / (TP + FP)
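A quick sanity check of the three metric helpers on a small hand-made confusion matrix (rows = true class, columns = predicted class):

C = np.array([[8, 2],
              [1, 9]])
print(accuracy(C))   # (8 + 9) / 20 = 0.85
print(recall(C))     # [8/10, 9/10] = [0.8, 0.9]
print(precision(C))  # [8/9, 9/11]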
def compare_nude_models(X_train, X_test, y_train, y_test):
    ''' Train the five untuned models and compare their test performance.

    Parameters:
        X_train: NumPy array, with the selected training features
        X_test: NumPy array, with the selected testing features
        y_train: NumPy array, with the selected training classes
        y_test: NumPy array, with the selected testing classes
    Returns:
        iBest: int, the index of the best-performing classifier
    '''
    y_true = y_test
    results = []
    best_accuracy = 0
    iBest = 0
    # 1. SGDClassifier: a support vector machine with a linear kernel,
    #    with the features standardized first.
    clf = make_pipeline(StandardScaler(), SGDClassifier())
    clf.fit(X_train, y_train)
    y_pred_sgd = clf.predict(X_test)
    cm1 = confusion_matrix(y_true, y_pred_sgd)
    results.append({
        "classifier_name": "SGDClassifier",
        "conf_matrix": cm1,
        "accuracy": accuracy(cm1),
        "recall": recall(cm1),
        "precision": precision(cm1)
    })
    # 2. GaussianNB: a Gaussian naive Bayes classifier.
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    y_pred_gnb = clf.predict(X_test)
    cm2 = confusion_matrix(y_true, y_pred_gnb)
    results.append({
        "classifier_name": "GaussianNB",
        "conf_matrix": cm2,
        "accuracy": accuracy(cm2),
        "recall": recall(cm2),
        "precision": precision(cm2)
    })
    # 3. RandomForestClassifier: with a maximum depth of 5 and 10 estimators.
    clf = RandomForestClassifier(max_depth=5, n_estimators=10)
    clf.fit(X_train, y_train)
    y_pred_rf = clf.predict(X_test)
    cm3 = confusion_matrix(y_true, y_pred_rf)
    results.append({
        "classifier_name": "RandomForestClassifier",
        "conf_matrix": cm3,
        "accuracy": accuracy(cm3),
        "recall": recall(cm3),
        "precision": precision(cm3)
    })
    # 4. MLPClassifier: a feed-forward neural network, with alpha = 0.05.
    clf = MLPClassifier(alpha=0.05)
    clf.fit(X_train, y_train)
    y_pred_mlp = clf.predict(X_test)
    cm4 = confusion_matrix(y_true, y_pred_mlp)
    results.append({
        "classifier_name": "MLPClassifier",
        "conf_matrix": cm4,
        "accuracy": accuracy(cm4),
        "recall": recall(cm4),
        "precision": precision(cm4)
    })
    # 5. AdaBoostClassifier: with the default hyper-parameters.
    clf = AdaBoostClassifier()
    clf.fit(X_train, y_train)
    y_pred_ada = clf.predict(X_test)
    cm5 = confusion_matrix(y_true, y_pred_ada)
    results.append({
        "classifier_name": "AdaBoostClassifier",
        "conf_matrix": cm5,
        "accuracy": accuracy(cm5),
        "recall": recall(cm5),
        "precision": precision(cm5)
    })
    # For each classifier, print its metrics and track the most accurate one.
    for index, model in enumerate(results):
        if model["accuracy"] > best_accuracy:
            iBest = index
            best_accuracy = model["accuracy"]
        print(f'Results for {model["classifier_name"]}:\n')
        print(f'\tAccuracy: {model["accuracy"]:.4f}\n')
        print(f'\tRecall: {[round(item, 4) for item in model["recall"]]}\n')
        print(f'\tPrecision: {[round(item, 4) for item in model["precision"]]}\n')
        print(f'\tConfusion Matrix: \n{model["conf_matrix"]}\n\n')
        print("---------------------")
    print("Best model index number is: ", iBest)
    return iBest
compare_nude_models(X_train, X_test, y_train, y_test)
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pipe = Pipeline(steps=[('clf', MLPClassifier(max_iter=100))])
search_space = [{'clf__hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
                 'clf__activation': ['logistic', 'relu'],
                 'clf__solver': ['sgd', 'adam'],
                 'clf__alpha': [0.0001],
                 'clf__learning_rate': ['constant', 'adaptive']}]
clf = GridSearchCV(pipe, search_space, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)

# Best parameter set
print('Best parameters found:\n', clf.best_params_)

# All results
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print(" ")

y_true, y_pred = y_test, clf.predict(X_test)
from sklearn.metrics import classification_report
print('Results on the test set:')
print(classification_report(y_true, y_pred))
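ttest_rel and KFold were imported at the top of this section but not exercised above; here is a minimal sketch of how the two strongest models could be compared with a paired t-test over cross-validation folds (the fold count and random_state are arbitrary choices here):

# Paired t-test on per-fold accuracies of two classifiers.
kf = KFold(n_splits=5, shuffle=True, random_state=0)
acc_mlp, acc_ada = [], []
for train_idx, test_idx in kf.split(X):
    mlp = MLPClassifier(alpha=0.05).fit(X[train_idx], y[train_idx])
    ada = AdaBoostClassifier().fit(X[train_idx], y[train_idx])
    acc_mlp.append(mlp.score(X[test_idx], y[test_idx]))
    acc_ada.append(ada.score(X[test_idx], y[test_idx]))
t_stat, p_value = ttest_rel(acc_mlp, acc_ada)
print("t = %.3f, p = %.3f" % (t_stat, p_value))  # a small p suggests a real gap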