Text analysis and agile sarcasm prediction
Using classical methods to approach the sarcastic sentiment classification problem
- Traditional approach to tackling sarcasm detection
- Basic text properties of headlines and sarcasm in this dataset
- EDA
- Baseline Models
- Refine Corpus
- Stacking Classifiers
import re
from IPython.display import display
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.tokenize import casual_tokenize
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import StackingClassifier
from google.colab import files
# !pip install kaggle
uploaded = files.upload()
# Then move kaggle.json into the folder where the API expects to find it.
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d rmisra/news-headlines-dataset-for-sarcasm-detection
!unzip /content/news-headlines-dataset-for-sarcasm-detection.zip
df_sarcasm_1 = pd.read_json('Sarcasm_Headlines_Dataset.json', lines=True)
df_sarcasm_2 = pd.read_json('Sarcasm_Headlines_Dataset_v2.json', lines=True)
display(df_sarcasm_1.head())
print("===============")
display(df_sarcasm_2.head())
print("df1_shape:", df_sarcasm_1.shape)
print("df2_shape:", df_sarcasm_2.shape)
# Combine the two dataframes into one
df_sarcasm = pd.concat([df_sarcasm_1, df_sarcasm_2]).reset_index(drop=True)
del df_sarcasm_1, df_sarcasm_2
df_sarcasm.drop(['article_link'], inplace=True, axis=1)
print("df_shape:", df_sarcasm.shape)
df_sarcasm.head()
df_sarcasm.isna().sum()
sns.set_style("dark")
sns.countplot(data=df_sarcasm, x='is_sarcastic')
The dataset does not appear imbalanced enough to affect the performance of the chosen machine learning algorithms.
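To put a number on that balance, a minimal check of the label proportions (using the `df_sarcasm` frame built above) shows how close the split is to 50/50:
# Share of sarcastic (1) vs. non-sarcastic (0) headlines in the combined frame
print(df_sarcasm.is_sarcastic.value_counts(normalize=True))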
df_sarcasm['word_counts'] = df_sarcasm.headline.str.split().apply(len)
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# axs[0].hist(word_counts.values, color='red', range=(2, 5))
sns.boxplot(data=df_sarcasm.loc[df_sarcasm.is_sarcastic == 1], x='word_counts', color='red', ax=axs[0])
# sns.boxplot(data=df_sarcasm.loc[(df_sarcasm.is_sarcastic == 1) & (df_sarcasm.word_counts < 50)], x='word_counts', color='red', ax=axs[0])
axs[0].set_title('Sarcastic text')
# axs[1].hist(word_counts, color='green')
sns.boxplot(data=df_sarcasm.loc[df_sarcasm.is_sarcastic == 0], x='word_counts', color='green',ax=axs[1])
axs[1].set_title('Non-Sarcastic text')
df_sarcasm.drop(columns=['word_counts'], inplace=True)
plt.show()
If we remove the outliers in the sarcastic text group (the plot below keeps only headlines under 50 words), sarcastic and non-sarcastic texts actually share a similar word-count distribution; the summary statistics after the plots back this up.
df_sarcasm['word_counts'] = df_sarcasm.headline.str.split().apply(len)
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
sns.boxplot(data=df_sarcasm.loc[(df_sarcasm.is_sarcastic == 1) & (df_sarcasm.word_counts < 50)], x='word_counts', color='red', ax=axs[0])
axs[0].set_title('Sarcastic text')
sns.boxplot(data=df_sarcasm.loc[df_sarcasm.is_sarcastic == 0], x='word_counts', color='green',ax=axs[1])
axs[1].set_title('Non-Sarcastic text')
df_sarcasm.drop(columns=['word_counts'], inplace=True)
plt.show()
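The boxplots can also be backed by per-class summary statistics; a small sketch (recomputing the temporary word-count series, since the column is dropped above):
# Compare word-count distributions of sarcastic (1) vs. non-sarcastic (0) headlines
word_counts = df_sarcasm.headline.str.split().apply(len)
print(word_counts.groupby(df_sarcasm.is_sarcastic).describe())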
The chart below shows the most frequent words in our corpus.
all_words = df_sarcasm.headline.str.split(expand=True).unstack().value_counts()[:30]
sns.set(rc={'figure.figsize':(20, 12)})
sns.barplot(x=all_words.index, y=all_words.values)
wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(df_sarcasm.loc[df_sarcasm.is_sarcastic == 1].headline.str.cat(sep=' '))
plt.figure(figsize=(20, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title('Common words from sarcastic headlines')
plt.show()
wordcloud = WordCloud(max_font_size=50, max_words=50, background_color="white").generate(df_sarcasm.loc[df_sarcasm.is_sarcastic == 0].headline.str.cat(sep=' '))
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.title('Common words from non-sarcastic headlines')
plt.show()
X_train, X_test, y_train, y_test = train_test_split(df_sarcasm.headline, df_sarcasm.is_sarcastic, random_state=12020)
def cross_validation(pipeline, data, label):
    """Run 5-fold cross-validation for a pipeline and print per-fold metrics."""
    models = []
    kf = KFold(n_splits=5)
    mean_score = []
    for train_index, valid_index in kf.split(data):
        X_fold_train, y_fold_train = data.iloc[train_index], label.iloc[train_index]
        X_valid, y_valid = data.iloc[valid_index], label.iloc[valid_index]
        pipeline.fit(X_fold_train, y_fold_train)
        y_pred = pipeline.predict(X_valid)
        models.append(pipeline)
        # y_special = pipeline.predict(np.array(["Well, if you want to see less of me, maybe we should go out again"]))
        # print("y_special:", y_special)
        print('Precision: %.3f' % precision_score(y_true=y_valid, y_pred=y_pred))
        print('Recall: %.3f' % recall_score(y_true=y_valid, y_pred=y_pred))
        print('F1: %.3f' % f1_score(y_true=y_valid, y_pred=y_pred))
        score = accuracy_score(y_true=y_valid, y_pred=y_pred)
        print('Accuracy: %.3f' % score)
        mean_score.append(score)
        # sns.heatmap(confusion_matrix(y_valid, y_pred), annot=True)
        # plt.show()
        print("================================")
    print(f"Finished, average accuracy is {np.mean(mean_score):.3f}")
pipe = Pipeline([('count_vect', CountVectorizer()), ('clf', MultinomialNB())])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('tfidf_vect', TfidfVectorizer()), ('clf', MultinomialNB())])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('count_vect', CountVectorizer()), ('clf', LogisticRegression(max_iter=200))])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('tfidf_vect', TfidfVectorizer()), ('clf', LogisticRegression(max_iter=200))])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('count_vect', CountVectorizer()), ('clf', LinearSVC(random_state=0, tol=1e-5))])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('tfidf_vect', TfidfVectorizer()), ('clf', LinearSVC(random_state=0, tol=1e-5))])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('count_vect', CountVectorizer(ngram_range=(1, 3), tokenizer=casual_tokenize)),
('clf', MultinomialNB())])
cross_validation(pipe, X_train, y_train)
pipe = Pipeline([('count_vect', CountVectorizer(ngram_range=(1, 3), tokenizer=casual_tokenize)),
('clf', LogisticRegression(max_iter=200))])
cross_validation(pipe, X_train, y_train)
import spacy
# spacy.load('en') is deprecated shorthand; load the small English model instead
# (preinstalled on Colab) and reuse it for lemmatization inside the tokenizer.
nlp = spacy.load('en_core_web_sm')
def my_tokenizer(doc):
    tokens = nlp(doc)
    return [token.lemma_ for token in tokens]
custom_vec = CountVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 3))
estimators = [('tokenizer', custom_vec),
('clf', MultinomialNB())]
pipe = Pipeline([estimators[0], estimators[1]])
cross_validation(pipe, X_train, y_train)
In conclusion, casual tokenization performs slightly better than lemmatization, so we will use the casual tokenizer when ensembling our models.
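For a compact side-by-side check of that conclusion, the two tokenizers can also be compared with cross_val_score; a small sketch reusing my_tokenizer and casual_tokenize from above (the lemmatizing run is slow, and the printed accuracies are only indicative since they vary with the splits):
for name, tok in [('casual', casual_tokenize), ('lemma', my_tokenizer)]:
    pipe = make_pipeline(CountVectorizer(tokenizer=tok, ngram_range=(1, 3)), MultinomialNB())
    scores = cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy')
    print(f"{name}: mean accuracy {scores.mean():.3f} (+/- {scores.std():.3f})")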
Stacking is a well-known and powerful way to reduce the bias of each individual estimator's predictions. Here we use stacking to ensemble two estimators: the predictions of 'MNB' and 'Logi' serve as inputs to train a final classifier (LinearSVC), which produces the final prediction.
estimators = [
('MNB', make_pipeline(CountVectorizer(ngram_range=(1, 5), tokenizer=casual_tokenize),
MultinomialNB())),
('Logi', make_pipeline(CountVectorizer(ngram_range=(1, 3), tokenizer=casual_tokenize),
LogisticRegression(max_iter=200))),
# ('svr', make_pipeline(CountVectorizer(ngram_range=(1, 3), tokenizer=casual_tokenize),
# LinearSVC(random_state=0, tol=1e-5, max_iter=500)))
]
clf = StackingClassifier(
# estimators=estimators, final_estimator=LogisticRegression(max_iter=200))
estimators=estimators, final_estimator=LinearSVC(random_state=0, tol=1e-5, max_iter=500))
# clf.fit(X_train, y_train).score(X_test, y_test)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_pred)
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True)
plt.show()
print(score)
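For completeness, the same precision, recall, and F1 metrics reported during cross-validation can also be computed on the held-out test set; a short sketch reusing the already-imported metric functions and the y_pred above:
print('Precision: %.3f' % precision_score(y_test, y_pred))
print('Recall: %.3f' % recall_score(y_test, y_pred))
print('F1: %.3f' % f1_score(y_test, y_pred))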
In this notebook, using this traditional NLP approach, we improved the accuracy of sarcasm detection from 88.9% to 96.2%.