With so much unstructured data online, in the form of text, video, and audio, how can we turn it into structured data? Nowadays we have access to thousands of reviews and comments on different social media platforms; some reviews come with numeric ratings (like those on Yelp) and some don't (like those on Twitter). Business owners can certainly trace reviews and comments from customers' posts across popular social media platforms, but the process becomes tedious when the number of reviews is large and the conclusions are mostly qualitative. For the reviews and comments that don't come with numeric ratings, if we can label them as positive or negative (or on a scale of 1 to 5) by training models on labeled data (Yelp data), we can help business owners track customer feedback in a faster and more interpretable way, so that they can adjust their services and offerings in response to customers' most recent feedback.
The purpose of the project is to gain an understanding of Yelp users' reviews and to predict sentiment based on the text of individual Yelp reviews.
# New package installation (uncomment as needed)
# import nltk
# nltk.download('wordnet')
# !pip install gensim
# import nltk
# nltk.download('stopwords')
# ! pip install regex
#! pip install langdetect
#! pip install langid
#! pip install pydot
#! pip install wordcloud
# load necessary packages
from __future__ import division, print_function
import pandas as pd
import os
import json
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import re
import regex
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib
mpl.rcParams['figure.figsize'] = (8, 8)
#inline_rc = dict(mpl.rcParams)
import string
from nltk import word_tokenize
from nltk.corpus import stopwords
import autocorrect
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
import gensim
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense,Input, Dropout, Reshape, Activation,Flatten, concatenate, Input
from keras.layers import Bidirectional,GlobalMaxPooling1D,Conv1D, MaxPooling1D, Conv2D,MaxPool2D, MaxPooling2D
from keras.layers import Activation, Embedding, GRU
from keras.layers.recurrent import LSTM
from keras.models import Sequential
from keras.models import Model
import collections
from keras.models import load_model
from nltk.classify import textcat
from langdetect import detect
import langid
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from numpy import savetxt
from keras.callbacks import History
import pydotplus
import keras
import pydot as pyd
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import graphviz
from keras import regularizers
from numpy import asarray
from numpy import loadtxt
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import classification_report
Specific Steps:
# Load Yelp review dataset
reviews = []
with open('yelp_academic_dataset_review.json') as fl:
    for i, line in enumerate(fl):
        reviews.append(json.loads(line))
        #if i+1 >= 100000:
        #    break
df = pd.DataFrame(reviews)
df.head()
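As a side note, the same line-by-line load can be done in a single call, since the Yelp dump is in JSON Lines format. A minimal alternative sketch (not part of the pipeline below; df_alt is just an illustrative name):
# Alternative (sketch only): pandas can parse JSON Lines files directly
df_alt = pd.read_json('yelp_academic_dataset_review.json', lines=True)
df_alt.head()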
# Load Yelp business dataset
business = []
with open('yelp_academic_dataset_business.json') as fl:
    for i, line in enumerate(fl):
        business.append(json.loads(line))
df_busi = pd.DataFrame(business)
df_busi.head()
df.rename(columns={'stars': 'stars_review'}, inplace=True)
df_busi.rename(columns={'stars': 'stars_business'}, inplace=True)
df.shape
df_busi.shape
# Number of Restaurants in each city
pd.DataFrame(df_busi.city.value_counts(dropna=False)).head(10)
# filter for businesses in Phoenix city only
phoenix_busi_df = df_busi.loc[df_busi.city == "Phoenix", :]
# filter for reviews for Phoenix city only
phoenix_busi_id = df_busi.loc[df_busi.city == "Phoenix", "business_id"].tolist()
# Create a Dataframe containing reviews for restaurants in Phoenix only
phoenix_rev = df[df["business_id"].isin(phoenix_busi_id)]
# Merge business and review dataset for Phoenix city only
df_phoenix = pd.merge(phoenix_rev, phoenix_busi_df, on="business_id")
df_phoenix.to_csv("df_phoenix_reviews.csv")
df = pd.read_csv("df_phoenix_reviews.csv")
df.head()
df.columns
df.shape
# make sure there are no duplicate reviews
df.review_id.value_counts()
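An equivalent, more explicit uniqueness check (sketch only, same information as the value_counts above):
# raises an AssertionError if any review_id appears more than once
assert df.review_id.is_unique, "duplicate review_id values found"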
df.describe(include="all")
# Filter for only restaurants
is_restaurants = [re.search("restaurants",str(df.categories[i]).lower()) is not None for i in range(len(df))]
df = df.loc[is_restaurants,:]
# check shape of dataframe
df.shape
# looks like there is data type inconsistency
df.stars_review.value_counts()
# Check the distribution of target variable - dataset imbalanced
df.stars_review.value_counts(normalize = True)
labels = df.stars_review.value_counts().index.tolist()
labels
x=df['stars_review'].value_counts()
x=x.sort_index()
#plot
plt.figure(figsize=(8,4))
ax= sns.barplot(x.index, x.values, alpha=0.8)
plt.title("Yelp Star Rating Distribution")
plt.ylabel('# of Reviews', fontsize=12)
plt.xlabel('Star Ratings ', fontsize=12)
Note that 5-star ratings account for 44.1% of the total, followed by 4-star ratings at 24.6%. The distribution across the five star levels is not balanced, but I don't think this mild imbalance will be a big issue for the accuracy of the model predictions.
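If the imbalance did turn out to hurt the minority classes, one common mitigation (not used in this project) would be to weight the loss by inverse class frequency. A minimal sketch with scikit-learn, assuming the stars_review column above; the resulting dictionary could be passed to Keras via model.fit(..., class_weight=...):
# Sketch only (not used below): inverse-frequency class weights
# index 0 corresponds to 1 star, ..., index 4 to 5 stars
from sklearn.utils.class_weight import compute_class_weight
classes = np.sort(df['stars_review'].unique())
weights = compute_class_weight('balanced', classes=classes, y=df['stars_review'])
class_weight_dict = dict(zip(range(len(classes)), weights))
class_weight_dict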
# one-hot-encoding the target variables, stars_review
df = pd.concat([df,pd.get_dummies(df['stars_review'], prefix='review_stars_')],axis=1)
# Create y
y = df["stars_review"]
# Split into train test
train, test = train_test_split(df, test_size=0.1, random_state=42, stratify = y)
# Create ytrain
ytrain = train["stars_review"]
# Further splitting train into train and validation set
train, valid = train_test_split(train, test_size=0.2, random_state=42, stratify = ytrain)
train.stars_review.value_counts(normalize = True)
valid.stars_review.value_counts(normalize = True)
test.stars_review.value_counts(normalize = True)
train.to_csv("yelp_train.csv")
test.to_csv("yelp_test.csv")
valid.to_csv("yelp_valid.csv")
# Load back datasets if needed
train= pd.read_csv("yelp_train.csv")
valid= pd.read_csv("yelp_valid.csv")
test= pd.read_csv("yelp_test.csv")
train.columns
train_sentence = [sen for sen in train["text"]]
training_all_sens = ' '.join(train_sentence)
# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(training_all_sens)
# Display the generated image:
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()
wordcloud.to_file("yelp_phoenix_wordcloud.png")
# Try removing some sentiment-neutral words
# use a separate name so we don't shadow nltk's stopwords module imported above
wc_stopwords = set(STOPWORDS)
wc_stopwords.update(["think", "one", "restaurant", "know", "meal", "say",
                     "eat"])
# Create and generate a word cloud image:
wordcloud_1 = WordCloud(max_font_size=50, max_words=100, stopwords=wc_stopwords,
                        background_color="white").generate(training_all_sens)
# Display the generated image:
plt.figure()
plt.imshow(wordcloud_1, interpolation="bilinear")
plt.axis("off")
plt.show()
Create separate word clouds for low-rating and high-rating reviews
negative_ind = train.loc[(train.stars_review == 1) | (train.stars_review == 2) | (train.stars_review == 3), "text"]
positive_ind = train.loc[(train.stars_review == 4) | (train.stars_review == 5) , "text"]
training_all_neg = ' '.join(negative_ind)
training_all_pos = ' '.join(positive_ind)
# Create and generate a word cloud image:
wordcloud_neg = WordCloud(max_font_size=50, max_words=100, stopwords=wc_stopwords,
                          background_color="white").generate(training_all_neg)
# Display the generated image:
plt.figure()
plt.imshow(wordcloud_neg, interpolation="bilinear")
plt.axis("off")
plt.show()
# Create and generate a word cloud image:
wordcloud_pos = WordCloud(max_font_size=50, max_words=100, stopwords=wc_stopwords,
                          background_color="white").generate(training_all_pos)
# Display the generated image:
plt.figure()
plt.imshow(wordcloud_pos, interpolation="bilinear")
plt.axis("off")
plt.show()
Stop words are words that do not contribute to the deeper meaning of a sentence or phrase, such as "a", "the", and "is". Removing stop words reduces the vocabulary size and therefore speeds up processing. While many stop word lists are available online, it's important to consider whether they are appropriate for the specific task of your project. For example, removing stop words such as "don't", "not", and "aren't" can flip a sentence from negative sentiment to positive sentiment. Therefore I decided to customize a stop word list for this project.
See link for more details: https://medium.com/@limavallantin/why-is-removing-stop-words-not-always-a-good-idea-c8d35bd77214
A short list of additional considerations when cleaning text:
Source : https://machinelearningmastery.com/clean-text-machine-learning-python/
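To make this concrete, here is a small illustration (sketch only) of how the standard NLTK list strips the negation while a negation-preserving list, like the custom one built later in this notebook, keeps it:
# illustration only: NLTK's default list drops "not", erasing the negative cue
example = "the food was not good".split()
nltk_stops = set(stopwords.words('english'))
print([w for w in example if w not in nltk_stops])            # -> ['food', 'good']
# a negation-preserving variant (hypothetical) keeps "not"
keep_negation_stops = nltk_stops - {'not', "don't", "aren't"}
print([w for w in example if w not in keep_negation_stops])   # -> ['food', 'not', 'good']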
# Load back datasets if needed
train= pd.read_csv("yelp_train.csv")
valid= pd.read_csv("yelp_valid.csv")
test= pd.read_csv("yelp_test.csv")
striped_sen = [sen.strip() for sen in train['text']]
english_text_train = []
notlangs_train =[]
for i in range(len(train)):
    try:
        detected_lang = detect(striped_sen[i])
        if detected_lang == "en":
            english_text_train.append(i)
    except:
        notlangs_train.append(i)
notlangs_train
train["text"][notlangs_train]
# get index of all rows in train
full_ind = [i for i in range(len(train))]
# get index of non-english reviews
english_train_set = set(english_text_train) # this reduces the lookup time from O(n) to O(1)
noteng = [ind for ind in full_ind if ind not in english_train_set]
# Let's look at what is identified as not English - not very accurate but decent
train.loc[noteng,"text"].head(10)
# Remove reviews that could not be identified as English
train.drop(index = noteng , axis=0, inplace= True )
train.shape
train['text'][8]
#Lowercasing before negation
lower_case = [[sen.lower()] for sen in train['text']]
# let's see an example below
lower_case[8]
#split sentence into words
words = [sen[0].split() for sen in lower_case]
# Check an example here
words[8]
# %load appos.py
appos = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}
def appos_remove(sen):
    return [appos[word] if word in appos else word for word in sen]
# Expand apostrophe contractions into their full forms
appos_removed = [appos_remove(sen) for sen in words]
# confirm the function works: "i've" became 'I have', "you'll" became "you will"
appos_removed[8]
#rejoin again
rejoined_sen = [' '.join(sen) for sen in appos_removed]
#Lowercasing again
lower_sen = [sen.lower() for sen in rejoined_sen]
lower_sen[8]
# define a function to replace punctuations with white space
def remove_punct(text):
    text_nopunct = re.sub('['+string.punctuation+']', ' ', text)
    return text_nopunct
# replace punctuations in the text with white space
text_rem_punct = [remove_punct(sen) for sen in lower_sen]
# Now all punctuations are removed, e.g., forward slash in 'drinks/dessert' was removed
text_rem_punct[8]
#Tokenize text on whitespace
#token_word = [sen.split() for sen in lower_sen]
token_word = [WhitespaceTokenizer().tokenize(sen) for sen in text_rem_punct]
token_word[8]
# Remove non-alphabetic tokens, such as numbers
def remove_non_alpha(sen):
    return [word for word in sen if word.isalpha()]
non_alpha_removed = [remove_non_alpha(sen) for sen in token_word]
train['text'][120]
# confirm the function works
" ".join(non_alpha_removed[120])
Customize a stopword list
see helpful link: https://programminghistorian.org/en/lessons/counting-frequencies
# Combine all words in the train set
all_train_words = [word for tokens in non_alpha_removed for word in tokens]
train_vocab = list(set(all_train_words))
print("The training set has a total of "+ str(len(all_train_words)) + " words with a vocab size of " + str(len(train_vocab))
+ " unique words" )
# def wordListToFreqDict(wordlist, vocab):
# wordfreq = [wordlist.count(w) for w in vocab]
# return dict(list(zip(vocab,wordfreq)))
#train_dict = wordListToFreqDict(wordlist = all_train_words, vocab = train_vocab)
# the function above is not recommended, not efficient
# create a vocab dictionary to record the frequency of each word
train_dict = {}
for word in all_train_words:
    try:
        train_dict[word] += 1
    except KeyError:
        train_dict[word] = 1
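An equivalent and more idiomatic way to build the same frequency dictionary is collections.Counter (collections is already imported above); a quick sketch with a sanity check:
# equivalent construction using collections.Counter
train_dict_counter = dict(collections.Counter(all_train_words))
assert train_dict_counter == train_dict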
# create function to sort the vocab dictionary from most frequent to least
def sortFreqDict(freqdict):
    freqword_list = [(freqdict[key], key) for key in freqdict]
    freqword_list.sort()
    freqword_list.reverse()
    return freqword_list
#sort the vocab dictionary from most frequent to least
sorted_train_dict = sortFreqDict(train_dict)
sorted_train_dict = dict(sorted_train_dict)
train_len = len(non_alpha_removed)
train_len
# find most frequent words
poplist=[]
for num, word in sorted_train_dict.items():
    if num/len(all_train_words) > 0.002:
        poplist.append(word)
poplist
# identify any words that show up in more than 80% of the reviews in training set
stop_word_cand=[]
for word in poplist:
    if sum([word in sen for sen in non_alpha_removed])/train_len >= 0.80:
        stop_word_cand.append(word)
stop_word_cand
# Get stop words from NLTK
stoplist = stopwords.words('english')
stoplist
# create my own stopword list
custom_stop_words = ['i','me','my','myself',
'we','us','our','ours','ourselves',
'you',"you're","you've","you'll","you'd",'your','yours','yourself',
'yourselves',
'he','him','his','himself',
'she',"she's",'her','hers','herself',
'it',"it's", 'its','itself',
'they', 'them', 'their', 'theirs','themselves',
'what','which', 'who','whom', 'this','that', "that'll",'these','those',
'am', 'is','are', 'was','were','be', 'been', 'being','will','would',
'here','there',
'have','has','had', 'having', 'do','does','did','doing',
'a','an',"and",'the', 'or',
't','d','s', 'll','m','o','re','ve', 'ma',
'to','of','for','in','out','with','on','up', 'at','as','from','about']
def removeStopWords(tokens):
    return [word for word in tokens if word not in custom_stop_words]
# remove stop words
filtered_words = [removeStopWords(sen) for sen in non_alpha_removed]
# check if stop words removed or not
filtered_words[8]
It would be nice if we could correct the spelling errors in the text. However, we skip this step for now because the function takes too long to run.
reference:https://www.quora.com/Are-there-any-NLP-auto-correct-auto-complete-libraries-for-Python
# # create a function for auto correct spelling errors
# def correctspelling(tokens):
# spell = autocorrect.Speller(lang='en')
# return [spell(word) for word in tokens]
# # Check if function works
# correctspelling(['caaaar','mussage','hte'])
# # auto-correct spelling
# corrected_words = [correctspelling(sen) for sen in filtered_words]
# # check if stop words removed or not
# corrected_words[0]
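If spelling correction were ever needed, one way to make it tractable (a sketch only, kept commented out like the cell above) is to correct each unique vocabulary word once and map tokens through that cache, instead of calling the speller on every token of every review:
# # Sketch (not run): cache one correction per unique word instead of per token
# spell = autocorrect.Speller(lang='en')
# unique_words = set(word for sen in filtered_words for word in sen)
# correction_cache = {w: spell(w) for w in unique_words}
# corrected_words = [[correction_cache[w] for w in sen] for sen in filtered_words]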
"The goal of both stemming and lemmatization is to reduce inflected words to their word stem, base or root form—generally a written word form. For example, raining, rains, rained could be all stemmized to "rain".
The difference between stemming and lemmatization is the way they change the words. Stemming usually directly chops off the ends of words in the hope of achieving this goal correctly most of the time, and often includes the removal of derivational affixes. Lemmatization usually refers to doing things properly with the use of a vocabulary and morphological analysis of words, normally aiming to remove inflectional endings only and to return the base or dictionary form of a word, which is known as the lemma."
Comparing the lemmatized text with the snowball-stemmed text, I prefer to use only the lemmatized text, since stemming changes some words into less interpretable forms, e.g. 'happy' to 'happi'.
Source : https://stackoverflow.com/questions/1787110/what-is-the-difference-between-lemmatization-vs-stemming
def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
def porterstemming(tokens):
    ps = PorterStemmer()
    return [ps.stem(word) for word in tokens]
def snowballstemming(tokens):
    ss = SnowballStemmer("english")
    return [ss.stem(word) for word in tokens]
# Check if the function works
lemmatization(["rocking", "rains","rained","boys", "ran", "generously","happy"])
# Check if the function works
porterstemming(["rocking", "rains","rained","boys", "ran", "generously","happy"])
# Check if the function works
snowballstemming(["rocking", "rains","rained","boys", "ran", "generously","happy"])
# Apply lemmatization to text
lemmatized_words = [lemmatization(sen) for sen in filtered_words]
# # Apply stemming
# stemmed_words = [snowballstemming(sen) for sen in lemmatized_words]
# # Check reviews after lemmatized and stemmed
# stemmed_words[8]
# Check reviews after lemmatized
lemmatized_words[8]
# Make sure the cleaned text has the same length as the train dataframe
len(lemmatized_words) == len(train)
# Add cleaned(lemmatized) Text back to df
train['text_clean'] = [' '.join(sen) for sen in lemmatized_words]
# add tokenized cleaned Text back to df
train['tokens'] =lemmatized_words
train["text"][8]
train["text_clean"][8]
# Rename the dictionary to appos_dict so that the function below can run
appos_dict = {
"aren't" : "are not",
"can't" : "cannot",
"couldn't" : "could not",
"didn't" : "did not",
"doesn't" : "does not",
"don't" : "do not",
"hadn't" : "had not",
"hasn't" : "has not",
"haven't" : "have not",
"he'd" : "he would",
"he'll" : "he will",
"he's" : "he is",
"i'd" : "I would",
"i'd" : "I had",
"i'll" : "I will",
"i'm" : "I am",
"isn't" : "is not",
"it's" : "it is",
"it'll":"it will",
"i've" : "I have",
"let's" : "let us",
"mightn't" : "might not",
"mustn't" : "must not",
"shan't" : "shall not",
"she'd" : "she would",
"she'll" : "she will",
"she's" : "she is",
"shouldn't" : "should not",
"that's" : "that is",
"there's" : "there is",
"they'd" : "they would",
"they'll" : "they will",
"they're" : "they are",
"they've" : "they have",
"we'd" : "we would",
"we're" : "we are",
"weren't" : "were not",
"we've" : "we have",
"what'll" : "what will",
"what're" : "what are",
"what's" : "what is",
"what've" : "what have",
"where's" : "where is",
"who'd" : "who would",
"who'll" : "who will",
"who're" : "who are",
"who's" : "who is",
"who've" : "who have",
"won't" : "will not",
"wouldn't" : "would not",
"you'd" : "you would",
"you'll" : "you will",
"you're" : "you are",
"you've" : "you have",
"'re": " are",
"wasn't": "was not",
"we'll":" will",
"didn't": "did not"
}
# define a function to remove apostrophes
def appos_remove(sen):
    return [appos_dict[word] if word in appos_dict else word for word in sen]
# define a function to replace punctuations with white space
def remove_punct(text):
    text_nopunct = re.sub('['+string.punctuation+']', ' ', text)
    return text_nopunct
# def remove_punct_sen(sen):
#     return [remove_punct(word) for word in sen]
# Remove non-alphabetic tokens, such as numbers
def remove_non_alpha(sen):
    return [word for word in sen if word.isalpha()]
def removeStopWords(tokens):
    return [word for word in tokens if word not in custom_stop_words]
# Word normalization: lemmatization
def lemmatization(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in tokens]
# create my own stopword list
custom_stop_words = ['i','me','my','myself',
'we','us','our','ours','ourselves',
'you',"you're","you've","you'll","you'd",'your','yours','yourself',
'yourselves',
'he','him','his','himself',
'she',"she's",'her','hers','herself',
'it',"it's", 'its','itself',
'they', 'them', 'their', 'theirs','themselves',
'what','which', 'who','whom', 'this','that', "that'll",'these','those',
'am', 'is','are', 'was','were','be', 'been', 'being','will','would',
'here','there',
'have','has','had', 'having', 'do','does','did','doing',
'a','an',"and",'the', 'or',
't','d','s', 'll','m','o','re','ve', 'ma',
'to','of','for','in','out','with','on','up', 'at','as','from','about']
# define a function to clean data
def text_cleaning(dataset):
    # 1. Remove non-English reviews
    # first remove any leading and trailing whitespace
    striped_sen = [sen.strip() for sen in dataset["text"]]
    english_text = []
    notlangs = []
    for i in range(len(dataset)):
        try:
            detected_lang = detect(striped_sen[i])
            if detected_lang == "en":
                english_text.append(i)
        except:
            notlangs.append(i)
    full_ind = [i for i in range(len(dataset))]
    english_set = set(english_text)  # this reduces the lookup time from O(n) to O(1)
    not_eng = [ind for ind in full_ind if ind not in english_set]
    dataset.drop(index=not_eng, axis=0, inplace=True)
    # 2. Convert to lowercase
    lower_case = [[sen.lower()] for sen in dataset['text']]
    # 3. Split sentences into words by whitespace
    words = [sen[0].split() for sen in lower_case]
    # 4. Expand apostrophe contractions into their full forms
    appos_removed = [appos_remove(sen) for sen in words]
    # rejoin again
    rejoined_sen = [' '.join(sen) for sen in appos_removed]
    # lowercase again
    lower_sen = [sen.lower() for sen in rejoined_sen]
    # 5. Punctuation: replace punctuation in the text with white space
    text_rem_punct = [remove_punct(sen) for sen in lower_sen]
    # 6. Tokenize on whitespace
    token_word = [WhitespaceTokenizer().tokenize(sen) for sen in text_rem_punct]
    # 7. Remove non-alphabetic tokens
    non_alpha_removed = [remove_non_alpha(sen) for sen in token_word]
    # 8. Remove stop words
    filtered_words = [removeStopWords(sen) for sen in non_alpha_removed]
    # 9. Apply lemmatization
    lemmatized_words = [lemmatization(sen) for sen in filtered_words]
    # Add cleaned (lemmatized) text back to the dataframe
    dataset.loc[:, 'text_clean'] = [' '.join(sen) for sen in lemmatized_words]
    # Add tokenized cleaned text back to the dataframe
    dataset.loc[:, 'tokens'] = lemmatized_words
    return dataset
#train= pd.read_csv("yelp_train.csv")
valid= pd.read_csv("yelp_valid.csv")
test= pd.read_csv("yelp_test.csv")
#train = text_cleaning(train)
train.loc[:,["stars_review","tokens","text_clean","text"]].head()
valid= text_cleaning(valid)
valid.loc[:,["stars_review","tokens","text_clean","text"]].head()
valid.shape
valid['text'][6]
valid['text_clean'][6]
# save cleaned training dataset
train.to_csv("yelp_train_cleaned_0325.csv")
# save cleaned validation dataset
valid.to_csv("yelp_valid_cleaned_0325.csv")
train = pd.read_csv("yelp_train_cleaned_0325.csv")
valid = pd.read_csv("yelp_valid_cleaned_0325.csv")
train['text_clean'][8]
To choose an appropriate vocabulary size and maximum sequence length for this dataset, I drew on Paul Nation and Robert Waring's paper "Vocabulary Size, Text Coverage and Word Lists". Their idea is that a vocabulary of about 3,000 words, providing coverage of at least 95% of a text, allows new language learners (here, the models I will train) to learn unknown words efficiently from context (Paul Nation and Robert Waring). http://www.fltr.ucl.ac.be/fltr/germ/etan/bibs/vocab/cup.html
Based on that result, I decided to use a vocabulary large enough to cover at least 98% (greater than 95%) of the entire training text, with a vocabulary size of no fewer than 3,000 words. The maximum sequence length is chosen to be at least as long as 95% of all reviews in the train set.
Let's take a look at the cleaned text to see how big the vocabulary is.
# helper function to summarize the vocabulary of a dataset
def check_vocab(dataset):
    dataset_info = {}
    all_words = [word for tokens in dataset.tokens for word in tokens]
    vocab = sorted(list(set(all_words)))
    sentence_lengths = [len(tokens) for tokens in dataset.tokens]
    dataset_info["number_all_words"] = len(all_words)
    dataset_info["number_vocab"] = len(vocab)
    dataset_info["max_sentence_lengths"] = max(sentence_lengths)
    print("%s words total, with a vocabulary size of %s" % (len(all_words), len(vocab)))
    print("Max sentence length is %s" % max(sentence_lengths))
    return dataset_info
# All words in the training set
train_info = check_vocab(train)
train_info
# average sequence length of all train set reviews is 59
np.mean([len(tokens) for tokens in train.tokens])
# 95% of the reviews have sequences shorter than 164 tokens
np.quantile([len(tokens) for tokens in train.tokens],0.95)
# 98% of the reviews have sequences no longer than 225 tokens
np.quantile([len(tokens) for tokens in train.tokens],0.98)
# Use 225 as max sequence length
max_sequence_len = 225
all_cleaned_train_words = [word for tokens in train["tokens"] for word in tokens]
cleaned_train_vocab = sorted(list(set(all_cleaned_train_words)))
print("The cleaned train set has a total of "+ str(len(all_cleaned_train_words)) +
" words with a vocab size of " + str(len(cleaned_train_vocab))+
" unique words." )
print("{:.2%}".format(len(cleaned_train_vocab)/len(all_cleaned_train_words)) + " of words in the train set are unique")
# create a vocab dictionary to record the frequency of each word
cleaned_train_dict = {}
for word in all_cleaned_train_words:
    try:
        cleaned_train_dict[word] += 1
    except KeyError:
        cleaned_train_dict[word] = 1
# create function to sort the vocab dictionary from most frequent to least
def sortFreqDict(freqdict):
    freqword_list = [(freqdict[key], key) for key in freqdict]
    freqword_list.sort()
    freqword_list.reverse()
    return freqword_list
#sort the vocab dictionary from most frequent to least
sorted_cleaned_train_list = sortFreqDict(cleaned_train_dict)
# convert list to dict
sorted_cleaned_train_dict = dict(sorted_cleaned_train_list)
# total number of words in the combined train set
all_train_words = len(all_cleaned_train_words)
# sorted_cleaned_train_dict={}
# for (num,w) in sorted_cleaned_train_list:
# sorted_cleaned_train_dict[w] = num
# # calculate number of most frequent words that cover at least 95% of the all texts in train
# count = 0
# words_list=[]
# words_num =0
# for num, word in sorted_cleaned_train_dict.items():
# if count/all_train_words <= 0.99:
# count += num
# words_list.append(word)
# words_num += 1
count_1 = 0
words_list_1 =[]
words_num_1 =0
for (num, word) in sorted_cleaned_train_list:
    if count_1/all_train_words <= 0.95:
        count_1 += num
        words_list_1.append(word)
        words_num_1 += 1
# it seems the 4,911 most frequent words give 95% coverage; rounding up suggests a vocab size of about 5,000
print("The first " + str(words_num_1) + " most frequent words cover at least 95% of entire train text")
count = 0
words_list=[]
words_num =0
for (num, word) in sorted_cleaned_train_list:
    if count/all_train_words <= 0.98:
        count += num
        words_list.append(word)
        words_num += 1
# the first ~10,000 most frequent words give 98% coverage of the entire train set
print("The first " + str(words_num) + " most frequent words cover at least 98% of entire train text")
Based on the information above, I will map each word onto a 300-dimensional real-valued vector (the vector length is determined by the pre-trained word2vec model). I will also limit the number of words modeled to the 10,000 most frequent words in the train set and zero out the rest. Finally, since the sequence length (number of words) of each review varies, we constrain each review to 225 words, truncating longer reviews and padding shorter ones with zero values.
# Based on the information above, set the vocabulary size, max sequence length, and OOV token
vocab_size=10000
max_sequence_len = 225
oov_tok = '<OOV>'
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok, lower=True, char_level=False)
tokenizer.fit_on_texts(train["text_clean"].tolist())
train_word_index= tokenizer.word_index
train_word_index
len_train_word_index = len(train_word_index)
len_train_word_index
print("Found %s unique tokens."% len(train_word_index))
train_sequences= tokenizer.texts_to_sequences(train["text_clean"].tolist())
#Need to pad our data as the sequence length (number of words) in each review varies.
train_padded = pad_sequences(train_sequences,
maxlen=max_sequence_len,
padding="post", truncating="post")
train_padded.shape
print(len(train_sequences[0]))
print(len(train_padded[0]))
print(len(train_sequences[10]))
print(len(train_padded[10]))
# Use the tokenizer and pad_sequences to transform valid dataset
valid_sequences = tokenizer.texts_to_sequences(valid["text_clean"].tolist())
valid_padded = pad_sequences(valid_sequences, maxlen=max_sequence_len,
padding="post", truncating="post")
valid_padded.shape
print(len(valid_sequences[0]))
print(len(valid_padded[0]))
print(len(valid_sequences[10]))
print(len(valid_padded[10]))
savetxt("train_padded.csv", train_padded, delimiter=',')
savetxt("valid_padded.csv", valid_padded, delimiter=',')
# load array
train_padded = loadtxt('train_padded.csv', delimiter=',')
# load array
valid_padded = loadtxt('valid_padded.csv', delimiter=',')
# check the shape of the loaded array
train_padded.shape
# check the shape of the loaded array
valid_padded.shape
reverse_word_index = dict([(value, key) for (key, value) in train_word_index.items()])
def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('---')
print(train.text_clean[10])
https://github.com/kk7nc/Text_Classification/blob/master/README.rst#term-frequency
I am going to use pre-trained vectors trained on part of the Google News dataset (about 100 billion words). The model contains 300-dimensional vectors for 3 million words and phrases.
# First load the Google's pre-trained Word2Vec model.
word2vec_path = "GoogleNews-vectors-negative300.bin.gz"
word2vec =gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
embedding_size = word2vec.vector_size
embedding_size
#Embedding weights for the entire vocabulary of the training set
all_train_embedding_weights = np.zeros((len(train_word_index)+1, embedding_size))
# create embedding weights for the entire train vocab
for word, index in train_word_index.items():
    all_train_embedding_weights[index, :] = word2vec[word] if word in word2vec else np.random.rand(embedding_size)
print(all_train_embedding_weights.shape)
# save to csv file
savetxt('all_train_embedding_weights.csv', all_train_embedding_weights, delimiter=',')
# embedding weights for the chosen vocabulary size 10000
train_embedding_weights = np.zeros((vocab_size , embedding_size))
for word, index in train_word_index.items():
    if index <= 10000:
        train_embedding_weights[index-1, :] = word2vec[word] if word in word2vec else np.random.rand(embedding_size)
train_embedding_weights.shape
# save to csv file
savetxt('train_embedding_weights.csv', train_embedding_weights, delimiter=',')
# load back the embedding weights
train_embedding_weights = loadtxt('train_embedding_weights.csv', delimiter=',')
train_embedding_weights.shape
reverse_word_index = dict([(value, key) for (key, value) in train_word_index.items()])
def decode_article(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])
print(decode_article(train_padded[10]))
print('---')
print(train.text_clean[10])
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.twitter.27B.200d.txt'
word2vec_output_file = 'glove.twitter.27B.200d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
glove_path='glove.twitter.27B.200d.txt.word2vec'
glove =gensim.models.KeyedVectors.load_word2vec_format(glove_path, binary=False)
embedding_size_glove = glove.vector_size
embedding_size_glove
#Embedding weights for the entire vocabulary of the training set
all_train_embedding_weights_glove = np.zeros((len(train_word_index)+1, embedding_size_glove))
# create embedding weights for the entire train vocab
for word, index in train_word_index.items():
    all_train_embedding_weights_glove[index, :] = glove[word] if word in glove else np.random.rand(embedding_size_glove)
print(all_train_embedding_weights_glove.shape)
# save to csv file
savetxt('all_train_embedding_weights_glove.csv', all_train_embedding_weights_glove, delimiter=',')
# embedding weights for the chosen vocabulary size 10000
train_embedding_weights_glove = np.zeros((vocab_size , embedding_size_glove))
for word, index in train_word_index.items():
    if index <= 10000:
        train_embedding_weights_glove[index-1, :] = glove[word] if word in glove else np.random.rand(embedding_size_glove)
train_embedding_weights_glove.shape
# save to csv file
savetxt('train_embedding_weights_glove.csv', train_embedding_weights_glove, delimiter=',')
# # load back the embedding weights
# train_embedding_weights_glove = loadtxt('train_embedding_weights_glove.csv', delimiter=',')
# train_embedding_weights_glove.shape
ytrain = train[['review_stars__1.0', 'review_stars__2.0',
'review_stars__3.0', 'review_stars__4.0', 'review_stars__5.0']]
yvalid = valid[['review_stars__1.0', 'review_stars__2.0',
'review_stars__3.0', 'review_stars__4.0', 'review_stars__5.0']]
ytrain.head()
yvalid.head()
labels = train.stars_review.value_counts().index.tolist()
labels
# let's review the parameters we set up
vocab_size=10000
max_sequence_len = 225
oov_tok = '<OOV>'
embedding_size=300
#len_train_word_index = 87529
First, train the following four models on the word2vec-embedded data.
With the word2vec-embedded training data, the best of the four models is the BiLSTM. I then trained the BiLSTM model on the GloVe-embedded data and found that the GloVe model performed slightly better than the word2vec model at predicting review ratings.
Two measures are used to prevent overfitting: early stopping and checkpoints. Early stopping monitors the loss on the validation data, and the model checkpoint saves the best model based on validation accuracy. The early-stopping patience is set to 10 epochs.
parameters for lstm_model :
max_sequence_len =225
vocab_size =10000
embedding_size =300
# Create an instance of Sequential called "lstm_model"
lstm_model = Sequential()
#add an Embedding layer
lstm_model.add(Embedding(input_dim =vocab_size,
output_dim = embedding_size,
weights=[train_embedding_weights],
input_length=max_sequence_len,
trainable=False))
# Add a LSTM layer
lstm_model.add(LSTM(units = 64, return_sequences=True, recurrent_dropout=0.2))
# Add 2nd LSTM layer
lstm_model.add(LSTM(units = 64, recurrent_dropout=0.2))
# Add a dropout layer
lstm_model.add(Dropout(rate=0.2))
lstm_model.add(Dense(32, activation="relu"))
lstm_model.add(Dropout(rate=0.2))
# Add a Dense Layer
lstm_model.add(Dense(units=5, activation = 'softmax'))
# Compile
lstm_model.compile(optimizer = "adam", loss = 'categorical_crossentropy',
metrics = ["accuracy"])
lstm_model.summary()
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('best_lstm_model.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
lstm_model.fit(x = train_padded, y = ytrain, batch_size = 128 , epochs = 20,
validation_split=0.25, verbose=1, callbacks=[es, mc])
# load the best_model
saved_best_lstm_model = load_model('best_lstm_model.h5')
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
#call the function on your model
visualize_model(saved_best_lstm_model)
_, train_acc = saved_best_lstm_model.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_lstm_model.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, valid_acc))
valid_pred_lstm = saved_best_lstm_model.predict(valid_padded)
valid_pred_lstm_result=[]
for i in range(len(valid_pred_lstm)):
    valid_pred_lstm_result.append(np.argmax(valid_pred_lstm[i])+1)
print(classification_report(valid['stars_review'], valid_pred_lstm_result))
parameters for bilstm_model:
max_sequence_len =225
embedding_size =300
vocab_size=10000
The keras.layers.Bidirectional wrapper can also be used with an RNN layer. This propagates the input forward and backwards through the RNN layer and then concatenates the output. This helps the RNN to learn long range dependencies.
# embedding layer output_dim
embedding_size =300
# Create an instance of Sequential called "bilstm_model"
bilstm_model = Sequential()
#add an Embedding layer
bilstm_model.add(Embedding(input_dim =vocab_size,
output_dim = embedding_size,
weights=[train_embedding_weights],
input_length=max_sequence_len,
trainable=False))
# Add a LSTM layer
bilstm_model.add(Bidirectional(LSTM(units = 256, return_sequences=True,
recurrent_dropout=0.5)))
# Add 2nd LSTM layer
bilstm_model.add(Bidirectional(LSTM(units = 64,
dropout=0.2, recurrent_dropout=0.5)))
# Add a Dense Layer
bilstm_model.add(Dense(32, activation="relu"))
# add a dropout layer
bilstm_model.add(Dropout(rate=0.5))
# Add a Dense Layer
bilstm_model.add(Dense(units=5, activation = 'softmax'))
# Compile
bilstm_model.compile(optimizer = "adam", loss = 'categorical_crossentropy',
metrics = ["accuracy"])
bilstm_model.summary()
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
#call the function on the model just built (it has not been trained yet)
visualize_model(bilstm_model)
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('best_bilstm_model.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
bilstm_hist= bilstm_model.fit(x = train_padded, y = ytrain, batch_size = 128 , epochs = 20,
validation_split=0.25, verbose=1, callbacks=[es, mc])
# load the best_model
saved_best_bilstm_model = load_model('best_bilstm_model.h5')
_, train_acc = saved_best_bilstm_model.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_bilstm_model.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, valid_acc))
valid_pred_bilstm = saved_best_bilstm_model.predict(valid_padded)
valid_pred_bilstm_result=[]
for i in range(len(valid_pred_bilstm)):
    valid_pred_bilstm_result.append(np.argmax(valid_pred_bilstm[i])+1)
print(classification_report(valid['stars_review'], valid_pred_bilstm_result))
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
visualize_model(saved_best_bilstm_model)
max_sequence_len
def build_model_cnn(word_index_len, embedding_dim, embedding_matrix, nclasses,
                    MAX_SEQUENCE_LENGTH, num_filters=64, dropout_rate=0.5):
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = Embedding(input_dim=word_index_len,
                                   output_dim=embedding_dim,
                                   weights=[embedding_matrix],
                                   input_length=MAX_SEQUENCE_LENGTH,
                                   trainable=False)(sequence_input)
    convs = []
    filter_sizes = [2, 3, 4, 5, 6]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=num_filters,
                        kernel_size=filter_size,
                        activation='relu',
                        name='Conv_'+'_'+str(filter_size))(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)
    x = Dropout(dropout_rate)(l_merge)
    x = Dense(128, activation='relu')(x)
    x = Dropout(dropout_rate)(x)
    preds = Dense(nclasses, activation="softmax")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model
cnn_model = build_model_cnn(word_index_len = vocab_size,
embedding_dim= embedding_size,
embedding_matrix = train_embedding_weights,
nclasses = len(labels),
MAX_SEQUENCE_LENGTH= max_sequence_len,
num_filters =200,
dropout_rate=0.5)
cnn_model.summary()
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
visualize_model(cnn_model)
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('cnn_model_best.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
cnn_model_hist = cnn_model.fit(x = train_padded, y = ytrain, batch_size = 128 , epochs = 20,
validation_split=0.2, verbose=1, callbacks=[es, mc])
# load the best_model
saved_best_cnn_model = load_model('cnn_model_best.h5')
# get the loss value & the accuracy value on the test data.
_, train_acc = saved_best_cnn_model.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_cnn_model.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Valid: %.3f' % (train_acc, valid_acc))
def plot_graphs(model, metric):
    plt.plot(model.history[metric])
    plt.plot(model.history['val_'+metric], '')
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.legend([metric, 'val_'+metric])
    plt.show()
plot_graphs(cnn_model_hist,"accuracy" )
plot_graphs(cnn_model_hist,"loss" )
valid_pred_cnn = saved_best_cnn_model.predict(valid_padded)
valid_pred_cnn_result=[]
for i in range(len(valid_pred_cnn)):
    valid_pred_cnn_result.append(np.argmax(valid_pred_cnn[i])+1)
print(classification_report(valid['stars_review'], valid_pred_cnn_result))
max_sequence_len =225
embedding_size =300
vocab_size =10000
def build_cnn_lstm_model(word_index_len, embedding_matrix, nclasses, MAX_SEQUENCE_LENGTH,
                         embedding_dim, num_filters, dropout_rate):
    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = Embedding(input_dim=word_index_len,
                                   output_dim=embedding_dim,
                                   weights=[embedding_matrix],
                                   input_length=MAX_SEQUENCE_LENGTH,
                                   trainable=False)(sequence_input)
    convs = []
    filter_sizes = [2, 3, 4, 5]
    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=num_filters,
                        kernel_size=filter_size,
                        padding='same',
                        activation='relu')(embedded_sequences)
        l_pool = MaxPooling1D(pool_size=MAX_SEQUENCE_LENGTH - filter_size + 1)(l_conv)
        convs.append(l_pool)
    l_merge = concatenate(convs, axis=1)
    l_lstm = Bidirectional(LSTM(units=256,
                                recurrent_dropout=0.5))(l_merge)
    x = Dropout(dropout_rate)(l_lstm)
    x = Dense(128, activation='relu', kernel_regularizer=regularizers.l2(1))(x)
    x = Dropout(dropout_rate)(x)
    preds = Dense(nclasses, activation="softmax")(x)
    model = Model(sequence_input, preds)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()
    return model
cnn_lstm_model = build_cnn_lstm_model(word_index_len = vocab_size,
embedding_matrix = train_embedding_weights,
nclasses = len(labels),
MAX_SEQUENCE_LENGTH = max_sequence_len,
embedding_dim= embedding_size,
num_filters =100 ,
dropout_rate=0.5)
cnn_lstm_model.summary()
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
#create your model
#then call the function on your model
visualize_model(cnn_lstm_model)
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=8)
mc = ModelCheckpoint('cnn_lstm_best_model.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
cnn_lstm_model_hist = cnn_lstm_model.fit(x = train_padded, y = ytrain, batch_size = 128 ,
epochs = 15,
validation_split=0.2, verbose=1, callbacks=[es, mc])
# load the best_model
saved_best_cnn_lstm_model = load_model('cnn_lstm_best_model.h5')
_, train_acc = saved_best_cnn_lstm_model.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_cnn_lstm_model.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Valid: %.3f' % (train_acc, valid_acc))
plot_graphs(cnn_lstm_model_hist, 'accuracy')
plot_graphs(cnn_lstm_model_hist, 'loss')
valid_pred_cnn_lstm = saved_best_cnn_lstm_model.predict(valid_padded)
valid_pred_cnn_lstm_result=[]
for i in range(len(valid_pred_cnn_lstm)):
    valid_pred_cnn_lstm_result.append(np.argmax(valid_pred_cnn_lstm[i])+1)
print(classification_report(valid['stars_review'], valid_pred_cnn_lstm_result))
parameters for glove_bilstm_model:
max_sequence_len =225
embedding_size =200
vocab_size=10000
# embedding layer output_dim
embedding_size_glove
# Create an instance of Sequential called "glove_bilstm_model"
glove_bilstm_model = Sequential()
#add an Embedding layer
glove_bilstm_model.add(Embedding(input_dim =vocab_size,
output_dim = embedding_size_glove ,
weights=[train_embedding_weights_glove],
input_length=max_sequence_len,
trainable=False))
# Add a LSTM layer
glove_bilstm_model.add(Bidirectional(LSTM(units = 256, return_sequences=True,
recurrent_dropout=0.5)))
# Add 2nd LSTM layer
glove_bilstm_model.add(Bidirectional(LSTM(units = 64,
dropout=0.2, recurrent_dropout=0.5)))
# add a dropout layer
glove_bilstm_model.add(Dropout(rate=0.5))
# Add a Dense Layer
glove_bilstm_model.add(Dense(32, activation="relu"))
# add a dropout layer
glove_bilstm_model.add(Dropout(rate=0.5))
# Add a Dense Layer
glove_bilstm_model.add(Dense(units=5, activation = 'softmax'))
# Compile
glove_bilstm_model.compile(optimizer = "adam", loss = 'categorical_crossentropy',
metrics = ["accuracy"])
glove_bilstm_model.summary()
keras.utils.vis_utils.pydot = pyd
#Visualize Model
def visualize_model(model):
    return SVG(model_to_dot(model, dpi=45).create(prog='dot', format='svg'))
#call the function on your model
visualize_model(glove_bilstm_model)
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
mc = ModelCheckpoint('best_glove_bilstm_model.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
glove_bilstm_hist= glove_bilstm_model.fit(x = train_padded, y = ytrain, batch_size = 128 , epochs = 20,
validation_split=0.25, verbose=1, callbacks=[es, mc])
The training process stopped unexpectedly during epoch 6 (the Jupyter kernel shut down). I will load the saved best model, evaluate it on the train and validation datasets, and then continue training it.
# load the best_model
saved_best_glove_bilstm_model = load_model('best_glove_bilstm_model.h5')
_, train_acc = saved_best_glove_bilstm_model.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_glove_bilstm_model.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, valid_acc))
valid_pred_glove_bilstm = saved_best_glove_bilstm_model.predict(valid_padded)
labels=[1,2,3,4,5]
valid_pred_glove_bilstm_result=[]
for p in valid_pred_glove_bilstm:
    valid_pred_glove_bilstm_result.append(labels[np.argmax(p)])
print(classification_report(valid['stars_review'], valid_pred_glove_bilstm_result))
Continue training the glove_bilstm model
#patient early stopping
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=5)
mc = ModelCheckpoint('best_glove_bilstm_model.h5', monitor='val_accuracy', mode='max', verbose=1,
save_best_only=True)
glove_bilstm_hist= saved_best_glove_bilstm_model.fit(x = train_padded, y = ytrain, batch_size = 128 , epochs = 10,
validation_split=0.25, verbose=1, callbacks=[es, mc])
# load the best_model
saved_best_glove_bilstm_model_2 = load_model('best_glove_bilstm_model.h5')
_, train_acc = saved_best_glove_bilstm_model_2.evaluate(train_padded, ytrain, verbose=1)
_, valid_acc = saved_best_glove_bilstm_model_2.evaluate(valid_padded, yvalid, verbose=1)
print('Train: %.3f, Test: %.3f' % (train_acc, valid_acc))
valid_pred_glove_bilstm_2 = saved_best_glove_bilstm_model_2.predict(valid_padded)
labels=[1,2,3,4,5]
valid_pred_glove_bilstm_result_2=[]
for p in valid_pred_glove_bilstm_2:
    valid_pred_glove_bilstm_result_2.append(labels[np.argmax(p)])
print(classification_report(valid['stars_review'], valid_pred_glove_bilstm_result_2))
test = pd.read_csv("yelp_test.csv")
y_test = test[['review_stars__1.0', 'review_stars__2.0',
'review_stars__3.0', 'review_stars__4.0', 'review_stars__5.0']]
test = text_cleaning(test)
test.loc[:,["stars_review","tokens","text_clean","text"]].head()
test.to_csv("test_cleaned_032520.csv")
#test = pd.read_csv("test_cleaned_032520.csv")
# Use the tokenizer and pad_sequences to transform the test dataset
test_sequences = tokenizer.texts_to_sequences(test["text_clean"].tolist())
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_len,
padding="post", truncating="post")
savetxt("test_padded.csv", test_padded, delimiter=',')
test_pred_bilstm = saved_best_bilstm_model.predict(test_padded, batch_size=2048,
verbose=1)
labels = [1,2,3,4,5]
test_pred_bilstm_result=[]
for p in test_pred_bilstm:
    test_pred_bilstm_result.append(labels[np.argmax(p)])
print(classification_report(test['stars_review'], test_pred_bilstm_result))
test.loc[:,["stars_review","tokens","text_clean","text"]].head()
# Use the tokenizer and pad_sequences to transform the test dataset
test_sequences = tokenizer.texts_to_sequences(test["text_clean"].tolist())
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_len,
padding="post", truncating="post")
test_pred_glove_bilstm = saved_best_glove_bilstm_model_2.predict(test_padded, batch_size=2048,
verbose=1)
labels = [1,2,3,4,5]
test_pred_glove_bilstm_result=[]
for p in test_pred_glove_bilstm:
    test_pred_glove_bilstm_result.append(labels[np.argmax(p)])
print(classification_report(test['stars_review'], test_pred_glove_bilstm_result))
The BiLSTM model with GloVe embeddings is slightly better than the word2vec model: the overall accuracy is 68% for the GloVe model versus 67% for the word2vec model.
Looking at the recall score for each class (star rating), the model is very good at distinguishing 1-star and 5-star reviews, moderate at 4-star, and somewhat poor on 2- and 3-star reviews. This makes sense, as 1-star and 5-star reviews tend to express stronger sentiment (strongly negative or strongly positive), so the model can pick up the differences and predict them more accurately than the middle ratings (2, 3, and 4 stars).
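To see exactly which neighbouring star levels get confused with each other, a row-normalized confusion matrix over the GloVe BiLSTM test predictions makes the pattern explicit (a minimal sketch using sklearn's confusion_matrix and the prediction list defined above):
# sketch: row-normalized confusion matrix for the GloVe BiLSTM test predictions
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(test['stars_review'], test_pred_glove_bilstm_result, labels=[1, 2, 3, 4, 5])
cm_norm = cm / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="Blues",
            xticklabels=[1, 2, 3, 4, 5], yticklabels=[1, 2, 3, 4, 5])
plt.xlabel("Predicted star rating")
plt.ylabel("True star rating")
plt.title("GloVe BiLSTM confusion matrix (row-normalized)")
plt.show()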