Performing Twitter Sentiment Analysis — For Beginners

Learn how to check whether tweets are racist or not

Lamiae Hana
4 min readAug 1, 2020

Import Basic Libraries

# Core libraries used throughout the analysis:
# scikit-learn for modeling, NumPy/pandas for data handling.
import sklearn
import numpy as np
import pandas as pd

Connection with google drive

# Mount Google Drive so the notebook can read the CSV files stored there.
# (Colab-only: prompts for an auth code on first run.)
from google.colab import drive

drive.mount('/content/gdrive')

Import Data

# training data
train = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/data/train.csv")
# test data
test = pd.read_csv("/content/gdrive/My Drive/Colab Notebooks/data/test.csv")

Data Exploration (Exploratory Data Analysis)

# Peek at the first / last rows to sanity-check the loaded data.
train.head()
test.tail()
# non-racist/sexist related tweets (label == 0); a boolean Series' .sum()
# counts the True entries — idiomatic pandas instead of builtin sum().
(train["label"] == 0).sum()

29720

# racist/sexist related tweets (label == 1); boolean .sum() counts True entries
(train["label"] == 1).sum()

2242

# check if there are any missing values, per column
train.isnull().sum()
# alternative, single-boolean answer:
# train.isnull().values.any()

Data cleaning

#install tweet-preprocessor to clean tweets!pip install tweet-preprocessor
# remove special characters using the regular expression library
import re

# Punctuation to strip entirely. Raw strings avoid Python's invalid-escape
# DeprecationWarning; the patterns the regex engine sees are unchanged.
REPLACE_NO_SPACE = re.compile(r"(\.)|(\;)|(\:)|(\!)|(\')|(\?)|(\,)|(\")|(\|)|(\()|(\))|(\[)|(\])|(\%)|(\$)|(\>)|(\<)|(\{)|(\})")
# Markup and separators to replace with a single space.
REPLACE_WITH_SPACE = re.compile(r"(<br\s/><br\s/?)|(-)|(/)|(:).")

import preprocessor as p


def clean_tweets(df):
    """Clean an iterable of raw tweet strings.

    Combines tweet-preprocessor's `p.clean` (removes URLs, mentions,
    hashtags, emojis, ...) with the regular expressions above, and
    lower-cases every tweet.

    Returns a list of cleaned tweet strings, in the original order.
    """
    tempArr = []
    for line in df:
        # send to tweet_preprocessor
        tmpL = p.clean(line)
        # remove punctuation; convert all tweets to lower case
        tmpL = REPLACE_NO_SPACE.sub("", tmpL.lower())
        tmpL = REPLACE_WITH_SPACE.sub(" ", tmpL)
        tempArr.append(tmpL)
    return tempArr


# clean training data
train_tweet = clean_tweets(train["tweet"])
train_tweet = pd.DataFrame(train_tweet)
# append cleaned tweets to the training data
train["clean_tweet"] = train_tweet
# compare the cleaned and uncleaned tweets
train.head(10)
# clean the test data and append the cleaned tweets to the test data
test_tweet = clean_tweets(test["tweet"])
test_tweet = pd.DataFrame(test_tweet)
# append cleaned tweets to the test data
test["clean_tweet"] = test_tweet
# compare the cleaned and uncleaned tweets
test.tail()

Test and Train split

from sklearn.model_selection import train_test_split

# extract the labels from the train data
y = train.label.values

# use 70% for training and 30% for testing; stratify so both splits keep
# the same label ratio (the classes are heavily imbalanced, ~29720 vs 2242)
x_train, x_test, y_train, y_test = train_test_split(
    train.clean_tweet.values, y,
    stratify=y,
    random_state=1,
    test_size=0.3,
    shuffle=True,
)

Vectorize tweets using CountVectorizer

CountVectorizer Example

from sklearn.feature_extraction.text import CountVectorizer

documents = ["This is Import Data's Youtube channel",
             "Data science is my passion and it is fun!",
             "Please subscribe to my channel"]

# initializing the countvectorizer
vectorizer = CountVectorizer()
# tokenize and turn the documents into a document-term matrix
document_term_matrix = vectorizer.fit_transform(documents)
# check the result. NOTE: get_feature_names() was removed in
# scikit-learn 1.2; get_feature_names_out() is the current API.
pd.DataFrame(document_term_matrix.toarray(),
             columns=vectorizer.get_feature_names_out())
from sklearn.feature_extraction.text import CountVectorizer

# vectorize tweets for model building; binary=True records presence/absence
# of a token rather than its count, and English stop words are dropped
vectorizer = CountVectorizer(binary=True, stop_words='english')
# learn a vocabulary dictionary of all tokens in the raw documents
vectorizer.fit(list(x_train) + list(x_test))
# transform documents to document-term matrices
x_train_vec = vectorizer.transform(x_train)
x_test_vec = vectorizer.transform(x_test)

Model building

Apply Support Vector Classifier (SVC)

# Import SVC directly: the original `from sklearn import svm` followed by
# `svm = svm.SVC(...)` shadowed the module with the estimator instance.
from sklearn.svm import SVC

# classify using a support vector classifier with a linear kernel;
# probability=True enables predict_proba (at extra training cost)
svm = SVC(kernel='linear', probability=True)
# fit the SVC model on the training data, then get class probabilities
# for the held-out split
prob = svm.fit(x_train_vec, y_train).predict_proba(x_test_vec)
# perform classification and prediction on samples in x_test
y_pred_svm = svm.predict(x_test_vec)

Accuracy score for SVC

from sklearn.metrics import accuracy_score

# fraction of correctly classified held-out tweets, as a percentage
print("Accuracy score for SVC is: ", accuracy_score(y_test, y_pred_svm) * 100, '%')

References:

--

--

Lamiae Hana

I write about AI, machine learning, and data science. Come join the discussion.