Comparing NLTK's and TextBlob's sentiment analysis
"This is a comparison report between nltk and textblob in sentiment analysis."
Overview
lexicon
1. NLTK's vader_lexicon.txt : NLTK's default lexicon file (online source)
# Part of vader_lexicon.txt
....
# emoticon
(-* 1.3 1.26886 [4, 1, 2, 0, 2, -1, 1, 2, 1, 1]
(-: 1.6 0.8 [2, 2, 1, 3, 1, 1, 1, 3, 1, 1]
(-:0 2.8 0.87178 [3, 2, 3, 4, 3, 2, 3, 1, 4, 3]
(-:< -0.4 2.15407 [-3, 3, -1, -1, 2, -1, -2, 3, -3, -1]
(-:o 1.5 0.67082 [3, 1, 1, 2, 2, 2, 1, 1, 1, 1]
(-:O 1.5 0.67082 [3, 1, 1, 2, 2, 2, 1, 1, 1, 1]
(-:{ -0.1 1.57797 [-2, -3, 1, -2, 1, 1, 0, 0, 2, 1]
(-:|>* 1.9 0.83066 [3, 2, 2, 1, 0, 2, 3, 2, 2, 2]
(-; 1.3 1.18743 [3, 2, 3, 0, 1, -1, 1, 2, 1, 1]
(-;| 2.1 1.13578 [3, 2, 2, 4, 1, 1, 1, 4, 2, 1]
(8 2.6 1.0198 [4, 2, 1, 3, 3, 3, 3, 1, 2, 4]
....
# word
cheer 2.3 0.64031 [2, 1, 2, 2, 2, 3, 3, 3, 2, 3]
cheered 2.3 0.78102 [2, 3, 3, 4, 2, 1, 2, 2, 2, 2]
cheerer 1.7 0.45826 [1, 2, 2, 2, 1, 1, 2, 2, 2, 2]
cheerers 1.8 0.87178 [2, 2, 3, 2, 1, 2, 0, 1, 3, 2]
cheerful 2.5 0.67082 [3, 2, 3, 2, 2, 2, 4, 2, 3, 2]
cheerfuller 1.9 0.83066 [3, 3, 2, 3, 2, 1, 1, 2, 1, 1]
cheerfullest 3.2 0.87178 [4, 4, 4, 4, 3, 2, 2, 3, 2, 4]
cheerfully 2.1 0.83066 [3, 2, 2, 2, 1, 3, 1, 3, 1, 3]
cheerfulness 2.1 0.9434 [3, 2, 1, 2, 3, 4, 1, 2, 1, 2]
cheerier 2.6 0.4899 [2, 2, 3, 3, 2, 3, 3, 2, 3, 3]
cheeriest 2.2 0.6 [3, 2, 3, 1, 2, 2, 3, 2, 2, 2]
....
# load lexicon in Python
from nltk.sentiment.vader import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()
type(analyzer.lexicon) # dict
len(analyzer.lexicon) # 7502
analyzer.lexicon
'''
output:
{
...
'(-*': 1.3,
'(-:': 1.6,
'(-:0': 2.8,
'(-:<': -0.4,
'(-:o': 1.5,
'(-:O': 1.5,
'(-:{': -0.1,
'(-:|>*': 1.9,
'(-;': 1.3,
'(-;|': 2.1,
'(8': 2.6,
...
}
'''
2. TextBlob's en-sentiment.xml : TextBlob's default lexicon file (online source)
<sentiment language="en" version="1.3" author="Tom De Smedt, Walter Daelemans" license="PDDL">
<word form="13th" wordnet_id="a-02203763" pos="JJ" sense="coming next after the twelfth in position" polarity="0.0" subjectivity="0.0" intensity="1.0" confidence="0.9" />
<word form="20th" cornetto_synset_id="n_a-531612" wordnet_id="a-02204716" pos="JJ" sense="coming next after the nineteenth in position" polarity="0.0" subjectivity="0.0" intensity="1.0" confidence="0.9" />
<word form="21st" wordnet_id="a-02204823" pos="JJ" sense="coming next after the twentieth in position" polarity="0.0" subjectivity="0.0" intensity="1.0" confidence="0.9" />
<word form="2nd" wordnet_id="a-02202146" pos="JJ" sense="coming next after the first in position in space or time or degree or magnitude" polarity="0.0" subjectivity="0.0" intensity="1.0" confidence="0.9" />
<word form="3rd" cornetto_synset_id="n_a-530634" wordnet_id="a-02202307" pos="JJ" sense="coming next after the second and just before the fourth in position" polarity="0.0" subjectivity="0.0" intensity="1.0" confidence="0.9" />
<word form="abhorrent" wordnet_id="a-1625063" pos="JJ" sense="offensive to the mind" polarity="-0.7" subjectivity="0.8" intensity="1.0" reliability="0.9" />
<word form="able" cornetto_synset_id="n_a-534450" wordnet_id="a-01017439" pos="JJ" sense="having a strong healthy body" polarity="0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="able" wordnet_id="a-00001740" pos="JJ" sense="(usually followed by 'to') having the necessary means or skill or know-how or authority to do something" polarity="0.5" subjectivity="0.5" intensity="1.0" confidence="0.9" />
....
<word form="implicit in" cornetto_synset_id="n_a-520863" wordnet_id="a-00941940" pos="JJ" sense="in the nature of something though not readily apparent" polarity="0.0" subjectivity="0.1" intensity="1.0" confidence="0.8" />
<word form="important" cornetto_synset_id="d_a-9178" wordnet_id="a-01830403" pos="JJ" sense="having authority or ascendancy or influence" polarity="0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="important" cornetto_synset_id="n_a-527688" wordnet_id="a-02161432" pos="JJ" sense="important in effect or meaning" polarity="0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="important" wordnet_id="a-00655779" pos="JJ" sense="of extreme importance" polarity="0.0" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="important" wordnet_id="a-01275562" pos="JJ" sense="of great significance or value" polarity="0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="important" wordnet_id="a-01539887" pos="JJ" sense="having or suggesting a consciousness of high position" polarity="0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impossible" cornetto_synset_id="n_a-511166" wordnet_id="a-02418692" pos="JJ" sense="totally unlikely" polarity="-0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impossible" cornetto_synset_id="n_a-521243" wordnet_id="a-02436025" pos="JJ" sense="used of persons or their behavior" polarity="-1.0" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impossible" wordnet_id="a-01823092" pos="JJ" sense="not capable of occurring or being accomplished or dealt with" polarity="-0.5" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impressed" cornetto_synset_id="n_a-509653" wordnet_id="a-00071142" pos="JJ" sense="deeply or markedly affected or influenced" polarity="1.0" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impressive" cornetto_synset_id="n_a-524894" wordnet_id="a-00835292" pos="JJ" sense="producing a strong effect" polarity="1.0" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="impressive" wordnet_id="a-01282014" pos="JJ" sense="making a strong or vivid impression" polarity="1.0" subjectivity="1.0" intensity="1.0" confidence="0.9" />
<word form="in good taste" cornetto_synset_id="n_a-528044" wordnet_id="a-00689215" pos="JJ" sense="satisfying generally accepted social or esthetic standards" polarity="0.9" subjectivity="1.0" intensity="1.0" confidence="0.8" />
<word form="in stock" cornetto_synset_id="n_a-533683" wordnet_id="a-00184543" pos="JJ" sense="available for use or sale" polarity="0.1" subjectivity="0.4" intensity="1.0" confidence="0.8" />
....
</sentiment>
# load lexicon in Python
from textblob.en import sentiment as pattern_sentiment
type(pattern_sentiment) # textblob.en.Sentiment
len(pattern_sentiment) # 2860
pattern_sentiment['great'] # {'JJ': [0.8, 0.75, 1.0], None: [0.8, 0.75, 1.0]}
pattern_sentiment('great') # (0.8, 0.75)
for k,v in pattern_sentiment.items():
print(k,v)
'''
output:
13th {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
20th {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
21st {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
2nd {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
3rd {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
abhorrent {'JJ': [-0.7, 0.8, 1.0], None: [-0.7, 0.8, 1.0]}
able {'JJ': [0.5, 0.625, 1.0], None: [0.5, 0.625, 1.0]}
above {'JJ': [0.0, 0.1, 1.0], None: [0.0, 0.1, 1.0]}
....
13th {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
20th {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
21st {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
2nd {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
3rd {'JJ': [0.0, 0.0, 1.0], None: [0.0, 0.0, 1.0]}
abhorrent {'JJ': [-0.7, 0.8, 1.0], None: [-0.7, 0.8, 1.0]}
able {'JJ': [0.5, 0.625, 1.0], None: [0.5, 0.625, 1.0]}
above {'JJ': [0.0, 0.1, 1.0], None: [0.0, 0.1, 1.0]}
....
'''
Summary
Main Algorithm
1. NLTK's methods
def score_valence(self, sentiments, text):
    """Aggregate per-token valences into the final sentiment dict.

    :param sentiments: list of per-token valence scores for the text
    :param text: the raw input text (used for punctuation emphasis)
    :return: dict with keys "neg", "neu", "pos" (each rounded to 3
        decimals) and "compound" (rounded to 4 decimals)
    """
    if not sentiments:
        # No sentiment-laden tokens at all: every score is zero.
        return {"neg": 0.0, "neu": 0.0, "pos": 0.0, "compound": 0.0}

    # --- compound: normalized sum of all valences, boosted by punctuation ---
    total_valence = float(sum(sentiments))
    punct_boost = self._punctuation_emphasis(total_valence, text)
    if total_valence > 0:
        total_valence += punct_boost
    elif total_valence < 0:
        total_valence -= punct_boost
    compound = self.constants.normalize(total_valence)

    # --- pos/neg/neu: proportions of positive, negative and neutral mass ---
    pos_sum, neg_sum, neu_count = self._sift_sentiment_scores(sentiments)
    # Apply the punctuation boost to whichever polarity dominates.
    if pos_sum > math.fabs(neg_sum):
        pos_sum += punct_boost
    elif pos_sum < math.fabs(neg_sum):
        neg_sum -= punct_boost
    denom = pos_sum + math.fabs(neg_sum) + neu_count

    return {
        "neg": round(math.fabs(neg_sum / denom), 3),
        "neu": round(math.fabs(neu_count / denom), 3),
        "pos": round(math.fabs(pos_sum / denom), 3),
        "compound": round(compound, 4),
    }
Parameter sentiments is a list of sentiment values corresponding to each token in the text.
In brief, NLTK first sums up all individual words' sentiment values (positive real numbers and negative real numbers), then applies a statistical emphasis based on the text's punctuation, and finally normalizes the result to obtain the "compound" value.
NLTK then uses the _sift_sentiment_scores method to obtain the three values pos, neg, and neu. We will look at this method later. Pos, neg, and neu also undergo the punctuation-based statistical emphasis.
def sentiment_valence(self, valence, sentitext, item, i, sentiments):
    """Score a single token and append its context-adjusted valence.

    Looks *item* up in the lexicon; on a hit, the raw lexicon score is
    adjusted by a cascade of context rules (ALL-CAPS emphasis, preceding
    booster words, "never", idioms, "least").  The rules mutate ``valence``
    in sequence, so their order is significant.

    :param valence: starting valence; replaced by the lexicon value on a
        hit, appended unchanged otherwise
    :param sentitext: SentiText wrapper around the whole input text
    :param item: the current word or emoticon
    :param i: index of *item* within ``sentitext.words_and_emoticons``
    :param sentiments: running list of per-token valences
    :return: *sentiments*, with one more value appended
    """
    is_cap_diff = sentitext.is_cap_diff
    words_and_emoticons = sentitext.words_and_emoticons
    item_lowercase = item.lower()
    if item_lowercase in self.lexicon:
        # get the base sentiment valence from the lexicon
        valence = self.lexicon[item_lowercase]
        # Rule 1 : check if sentiment laden word is in ALL CAPS (while others aren't)
        if item.isupper() and is_cap_diff:
            if valence > 0:
                valence += self.constants.C_INCR
            else:
                valence -= self.constants.C_INCR
        # Context Rules : inspect up to the three preceding tokens
        for start_i in range(0, 3):
            if (
                i > start_i
                and words_and_emoticons[i - (start_i + 1)].lower()
                not in self.lexicon
            ):
                # dampen the scalar modifier of preceding words and emoticons
                # (excluding the ones that immediately precede the item) based
                # on their distance from the current item.
                # Rule 2 : preceding booster word
                s = self.constants.scalar_inc_dec(
                    words_and_emoticons[i - (start_i + 1)], valence, is_cap_diff
                )
                if start_i == 1 and s != 0:
                    # booster two tokens back: damp its effect by 5%
                    s = s * 0.95
                if start_i == 2 and s != 0:
                    # booster three tokens back: damp its effect by 10%
                    s = s * 0.9
                valence = valence + s
                # Rule 3 : preceding "never" word
                valence = self._never_check(
                    valence, words_and_emoticons, start_i, i
                )
                # Rule 4 : idioms check
                if start_i == 2:
                    valence = self._idioms_check(valence, words_and_emoticons, i)
        # Rule 5 : check for negation case using "least"
        valence = self._least_check(valence, words_and_emoticons, i)
    sentiments.append(valence)
    return sentiments
sentiment_valence is a method that calculates and adjusts the sentiment value of a single word, based on multiple rules. First, the word has to be in the lexicon database. After obtaining the word's base sentiment value, the method adjusts it according to ALL CAPS emphasis, preceding booster words, "never", idioms, and "least".
def polarity_scores(self, text):
    """Return a dict of sentiment scores for *text*.

    :param text: the input text to score
    :return: dict with keys "neg", "neu", "pos" and "compound"; positive
        values indicate positive valence, negative values negative valence
    """
    sentitext = SentiText(text, self.constants.PUNC_LIST,
                          self.constants.REGEX_REMOVE_PUNCTUATION)
    sentiments = []
    words_and_emoticons = sentitext.words_and_emoticons
    # Use enumerate rather than words_and_emoticons.index(item): .index()
    # returns the FIRST occurrence, so a repeated token would always be
    # scored with its first position's context rules (wrong boosters /
    # negations applied). This matches the fix adopted upstream in NLTK.
    for i, item in enumerate(words_and_emoticons):
        valence = 0
        # "kind of" and booster words contribute no valence of their own;
        # they only modify neighbouring sentiment-laden words.
        if (
            i < len(words_and_emoticons) - 1
            and item.lower() == "kind"
            and words_and_emoticons[i + 1].lower() == "of"
        ) or item.lower() in self.constants.BOOSTER_DICT:
            sentiments.append(valence)
            continue
        sentiments = self.sentiment_valence(valence, sentitext, item, i, sentiments)
    # "but" shifts emphasis to the clause after it.
    sentiments = self._but_check(words_and_emoticons, sentiments)
    return self.score_valence(sentiments, text)
After obtaining the list of sentiment values, "but check" rule is also performed.
2. TextBlob's methods