Topic modeling in Gensim
"Research the source code of Topic Modeling in gensim"
A Simple Code Example
import numpy as np
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# my_docs is assumed to be a list of raw document strings (e.g. news articles).
lemmatizer = WordNetLemmatizer()
tokens = []
for doc in my_docs:
    words = regexp_tokenize(doc.lower(), r'[A-Za-z]+')                 # keep alphabetic tokens only
    words = [w for w in words if w not in stopwords.words('english')]  # drop stopwords
    words = [lemmatizer.lemmatize(w) for w in words]                   # lemmatize
    tokens.append(words)
# Gensim tf-idf
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

my_dict = Dictionary(tokens)
my_dict.filter_extremes(no_below=5, no_above=0.90)  # drop very rare and very common tokens
dtm = [my_dict.doc2bow(doc) for doc in tokens]
tfidf = TfidfModel(dtm)
for doc in tfidf[dtm]:
    print([[my_dict[i], np.around(freq, decimals=2)] for i, freq in doc])
'''
output:
[['advertising', 0.16], ['bbc', 0.04], ['bill', 0.05], ['book', 0.05], ....]
[['book', 0.19], ['company', 0.05], ['firm', 0.11], ['month', 0.04], ['telecom', 0.26], ....]
[['bbc', 0.09], ['moment', 0.11], ['month', 0.05], ['bos', 0.12], ['cross', 0.15], ....]
[['home', 0.06], ['play', 0.07], ['face', 0.31], ['game', 0.11], ['league', 0.1], ....]
...
'''
# Gensim: LSI
from gensim.models import LsiModel, CoherenceModel
lsi_model = LsiModel(corpus=dtm, id2word=my_dict, num_topics=5)
# lsi_model.print_topics(-1)
lsi_model.print_topics(num_topics=5, num_words=5)
# Determining the optimum number of topics using coherence values
coherence_values = []
lsi_model_list = []
min_topics, max_topics, step = 1, 5, 1
for i in range(min_topics, max_topics, step):
    lsi_model = LsiModel(dtm, id2word=my_dict, num_topics=i)
    lsi_model_list.append(lsi_model)
    coherencemodel = CoherenceModel(model=lsi_model, texts=tokens,
                                    dictionary=my_dict, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())
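# Follow-up sketch: pick the model with the highest c_v coherence.
best_index = coherence_values.index(max(coherence_values))
best_lsi_model = lsi_model_list[best_index]
print('best num_topics = %d' % (min_topics + best_index * step))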
# Gensim: LDA
from gensim.models import LdaModel, LdaMulticore
lda_model = LdaModel(dtm, num_topics=3, id2word=my_dict, passes=10)
# lda_model.print_topics(-1)
lda_model.print_topics(num_topics=3, num_words=3)
lda_model_mc = LdaMulticore(dtm, num_topics=3, id2word=my_dict, passes=10, workers=4)
lda_model_mc.print_topics(-1)
lda_model.save('my_lda_model.lda') # Save LDA model
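# The saved model can be loaded back later:
# lda_model = LdaModel.load('my_lda_model.lda')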
# Evaluating LDA models: Topic coherence
goodLdaModel = LdaModel(corpus=dtm, id2word=my_dict, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=dtm, id2word=my_dict, iterations=1, num_topics=2)
goodcm = CoherenceModel(model=goodLdaModel, corpus=dtm, dictionary=my_dict, coherence='u_mass')
badcm = CoherenceModel(model=badLdaModel, corpus=dtm, dictionary=my_dict, coherence='u_mass')
goodcm.get_coherence()
badcm.get_coherence()
# Note: c_v coherence needs the tokenized texts, not the BoW corpus.
goodcm = CoherenceModel(model=goodLdaModel, texts=tokens, dictionary=my_dict, coherence='c_v')
badcm = CoherenceModel(model=badLdaModel, texts=tokens, dictionary=my_dict, coherence='c_v')
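# As with u_mass above, compare the two c_v scores (higher is better):
goodcm.get_coherence()
badcm.get_coherence()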
# Display LDA outputs (renders only in HTML environments such as Jupyter notebooks)
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, dtm, my_dict)
pyLDAvis.display(vis)
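Note: newer pyLDAvis releases (3.x) renamed the helper module, so the import becomes import pyLDAvis.gensim_models and prepare() is called on that module instead.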
1. Dictionary (source)
>>> from gensim.corpora import Dictionary
>>>
>>> texts = [['human', 'interface', 'computer']]
>>> dct = Dictionary(texts) # initialize a Dictionary
>>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extends the vocabulary)
>>> dct.doc2bow(["dog", "computer", "non_existent_word"])
[(0, 1), (6, 1)]
The most important method is doc2bow(['..', '..', ...]):
def doc2bow(self, document, allow_update=False, return_missing=False):
    """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples."""
    ...
    # Construct (word, frequency) mapping.
    counter = defaultdict(int)
    for w in document:
        counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
    token2id = self.token2id
    result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
    ...
    result = sorted(iteritems(result))
    ...
    return result
doc2bow() is very similar to nltk.FreqDist(); it returns [(word_id, freq), (word_id, freq), ...].
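For intuition, here is a toy re-implementation with collections.Counter (ignoring allow_update and return_missing), reusing the dct built above:
from collections import Counter

def toy_doc2bow(document, token2id):
    # Count tokens and keep only those already in the dictionary, sorted by id.
    counter = Counter(document)
    return sorted((token2id[w], freq) for w, freq in counter.items() if w in token2id)

print(toy_doc2bow(["dog", "computer", "non_existent_word"], dct.token2id))  # [(0, 1), (6, 1)]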
2. TfidfModel (source)
from gensim.models import TfidfModel
dtm = [my_dict.doc2bow(doc) for doc in tokens]
tfidf_vectorizer = TfidfModel(dtm)
tfidf = tfidf_vectorizer[dtm]
# idf (computed inside TfidfModel.initialize()):
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
print(self.wglobal)  # <function gensim.models.tfidfmodel.df2idf(docfreq, totaldocs, log_base=2.0, add=0.0)>
def precompute_idfs(wglobal, dfs, total_docs):
    return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}

def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)
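# Quick check with made-up numbers, assuming the df2idf above is in scope:
# a term appearing in 2 of 8 documents gets idf = log2(8 / 2) = 2.0
# under the defaults (log_base=2.0, add=0.0).
print(df2idf(docfreq=2, totaldocs=8))  # 2.0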
# tf: dtm
# tf-idf: worked through for dtm[0]
import math
from gensim.models import TfidfModel

termid_array, tf_array = [], []
for termid, tf in dtm[0]:
    termid_array.append(termid)
    tf_array.append(tf)
model = TfidfModel(dtm)
print(model.idfs)  # the idf of each token in the dictionary
print(model.eps)   # 1e-12
# If a word appears in almost every document, its idf is very close to 0,
# so it is dropped from the tf-idf vector.
vector = [(termid, tf * model.idfs.get(termid, 0.0))
          for termid, tf in zip(termid_array, tf_array)
          if abs(model.idfs.get(termid, 0.0)) > model.eps]
# Next, L2-normalize the vector (the default normalization).
length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vector))
tfidf = [(termid, val / length) for termid, val in vector]
# This is what happens behind the code "model[dtm[0]]".
# l1    : length = float(sum(abs(val) for _, val in vector))
# l2    : length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vector))
# unique: length = 1.0 * len(vector)
After the tf-idf product is computed for each term, cosine (L2) normalization is applied. All of the above is just the default behaviour; TfidfModel lets us pick the weighting and normalization scheme, as the smartirs parameter below shows.
# These can be configured when creating a TfidfModel instance:
class TfidfModel(interfaces.TransformationABC):
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25):
        ...
"""
smartirs : str, optional
SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
The mnemonic for representing a combination of weights takes the form XYZ,
for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.
Term frequency weighing:
* `b` - binary,
* `t` or `n` - raw,
* `a` - augmented,
* `l` - logarithm,
* `d` - double logarithm,
* `L` - log average.
Document frequency weighting:
* `x` or `n` - none,
* `f` - idf,
* `t` - zero-corrected idf,
* `p` - probabilistic idf.
Document normalization:
* `x` or `n` - none,
* `c` - cosine,
* `u` - pivoted unique,
* `b` - pivoted character length.
Default is 'nfc'.
For more information visit `SMART Information Retrieval System
<https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
"""
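The default corresponds to smartirs='nfc' (raw tf, idf, cosine normalization); other schemes can be requested with the same mnemonic. A minimal sketch, reusing the dtm built earlier:
from gensim.models import TfidfModel

tfidf_nfc = TfidfModel(dtm, smartirs='nfc')  # the default scheme, spelled out
tfidf_ltc = TfidfModel(dtm, smartirs='ltc')  # log tf, zero-corrected idf, cosine norm
print(tfidf_ltc[dtm[0]])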
3. LsiModel (source)
Model for Latent Semantic Indexing https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
# Gensim: LSI
from gensim.models import LsiModel, CoherenceModel
lsi_model = LsiModel(corpus=dtm, id2word=my_dict, num_topics=5)
# lsi_model.print_topics(-1)
lsi_model.print_topics(num_topics=5, num_words=5)
# Projection does the heavy lifting (signature from gensim.models.lsimodel):
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
             extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
    ...

## In LsiModel.__init__():
self.projection = Projection(
    self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
)

## In LsiModel.add_documents():
update = Projection(
    self.num_terms, self.num_topics, job, extra_dims=self.extra_samples,
    power_iters=self.power_iters, dtype=self.dtype
)

# Projection itself delegates to a randomized truncated SVD:
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=np.float64, eps=1e-6):
    pass

# Inside Projection.__init__ (when docs is given and use_svdlibc is False):
u, s = stochastic_svd(
    docs, k, chunksize=sys.maxsize,
    num_terms=m, power_iters=self.power_iters,
    extra_dims=self.extra_dims, dtype=dtype)
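In other words, the LSI projection is a truncated SVD of the term-document matrix. For intuition only, the same factors can be computed densely on a small corpus, reusing dtm and my_dict from above:
import numpy as np
from gensim.matutils import corpus2dense

A = corpus2dense(dtm, num_terms=len(my_dict))     # terms x documents matrix
U, S, VT = np.linalg.svd(A, full_matrices=False)  # exact SVD
k = 5
# U[:, :k] plays the role of projection.u and S[:k] of projection.s
# (up to sign and the randomized approximation).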
# print_topics() is a thin wrapper around show_topics():
self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
    """Get the most significant topics.

    log : bool, optional
        If True - log topics with logger.
    formatted : bool, optional
        If True - each topic represented as string, otherwise - in BoW format.

    Returns
    -------
    list of (int, str)
        If `formatted=True`, return sequence with (topic_id, string representation of topics) **OR**
    list of (int, list of (str, float))
        Otherwise, return sequence with (topic_id, [(word, value), ... ]).
    """
    shown = []
    if num_topics < 0:
        num_topics = self.num_topics
    for i in range(min(num_topics, self.num_topics)):
        if i < len(self.projection.s):
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)
            shown.append((i, topic))
            if log:
                logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
    return shown
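For example, with formatted=False the topics come back as (word, weight) pairs instead of strings (reusing lsi_model from above):
for topic_id, terms in lsi_model.show_topics(num_topics=3, num_words=5, formatted=False):
    print(topic_id, terms)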
4. LdaModel
5. LdaMulticore
6. CoherenceModel
Example of sklearn topic modeling
https://cloud.tencent.com/developer/article/1530432
docs = ["In the middle of the night",
"When our hopes and fears collide",
"In the midst of all goodbyes",
"Where all human beings lie",
"Against another lie"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names()
print(terms)
n_pick_topics = 3 # 设定主题数为3
lsa = TruncatedSVD(n_pick_topics)
X2 = lsa.fit_transform(X)
X2
n_pick_docs= 2
topic_docs_id = [X2[:,t].argsort()[:-(n_pick_docs+1):-1] for t in range(n_pick_topics)]
topic_docs_id
n_pick_keywords = 4
topic_keywords_id = [lsa.components_[t].argsort()[:-(n_pick_keywords+1):-1] for t in range(n_pick_topics)]
topic_keywords_id
for t in range(n_pick_topics):
print("topic %d:" % t)
print(" keywords: %s" % ", ".join(terms[topic_keywords_id[t][j]] for j in range(n_pick_keywords)))
for i in range(n_pick_docs):
print(" doc %d" % i)
print("\t"+docs[topic_docs_id[t][i]])