Topic modeling in Gensim
"Research the source code of Topic Modeling in gensim"
A Simple Code Example
import numpy as np
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# my_docs is assumed to be a list of raw document strings (e.g. news articles).
lemmatizer = WordNetLemmatizer()
tokens = []
for doc in my_docs:
    words = regexp_tokenize(doc.lower(), r'[A-Za-z]+')                 # keep alphabetic tokens only
    words = [w for w in words if w not in stopwords.words('english')]  # drop stopwords
    words = [lemmatizer.lemmatize(w) for w in words]                   # lemmatize
    tokens.append(words)
# Gensim tf-idf
from gensim.corpora import Dictionary
from gensim.models import TfidfModel

my_dict = Dictionary(tokens)
my_dict.filter_extremes(no_below=5, no_above=0.90)  # drop very rare and very common tokens
dtm = [my_dict.doc2bow(doc) for doc in tokens]
tfidf = TfidfModel(dtm)
for doc in tfidf[dtm]:
    print([[my_dict[i], np.around(freq, decimals=2)] for i, freq in doc])
'''
output:
[['advertising', 0.16], ['bbc', 0.04], ['bill', 0.05], ['book', 0.05], ....]
[['book', 0.19], ['company', 0.05], ['firm', 0.11], ['month', 0.04], ['telecom', 0.26], ....]
[['bbc', 0.09], ['moment', 0.11], ['month', 0.05], ['bos', 0.12], ['cross', 0.15], ....]
[['home', 0.06], ['play', 0.07], ['face', 0.31], ['game', 0.11], ['league', 0.1], ....]
...
'''
# Gensim: LSI
from gensim.models import LsiModel, CoherenceModel
lsi_model = LsiModel(corpus=dtm, id2word=my_dict, num_topics=5)
# lsi_model.print_topics(-1)
lsi_model.print_topics(num_topics=5, num_words=5)
# Determining the optimum number of topics using coherence values
coherence_values = []
lsi_model_list = []
min_topics, max_topics, step = 1, 5, 1
for i in range(min_topics, max_topics, step):
    lsi_model = LsiModel(dtm, id2word=my_dict, num_topics=i)
    lsi_model_list.append(lsi_model)
    coherencemodel = CoherenceModel(model=lsi_model, texts=tokens,
                                    dictionary=my_dict, coherence='c_v')
    coherence_values.append(coherencemodel.get_coherence())
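# Follow-up sketch: pick the model with the highest c_v coherence.
best_index = coherence_values.index(max(coherence_values))
best_lsi_model = lsi_model_list[best_index]
print('best num_topics = %d' % (min_topics + best_index * step))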
# Gensim: LDA
from gensim.models import LdaModel, LdaMulticore
lda_model = LdaModel(dtm, num_topics=3, id2word=my_dict, passes=10)
# lda_model.print_topics(-1)
lda_model.print_topics(num_topics=3, num_words=3)
lda_model_mc = LdaMulticore(dtm, num_topics=3, id2word=my_dict, passes=10, workers=4)
lda_model_mc.print_topics(-1)
lda_model.save('my_lda_model.lda') # Save LDA model
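# The saved model can be loaded back later:
# lda_model = LdaModel.load('my_lda_model.lda')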
# Evaluating LDA models: Topic coherence
goodLdaModel = LdaModel(corpus=dtm, id2word=my_dict, iterations=50, num_topics=2)
badLdaModel = LdaModel(corpus=dtm, id2word=my_dict, iterations=1, num_topics=2)
goodcm = CoherenceModel(model=goodLdaModel, corpus=dtm, dictionary=my_dict, coherence='u_mass')
badcm = CoherenceModel(model=badLdaModel, corpus=dtm, dictionary=my_dict, coherence='u_mass')
goodcm.get_coherence()
badcm.get_coherence()
# Note: c_v coherence needs the tokenized texts, not the BoW corpus.
goodcm = CoherenceModel(model=goodLdaModel, texts=tokens, dictionary=my_dict, coherence='c_v')
badcm = CoherenceModel(model=badLdaModel, texts=tokens, dictionary=my_dict, coherence='c_v')
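# As with u_mass above, compare the two c_v scores (higher is better):
goodcm.get_coherence()
badcm.get_coherence()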
# Display LDA outputs (renders only in HTML environments such as Jupyter notebooks)
import pyLDAvis.gensim
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, dtm, my_dict)
pyLDAvis.display(vis)
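Note: newer pyLDAvis releases (3.x) renamed the helper module, so the import becomes import pyLDAvis.gensim_models and prepare() is called on that module instead.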
1. Dictionary (source)
>>> from gensim.corpora import Dictionary
>>>
>>> texts = [['human', 'interface', 'computer']]
>>> dct = Dictionary(texts) # initialize a Dictionary
>>> dct.add_documents([["cat", "say", "meow"], ["dog"]])  # add more documents (extends the vocabulary)
>>> dct.doc2bow(["dog", "computer", "non_existent_word"])
[(0, 1), (6, 1)]
The most important method is doc2bow(['..', '..', ...]):
def doc2bow(self, document, allow_update=False, return_missing=False):
    """Convert `document` into the bag-of-words (BoW) format = list of `(token_id, token_count)` tuples."""
    ...
    # Construct (word, frequency) mapping.
    counter = defaultdict(int)
    for w in document:
        counter[w if isinstance(w, unicode) else unicode(w, 'utf-8')] += 1
    token2id = self.token2id
    result = {token2id[w]: freq for w, freq in iteritems(counter) if w in token2id}
    ...
    result = sorted(iteritems(result))
    ...
    return result
doc2bow() is very similar to nltk.FreqDist(); it returns [(word_id, freq), (word_id, freq), ...].
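For intuition, here is a toy re-implementation with collections.Counter (ignoring allow_update and return_missing), reusing the dct built above:
from collections import Counter

def toy_doc2bow(document, token2id):
    # Count tokens and keep only those already in the dictionary, sorted by id.
    counter = Counter(document)
    return sorted((token2id[w], freq) for w, freq in counter.items() if w in token2id)

print(toy_doc2bow(["dog", "computer", "non_existent_word"], dct.token2id))  # [(0, 1), (6, 1)]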
2. TfidfModel (source)
from gensim.models import TfidfModel
dtm = [my_dict.doc2bow(doc) for doc in tokens]
tfidf_vectorizer = TfidfModel(dtm)
tfidf = tfidf_vectorizer[dtm]
# idf (computed inside TfidfModel.initialize()):
self.idfs = precompute_idfs(self.wglobal, self.dfs, self.num_docs)
print(self.wglobal)  # <function gensim.models.tfidfmodel.df2idf(docfreq, totaldocs, log_base=2.0, add=0.0)>
def precompute_idfs(wglobal, dfs, total_docs):
    return {termid: wglobal(df, total_docs) for termid, df in iteritems(dfs)}

def df2idf(docfreq, totaldocs, log_base=2.0, add=0.0):
    return add + np.log(float(totaldocs) / docfreq) / np.log(log_base)
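# Quick check with made-up numbers, assuming the df2idf above is in scope:
# a term appearing in 2 of 8 documents gets idf = log2(8 / 2) = 2.0
# under the defaults (log_base=2.0, add=0.0).
print(df2idf(docfreq=2, totaldocs=8))  # 2.0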
# tf: dtm
# tf-idf: worked through for dtm[0]
import math
from gensim.models import TfidfModel

termid_array, tf_array = [], []
for termid, tf in dtm[0]:
    termid_array.append(termid)
    tf_array.append(tf)
model = TfidfModel(dtm)
print(model.idfs)  # the idf of each token in the dictionary
print(model.eps)   # 1e-12
# If a word appears in almost every document, its idf is very close to 0,
# so it is dropped from the tf-idf vector.
vector = [(termid, tf * model.idfs.get(termid, 0.0))
          for termid, tf in zip(termid_array, tf_array)
          if abs(model.idfs.get(termid, 0.0)) > model.eps]
# Next, L2-normalize the vector (the default normalization).
length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vector))
tfidf = [(termid, val / length) for termid, val in vector]
# This is what happens behind the code "model[dtm[0]]".
# l1    : length = float(sum(abs(val) for _, val in vector))
# l2    : length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vector))
# unique: length = 1.0 * len(vector)
After the tf-idf product is computed for each term, cosine (L2) normalization is applied. All of the above is just the default behaviour; TfidfModel lets us pick the weighting and normalization scheme, as the smartirs parameter below shows.
# These can be configured when creating a TfidfModel instance:
class TfidfModel(interfaces.TransformationABC):
    def __init__(self, corpus=None, id2word=None, dictionary=None, wlocal=utils.identity,
                 wglobal=df2idf, normalize=True, smartirs=None, pivot=None, slope=0.25):
        ...
"""
smartirs : str, optional
SMART (System for the Mechanical Analysis and Retrieval of Text) Information Retrieval System,
a mnemonic scheme for denoting tf-idf weighting variants in the vector space model.
The mnemonic for representing a combination of weights takes the form XYZ,
for example 'ntc', 'bpn' and so on, where the letters represents the term weighting of the document vector.
Term frequency weighing:
* `b` - binary,
* `t` or `n` - raw,
* `a` - augmented,
* `l` - logarithm,
* `d` - double logarithm,
* `L` - log average.
Document frequency weighting:
* `x` or `n` - none,
* `f` - idf,
* `t` - zero-corrected idf,
* `p` - probabilistic idf.
Document normalization:
* `x` or `n` - none,
* `c` - cosine,
* `u` - pivoted unique,
* `b` - pivoted character length.
Default is 'nfc'.
For more information visit `SMART Information Retrieval System
<https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System>`_.
"""
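The default corresponds to smartirs='nfc' (raw tf, idf, cosine normalization); other schemes can be requested with the same mnemonic. A minimal sketch, reusing the dtm built earlier:
from gensim.models import TfidfModel

tfidf_nfc = TfidfModel(dtm, smartirs='nfc')  # the default scheme, spelled out
tfidf_ltc = TfidfModel(dtm, smartirs='ltc')  # log tf, zero-corrected idf, cosine norm
print(tfidf_ltc[dtm[0]])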
3. LsiModel (source)
Model for Latent Semantic Indexing https://en.wikipedia.org/wiki/Latent_semantic_analysis#Latent_semantic_indexing
# Gensim: LSI
from gensim.models import LsiModel, CoherenceModel
lsi_model = LsiModel(corpus=dtm, id2word=my_dict, num_topics=5)
# lsi_model.print_topics(-1)
lsi_model.print_topics(num_topics=5, num_words=5)
# Projection does the heavy lifting (signature from gensim.models.lsimodel):
def __init__(self, m, k, docs=None, use_svdlibc=False, power_iters=P2_EXTRA_ITERS,
             extra_dims=P2_EXTRA_DIMS, dtype=np.float64):
    ...

## In LsiModel.__init__():
self.projection = Projection(
    self.num_terms, self.num_topics, power_iters=self.power_iters, extra_dims=self.extra_samples, dtype=dtype
)

## In LsiModel.add_documents():
update = Projection(
    self.num_terms, self.num_topics, job, extra_dims=self.extra_samples,
    power_iters=self.power_iters, dtype=self.dtype
)

# Projection itself delegates to a randomized truncated SVD:
def stochastic_svd(corpus, rank, num_terms, chunksize=20000, extra_dims=None,
                   power_iters=0, dtype=np.float64, eps=1e-6):
    pass

# Inside Projection.__init__ (when docs is given and use_svdlibc is False):
u, s = stochastic_svd(
    docs, k, chunksize=sys.maxsize,
    num_terms=m, power_iters=self.power_iters,
    extra_dims=self.extra_dims, dtype=dtype)
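In other words, the LSI projection is a truncated SVD of the term-document matrix. For intuition only, the same factors can be computed densely on a small corpus, reusing dtm and my_dict from above:
import numpy as np
from gensim.matutils import corpus2dense

A = corpus2dense(dtm, num_terms=len(my_dict))     # terms x documents matrix
U, S, VT = np.linalg.svd(A, full_matrices=False)  # exact SVD
k = 5
# U[:, :k] plays the role of projection.u and S[:k] of projection.s
# (up to sign and the randomized approximation).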
# print_topics() is a thin wrapper around show_topics():
self.show_topics(num_topics=num_topics, num_words=num_words, log=True)
def show_topics(self, num_topics=-1, num_words=10, log=False, formatted=True):
    """Get the most significant topics.

    log : bool, optional
        If True - log topics with logger.
    formatted : bool, optional
        If True - each topic represented as string, otherwise - in BoW format.

    Returns
    -------
    list of (int, str)
        If `formatted=True`, return sequence with (topic_id, string representation of topics) **OR**
    list of (int, list of (str, float))
        Otherwise, return sequence with (topic_id, [(word, value), ... ]).
    """
    shown = []
    if num_topics < 0:
        num_topics = self.num_topics
    for i in range(min(num_topics, self.num_topics)):
        if i < len(self.projection.s):
            if formatted:
                topic = self.print_topic(i, topn=num_words)
            else:
                topic = self.show_topic(i, topn=num_words)
            shown.append((i, topic))
            if log:
                logger.info("topic #%i(%.3f): %s", i, self.projection.s[i], topic)
    return shown
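For example, with formatted=False the topics come back as (word, weight) pairs instead of strings (reusing lsi_model from above):
for topic_id, terms in lsi_model.show_topics(num_topics=3, num_words=5, formatted=False):
    print(topic_id, terms)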
4. LdaModel
5. LdaMulticore
6. CoherenceModel
Example of sklearn topic modeling
https://cloud.tencent.com/developer/article/1530432
docs = ["In the middle of the night",
"When our hopes and fears collide",
"In the midst of all goodbyes",
"Where all human beings lie",
"Against another lie"]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(docs)
terms = vectorizer.get_feature_names()
print(terms)
n_pick_topics = 3 # 设定主题数为3
lsa = TruncatedSVD(n_pick_topics)
X2 = lsa.fit_transform(X)
X2
n_pick_docs= 2
topic_docs_id = [X2[:,t].argsort()[:-(n_pick_docs+1):-1] for t in range(n_pick_topics)]
topic_docs_id
n_pick_keywords = 4
topic_keywords_id = [lsa.components_[t].argsort()[:-(n_pick_keywords+1):-1] for t in range(n_pick_topics)]
topic_keywords_id
for t in range(n_pick_topics):
print("topic %d:" % t)
print(" keywords: %s" % ", ".join(terms[topic_keywords_id[t][j]] for j in range(n_pick_keywords)))
for i in range(n_pick_docs):
print(" doc %d" % i)
print("\t"+docs[topic_docs_id[t][i]])