LDA (Latent Dirichlet Allocation) Analysis


LDA analysis is another “clustering” technique used in text analysis.

To understand LDA analysis, I’d like you to think about what archetypal khipu text might exist. As we learned in the previous clustering exercise, archetypal khipu texts might be centered around censuses or around tributes. Latent Dirichlet Allocation attempts to find those archetypes. Each document can then be labeled with the statistical probability/proportion of each of these archetypes - for example a sample document might be about 30% census, and 70% tribute.

Code
# Based on [this article](https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/) as a template.

import sys
import re
import logging, warnings
from pprint import pprint

import numpy as np
import pandas as pd

# Gensim — topic modeling (Phrases, Dictionary, LdaModel)
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spaCy — Spanish pipeline for lemmatization.
# Run once if missing:  !{sys.executable} -m spacy download es_core_news_md
import spacy
nlp = spacy.load("es_core_news_md")

import matplotlib.pyplot as plt

# NLTK Spanish stop words
# nltk.download('stopwords')  # run once if the corpus is missing
from nltk.corpus import stopwords
stop_words = stopwords.words('spanish')
stop_words.extend([])  # add corpus-specific stop words here as needed

# Plotly — was referenced below (plotly.offline, px.scatter) but never
# imported; import it explicitly so the file runs top to bottom.
import plotly
import plotly.offline
import plotly.express as px

# Initialize plotly for inline notebook rendering
plotly.offline.init_notebook_mode(connected = False)

#warnings.filterwarnings("ignore",category=DeprecationWarning)
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

Build dataframe of text documents

Code
# Locate the project's CSV directory, then load the metadata and summary tables.
csv_dir = f"{kq.project_directory()}/textosbook/data/CSV"
metadata_df = pd.read_csv(f"{csv_dir}/TA_metadata.csv")
summary_df = pd.read_csv(f"{csv_dir}/TA_summary.csv")

# One row per document: its name and its Spanish text (renamed to 'text').
text_docs_df = (summary_df[['name', 'spanish_text']]
                .rename(columns={'spanish_text': 'text'}))
doc_names = list(text_docs_df['name'])

Tokenize Sentences and Clean

Remove newline characters and single quotes, then split each document into a list of words using gensim’s simple_preprocess(). Setting the deacc=True option removes punctuation.

Code
def sent_to_words(sentences):
    """Yield each input document as a cleaned list of word tokens.

    Collapses runs of whitespace (including newlines) to single spaces,
    strips single quotes, then tokenizes with gensim's simple_preprocess;
    deacc=True also removes punctuation and accents.
    """
    for sent in sentences:
        # Raw strings for the regex patterns: '\s' in a plain string is an
        # invalid escape (SyntaxWarning on modern Python).
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace / newline chars
        sent = re.sub(r"'", "", sent)     # remove single quotes
        yield simple_preprocess(str(sent), deacc=True)

# Convert the 'text' column to a plain Python list, then tokenize every document.
data = text_docs_df.text.values.tolist()
data_words = list(sent_to_words(data))
#print(data_words[:1])
Code
from collections import Counter

def most_common_5_words(word_list, n=5):
    """Return the n (default 5) most frequent non-stopword tokens in word_list."""
    stops = set(stop_words)  # set for O(1) membership tests instead of list scans
    counts = Counter(word for word in word_list if word not in stops)
    return [word for word, _count in counts.most_common(n)]

# Map each document name to (a string rendering of) its most common words;
# doc_names and data_words are index-aligned, so zip them directly.
most_common_words = {name: str(most_common_5_words(words))
                     for name, words in zip(doc_names, data_words)}
common_doc_words = [most_common_words[name] for name in doc_names]

Build the Bigram, Trigram Models and Lemmatize

Let’s form the bigram and trigrams using the Phrases model. This is passed to Phraser() for efficiency in speed of execution.

Next, lemmatize each word to its root form, keeping only nouns, adjectives, verbs and adverbs.

We keep only these POS tags because they are the ones contributing the most to the meaning of the sentences. Here, I use spacy for lemmatization

Code
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser wraps a trained Phrases model for faster application.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams/trigrams, and lemmatize each document.

    texts: iterable of token lists (one per document).
    Returns a list of token lists: lemmatized, stopword-filtered, and kept
    only when the token's POS tag is in allowed_postags.
    NOTE(review): allowed_postags is a mutable default argument; it is only
    read here, but a tuple default would be safer.
    """
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # NOTE(review): bigram_mod is applied a second time here, to documents
    # already bigrammed on the previous line — confirm the double application
    # is intended (it mirrors the source article's template).
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]
    texts_out = []
    # NOTE(review): this reloads the Spanish spaCy model on every call even
    # though a module-level `nlp` already exists; hoisting it would save time.
    nlp = spacy.load('es_core_news_md', disable=['parser', 'ner'])
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]    
    return texts_out

data_ready = process_words(data_words)  # processed Text Data!

Build the Topic Model

To build the LDA topic model using LdaModel(), you need the corpus and the dictionary. Let’s create them first and then build the model. The trained topics (keywords and weights) are printed below as well.

Code
# Create Dictionary mapping token <-> integer id
id2word = corpora.Dictionary(data_ready)

# Create Corpus: Term Document Frequency (bag-of-words per document)
corpus = [id2word.doc2bow(text) for text in data_ready]

# Build LDA model.
# num_topics=4 matches the four archetypes explored in this chapter;
# random_state pins the run for reproducibility; per_word_topics=True makes
# the model return per-word topic assignments, which downstream code unpacks.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=4, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=10,
                                           passes=10,
                                           alpha='symmetric',
                                           iterations=100,
                                           per_word_topics=True)

# Print the trained topics: top keywords and their weights.
pprint(lda_model.print_topics())
[(0,
  '0.136*"indio" + 0.129*"pueblo" + 0.080*"principal" + 0.070*"llamar" + '
  '0.034*"decir" + 0.026*"dicho" + 0.025*"cacique" + 0.018*"parcialidad" + '
  '0.015*"casa" + 0.012*"sujeto"'),
 (1,
  '0.106*"dar" + 0.056*"partida" + 0.035*"pieza" + 0.026*"carnero" + '
  '0.024*"ropa" + 0.020*"fanega" + 0.019*"da_gastado" + 0.017*"indio" + '
  '0.017*"cocar" + 0.015*"mas"'),
 (2,
  '0.102*"dicho" + 0.054*"dar" + 0.036*"ano" + 0.033*"pesos" + 0.026*"tambien" '
  '+ 0.018*"indio" + 0.015*"peso" + 0.015*"decir" + 0.012*"encomendero" + '
  '0.011*"repartimiento"'),
 (3,
  '0.032*"casa" + 0.026*"casado" + 0.025*"ano" + 0.021*"declarar" + '
  '0.018*"morada" + 0.016*"arriba" + 0.014*"dicho" + 0.011*"pueblo" + '
  '0.009*"teniar" + 0.008*"nombrado"')]

What is the Dominant topic and its percentage contribution in each document

In LDA models, each document is composed of multiple topics. But, typically only one of the topics is dominant. The below code extracts this dominant topic for each document and shows the weight of the topic and the keywords in a nicely formatted output.

This way, you will know which document belongs predominantly to which topic.

Code
import warnings

warnings.filterwarnings("ignore")
def format_topics_sentences(ldamodel=None, corpus=corpus, texts=data):
    """Return one row per document: dominant topic, its percentage
    contribution, the topic's top keywords, and the original text.

    Columns: Dominant_Topic, Perc_Contribution, Topic_Keywords, plus the
    text appended as a final (unnamed) column.
    """
    records = []
    for row_list in ldamodel[corpus]:
        # With per_word_topics=True the model yields a 3-tuple whose first
        # element is the (topic_id, probability) list.
        row = row_list[0] if ldamodel.per_word_topics else row_list
        if not row:
            continue  # document with no topic assignment
        # Dominant topic = the single highest-probability topic.
        topic_num, prop_topic = max(row, key=lambda tp: tp[1])
        wp = ldamodel.show_topic(topic_num)
        topic_keywords = ", ".join(word for word, _prop in wp)
        records.append((int(topic_num), round(prop_topic, 4), topic_keywords))

    # DataFrame.append was removed in pandas 2.0 — build the frame from a
    # record list instead (also avoids quadratic row-by-row appends).
    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df

df_topic_sents_keywords = format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=data_ready)

# Format: one row per document with its dominant topic, contribution, and keywords.
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']
df_dominant_topic.head(10)
Document_No Dominant_Topic Topic_Perc_Contrib Keywords Text
0 0 3 0.6446 casa, casado, ano, declarar, morada, arriba, dicho, pueblo, teniar, nombrado [domingo, manco, cayo, poro]
1 1 1 0.7849 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [provincia, fortaleza, cinamba, cabezar, provincia, vrcusllo, curosllo, ymrarca, paso, adelante, luego, conquisto, salir, obediencia, miedo, paz, luego, provincia, guailla, guanuco, provincia, conquisto, chuncho, pillar, asimismo, conquisto, provincia, guayocondo, luego, palpar, chimo, dema, salir, paz, luego, conquisto, provincia, prender, picar, capac, tenian, luego, provincia, hacer, frontera, poner, mitimaes, volver, luego, entrar, hermano, provincia, and, conquisto, provincia, paucarmayo, tomín, yscayssinga, luego, pasar, delante, conquistar, provincia, opatari, manari, luego, conquistar, provincia, luego, aualapi, manupampa, chicorio, prender, batalla, llamado, conquista, mojón, uillcanoto, empezo, conquistar, capachanco, pocoypoco, conquisto, fortaleza, collaguo, arapa, pucarac, hacer, fuerte, provincia, conquisto, persona, prendio, llamado, chucachuco, castigar, paso, adelante, luego, conquisto, ...]
2 2 3 0.9406 casa, casado, ano, declarar, morada, arriba, dicho, pueblo, teniar, nombrado [comenzar, camino, chinchaysuyu, salir, barrio, comprehendiar, huaca, orden, primero, ceque, cargo, parcialidad, ayllu, teniar, huaca, siguiente, primero, llamar, michosamaro, arrimado, deciar, fingir, salir, primero, inca, mancocapac, pacaritampu, referir, mujer, salir, dicho, cueva, mato, desacato, torno, animar, aparecio, lugar, sacrificio, huaco, antiguo, siempre, oro, ropa, conchas, cosa, hacer, buen, temporal, segundo, diputo, sacrificio, murio, inca, despu, suceder, hacer, aqui, sacrificio, ordinario, ofreciar, generalmente, cosa, consumian, sacrificio, salud, inco, tercero, huaco, llamar, pilcopuquio, fuente, casa, sobredicha, salir, acequia, contar, indio, hecho, casa, sacrificios, salir, agua, hacer, sacrificio, ordinario, cuarto, granizo, temiar, sacrificar, destruyesir, sembrado, quinto, postrera, huaco, teniar, nombre, antiguo, ofrecer, sacrificio, ...]
3 3 1 0.3954 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [quedar, cuzco, continuo, indio, indio, hacer, pared, moria, dar, dar, indio, sembrar, chacara, cuzco, comer, gente, hacer, camarico, guaynacar, indio, continuo, guardar, cuerpo, inca_yupanqui, muerto, indio, continuo, guardar, arma, yanacona, guardar, guardar, quyto, indio, guardar, cuerpo, guaynacar, muerto, indio, hacer, pluma, indio, mas, sacar, miel, indio, dar, cumbico, indio, mas, hacer, tintura, color, indio, guardar, oveja, indio, dar, guardar, chacara, llevar, mayor, parte, cuzco, demas, deposito, mas, dar, indio, sembrar, llevar, dar, hacer, sal, vez, indio, dar, indio, hacer, coca, coco, llevar, cuzco, deposito, guanuco, vez, llevar, costal, dar, indio, persona, inco, tomar, benado, dar, indio, hacer, suela, llevar, cuzco, ...]
4 4 2 0.8534 dicho, dar, ano, pesos, tambien, indio, peso, decir, encomendero, repartimiento [decir, dar, dias, pieza, algodon, mes, pieza, camiseta, decir, aqui_adelante, dar, pieza, dias, dar, dar, mes, costal, coca, decir, aqui_adelante, dar, dar, dias, cantaros, decir, aqui_adelante, dar, cantaros, dar, dias, costal, hecho, decir, dar, aqui_adelante, dar, dias, pan, cera, decir, dar, aqui_adelante, dar, dias, par, alpargat, decir, aqui_adelante, dar, par, decir, vez, dar, decir, dar, mes, pan, sal, verano, invierno, dar, aqui_adelante, dar, semana, fanega, decir, aqui_adelante, dar, fanega, semana, papas, dar, gallina, dia, decir, aqui_adelante, podian, dar, dar, dar, semana, oveja, decir, asimismo, dar, aqui_adelante, dar, carpintero, obra, amar, decir, asi, dar, aqui_adelante, dar, cumbico, mujer, hacer, ropa, lán, ...]
5 5 0 0.7096 indio, pueblo, principal, llamar, decir, dicho, cacique, parcialidad, casa, sujeto [amador, dicho, dar, principal, indio, guar, cacique, dar, indio, indio, inco, dar, indio, indio, indio]
6 6 1 0.9718 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [primeramente, perder, jornada, indio, mujer, dar, oro, plato, oro, pesos, plata, dar, peso, mas, dar, ropa, pieza, mujer, dar, manta, caballo, dar, carnero, mas, dar, fanega, dar, indio, perder, jornada, mas, dar, capitar, mandado, marqu, indio, indio, indio, indio, morir, batalla, dar, inca, traer, perdido, indio, indio, traer, indio, dar, llego, xauxa, cuento, fanega, dar, quinua, fanega, dar, papa, fanega, mas, dar, cuento, carnero, mas, dar, cuento, cordero, mas, dar, olla, dar, ojota, par, mas, dar, perdiz, mas, dar, cuento, libras, pescado, mas, dar, cuento, carga, lén, mas, dar, cuento, carga, dar, indio, cargo, perder, indio, mujer, dar, camino, fanega, ...]
7 7 1 0.9502 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [plomo, par, vestido, indio, bolsa, cumbe, frazada, tierra, costal, lana, par, alpargat, par, ojota, soga, braza, paco, buen, tamano, cordero, grande, vellón, lana, pescuezo, carnero, venado, grande, candela, sebo, puerco, ano, lechón, capado, cabrito, almud, fanega, papa, fanega, quinua, ysanga, cabi, fanega, almud, trigo, pan, libra, cantaros, chicha, botija, venir, cantaros, olla, vasija, gallina, huevo, perdiz, ysanga, pepino, fruta, panecillo, sal, almud, pescado, pica, braza, caja, arcabuz, baqueta, arcabuz, rollet, mecha, arcabuz, banco, sentar, batea, grande, atacas, palo, toldo, estaca, toldo, carga, lén, lén, petaca, aforrada, cuero, benado, petaca, comido, carga, yerbo, carga, indio, indio, bohio, hacer, gente, indio, hacer, ...]
8 8 1 0.9889 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [indio, carga, indio, fanega, almud, fanega, almud, chicho, fanega, papa, fanega, almud, quinua, fanega, cabi, fanega, trigo, pan, libra, oveja, cordero, tierra, puerco, cabra, gallina, huevo, perdiz, pescado, petaquilla, fruta, cobre, plomo, pica, mecha, arcabuz, par, alpargat, ojota, jaquima, guasca, olla, cazuela, plato, queros, oveja, arreld, manta, caballo, costal, vellón, lán, petaca, carga, lén, carga, carbon, bateas, mesa, cuchara, caja, arcabuz, banco, palo, toldo, grande, palo, toldo, mas, chico]
9 9 1 0.9882 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [indio, cargo, fanega, almud, fanega, almud, harina, fanega, almud, hacer, chicha, fanega, almud, quinua, fanega, almud, cabi, fanega, trigo, pan, libra, oveja, cordero, puerco, lechón, cabra, gallina, huevo, perdiz, pescado, petaquilla, fruta, plomo, pica, mecha, arcabuz, par, alpargat, ojota, jaquima, guasca, queros, oveja, arreld, mantecar, sobremesa, chuspa, manta, caballo, costal, vellón, lán, petaca, carga, lén, costal, mesa, caja, arcabuz, cucharas, banco, sentar, palo, toldo]

The most representative sentence for each topic

Sometimes you want to get samples of sentences that most represent a given LDA topic model. This code gets the most exemplar sentence for each LDA topic model.

Code
# Display setting to show more characters in column
pd.options.display.max_colwidth = 100

# For each dominant topic, keep the single document with the highest
# percentage contribution — that topic's most representative text.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
best_rows = [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
             for _, grp in sent_topics_outdf_grpd]
# One concat at the end instead of growing a DataFrame inside the loop
# (repeated concat copies all prior rows each iteration — quadratic).
sent_topics_sorted_df_mallet = pd.concat(best_rows, axis=0)

# Reset Index    
sent_topics_sorted_df_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorted_df_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Representative Text"]
# Show
sent_topics_sorted_df_mallet
Topic_Num Topic_Perc_Contrib Keywords Representative Text
0 0 0.9992 indio, pueblo, principal, llamar, decir, dicho, cacique, parcialidad, casa, sujeto [provincia, pueblo, andar, cacique, cocinga, catapinga, indio, principal, sujeto, pueblo, saquixaguán, cacique, principal, sujeto, indio, cacique, vrcoxupasco, pueblo, conchacallo, cacique, quiquijaguana, cacique, sujeto, principal, indio, cacique, aucax, pueblo, quico, pueblo, guaracondor, principal, principal, cacique, principal, acostopar, principal, clamo, pueblo, lanbaqui, principal, guamanalla, pueblo, canco, cacique, ancaipullo, pueblo, pata, cacique, pueblo, tocojo, cacique, llamar, llaquixaguanca, indio, cacique, llamar, xuca, llamar, gualla, principal, llamar, marcoa, sayallos, mas, fuerar, cacique, maico, indio, caciqu, principal, ahora, fuerar, dicho, pueblo, provincia, charcas, sujeto, cacique, llamar, indio, principal, dicho, pueblo, indio, dicho, pueblo, llamar, llamar, principal, chuca, residir, pueblo, chaqui, indio, pueblo, llamar, camali, pueblesuelo, junto, ...]
1 1 0.9889 dar, partida, pieza, carnero, ropa, fanega, da_gastado, indio, cocar, mas [indio, carga, indio, fanega, almud, fanega, almud, chicho, fanega, papa, fanega, almud, quinua, fanega, cabi, fanega, trigo, pan, libra, oveja, cordero, tierra, puerco, cabra, gallina, huevo, perdiz, pescado, petaquilla, fruta, cobre, plomo, pica, mecha, arcabuz, par, alpargat, ojota, jaquima, guasca, olla, cazuela, plato, queros, oveja, arreld, manta, caballo, costal, vellón, lán, petaca, carga, lén, carga, carbon, bateas, mesa, cuchara, caja, arcabuz, banco, palo, toldo, grande, palo, toldo, mas, chico]
2 2 0.9958 dicho, dar, ano, pesos, tambien, indio, peso, decir, encomendero, repartimiento [tasado, peso, ensayado, ano, montar, enero, postrero, julio, ano, ano, mes, pesos, ensayado, asimismo, mandar, dicho, tasa, caciqu, principal, indio, dicho, repartimiento, anpara, dar, ano, encomendero, fanega, puesto, mitad, coger, monta, dicho, tiempo, ano, mes, medio, fanega, dicho, maiz, fanega, almud, peso, grano, montar, pesos, mandar, dar, ano, oveja, mediano, pesos, ensayado, escoger, querer, puesto, plata, mes, mitad, montar, dicho, tiempo, oveja, montar, ano, cestillo, aji, montar, dicho, tiempo, cestillo, peso, ano, gallina, montar, dicho, tiempo, tomín, montar, peso, tomín, pato, patas, ano, montar, dicho, tiempo, tomín, montar, pesos, tomín, semana, huevo, ano, semanas, montar, dicho, tiempo, huevo, uebo, tomin, ...]
3 3 0.9406 casa, casado, ano, declarar, morada, arriba, dicho, pueblo, teniar, nombrado [comenzar, camino, chinchaysuyu, salir, barrio, comprehendiar, huaca, orden, primero, ceque, cargo, parcialidad, ayllu, teniar, huaca, siguiente, primero, llamar, michosamaro, arrimado, deciar, fingir, salir, primero, inca, mancocapac, pacaritampu, referir, mujer, salir, dicho, cueva, mato, desacato, torno, animar, aparecio, lugar, sacrificio, huaco, antiguo, siempre, oro, ropa, conchas, cosa, hacer, buen, temporal, segundo, diputo, sacrificio, murio, inca, despu, suceder, hacer, aqui, sacrificio, ordinario, ofreciar, generalmente, cosa, consumian, sacrificio, salud, inco, tercero, huaco, llamar, pilcopuquio, fuente, casa, sobredicha, salir, acequia, contar, indio, hecho, casa, sacrificios, salir, agua, hacer, sacrificio, ordinario, cuarto, granizo, temiar, sacrificar, destruyesir, sembrado, quinto, postrera, huaco, teniar, nombre, antiguo, ofrecer, sacrificio, ...]

Visualization of the Latent Dirichlet Allocation model.

Now that we have an n-dimensional model, let’s project it onto the 2D plane and graph it. t-SNE is used to do the projection.

Code
import matplotlib.colors as mcolors

# Get topic weights and dominant topics ------------
from sklearn.manifold import TSNE

# Get the per-document topic-weight matrix from the model.
# NOTE: lda_model[corpus] omits topics whose probability falls below the
# model's minimum_probability, so collect weights by topic id rather than by
# list position — positional collection would misalign columns across
# documents whenever a topic is dropped.
topic_weights = []
num_topics = lda_model.num_topics
for doc_topics in lda_model[corpus]:
    # doc_topics[0] is the (topic_id, probability) list (per_word_topics=True).
    weights = dict(doc_topics[0])
    topic_weights.append([weights.get(topic_id, 0.0) for topic_id in range(num_topics)])

# Array of topic weights: one row per document, one column per topic
arr = pd.DataFrame(topic_weights).fillna(0).values

# Keep the well separated points (optional)
# arr = arr[np.amax(arr, axis=1) > 0.35]

# Dominant topic number in each doc
topic_num = np.argmax(arr, axis=1)

# tSNE Dimension Reduction
tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
tsne_lda = tsne_model.fit_transform(arr)
[t-SNE] Computing 71 nearest neighbors...
[t-SNE] Indexed 72 samples in 0.000s...
[t-SNE] Computed neighbors for 72 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 72 / 72
[t-SNE] Mean sigma: 0.049524
[t-SNE] KL divergence after 250 iterations with early exaggeration: 50.609718
[t-SNE] KL divergence after 1000 iterations: 0.039663
Code
# Hover text: each document's most common words (computed earlier).
common_doc_words = [most_common_words[name] for name in doc_names]
tsne_df = pd.DataFrame({'name':doc_names, 'text':common_doc_words, 'x':tsne_lda[:,0].tolist(), 'y':tsne_lda[:,1].tolist(), 'lda_topic_num':topic_num.tolist()})
# Persist the projection so later sessions can reload it (see load_LDA_topics).
tsne_df.to_csv(f"{csv_dir}/TA_LDA.csv")
#tsne_df

# One Tableau-palette color per topic number.
plt_colors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
point_colors = [plt_colors[topic_num] for topic_num in tsne_df['lda_topic_num'].values.tolist()]
# NOTE(review): `px` (plotly.express) is used here but no import is visible in
# this file — confirm `import plotly.express as px` exists upstream.
fig = (px.scatter(tsne_df, x="x", y="y", color=point_colors,
                  hover_name='name', hover_data=['text', 'lda_topic_num'], 
                  title=f"<b>Khipu Texts Clustered by LDA Topic Model</b>",
                  width=944, height=944)
        .update_layout(showlegend=False, paper_bgcolor='#fffff8')
        .update(layout_coloraxis_showscale=False)
        .show()
      )

Code
# A review: list which documents landed in each LDA topic (color) cluster.
LDA_topic_names = {}
topic_colors = ['Purple', 'Red   ', 'Green ', 'Blue  ']  # padded so printed labels align
for LDA_topic_num in range(0,4):
    LDA_topic_df = tsne_df[tsne_df['lda_topic_num']==LDA_topic_num]
    LDA_topic_names[LDA_topic_num] = LDA_topic_df['name'].tolist()
    # ku.multiline is a project helper that wraps the long name list for display.
    topic_name_text = "\t " + ku.multiline(LDA_topic_names[LDA_topic_num], line_length=160, continuation_char="\n\t ")
    print(f"{topic_colors[LDA_topic_num].upper()} - Topic {LDA_topic_num} has {len(LDA_topic_names[LDA_topic_num])} documents:\n{topic_name_text}")
PURPLE - Topic 0 has 42 documents:
     ['m_06', 'm_08', 'm_09', 'm_10', 'm_20', 'm_21', 'm_22', 'm_23', 'm_24', 'm_25', 'm_26', 'm_27', 'm_28', 'm_29', 'm_30', 'm_31', 'm_32', 'm_33', 'm_34', 'm_35',
     'm_36', 'm_37', 'm_38', 'm_39', 'm_40', 'm_41', 'm_42', 'm_43', 'm_44', 'm_45', 'm_46', 'm_47', 'm_50', 'm_51', 'm_53', 'm_54', 'm_55', 'm_57', 'm_58', 'm_59',
     'm_64', 'm_66']
RED    - Topic 1 has 17 documents:
     ['m_02', 'm_04', 'm_07', 'm_11', 'm_14', 'm_15', 'm_16', 'm_17', 'm_19', 'm_48', 'm_52', 'm_65', 'm_67', 'm_68', 'm_69', 'm_71', 'm_72']
GREEN  - Topic 2 has 12 documents:
     ['m_03', 'm_05', 'm_12', 'm_13', 'm_18', 'm_49', 'm_56', 'm_60', 'm_61', 'm_62', 'm_63', 'm_70']
BLUE   - Topic 3 has 1 documents:
     ['m_01']
Code
# Handy code for next session..
def load_LDA_topics(num_topics=4):
    """Reload the per-topic document-name lists from the saved TA_LDA.csv.

    num_topics: how many topic clusters were written (defaults to the 4
    used in this chapter — parameterized rather than hard-coded).
    Returns {topic_num: [document names]} for topics 0..num_topics-1.
    """
    LDA_df = pd.read_csv(f"{csv_dir}/TA_LDA.csv")
    return {topic: LDA_df.loc[LDA_df['lda_topic_num'] == topic, 'name'].tolist()
            for topic in range(num_topics)}

LDA_topic_names = load_LDA_topics()
# Convenience aliases matching the cluster colors in the scatter plot above.
LDA_Purple = LDA_topic_names[0]
LDA_Red = LDA_topic_names[1]
LDA_Green = LDA_topic_names[2]
LDA_Blue = LDA_topic_names[3]

print(f"{LDA_Purple=}")
print(f"{LDA_Red=}")
print(f"{LDA_Green=}")
print(f"\n{LDA_Blue=}")
LDA_Purple=['m_06', 'm_08', 'm_09', 'm_10', 'm_20', 'm_21', 'm_22', 'm_23', 'm_24', 'm_25', 'm_26', 'm_27', 'm_28', 'm_29', 'm_30', 'm_31', 'm_32', 'm_33', 'm_34', 'm_35', 'm_36', 'm_37', 'm_38', 'm_39', 'm_40', 'm_41', 'm_42', 'm_43', 'm_44', 'm_45', 'm_46', 'm_47', 'm_50', 'm_51', 'm_53', 'm_54', 'm_55', 'm_57', 'm_58', 'm_59', 'm_64', 'm_66']
LDA_Red=['m_02', 'm_04', 'm_07', 'm_11', 'm_14', 'm_15', 'm_16', 'm_17', 'm_19', 'm_48', 'm_52', 'm_65', 'm_67', 'm_68', 'm_69', 'm_71', 'm_72']
LDA_Green=['m_03', 'm_05', 'm_12', 'm_13', 'm_18', 'm_49', 'm_56', 'm_60', 'm_61', 'm_62', 'm_63', 'm_70']

LDA_Blue=['m_01']

We now have three fairly cleanly clustered sets — Purple, Red, and Green — and the unique m_01 by itself as a Blue cluster.

After we have completed the next section, An Introduction to Measured Nouns, we can explore what makes each of these LDA clusters interesting.