Vocabulary Review


Code
# Directory holding the exported "TA_*" CSV tables for this project.
csv_dir = f"{kq.project_directory()}/notebook/textos_andinos/data/CSV"

# Read every table in one pass; the unpacking order matches the path order.
nouns_df, word_df, summary_df, metadata_df, LDA_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        f"{csv_dir}/TA_measured_nouns.csv",
        f"{csv_dir}/TA_words.csv",
        f"{csv_dir}/TA_summary.csv",
        f"{csv_dir}/TA_metadata.csv",
        f"{csv_dir}/TA_LDA.csv",
    )
)
Code
# From LDA Analysis page
# Four hand-picked groups of topic ids. The groups are not disjoint:
# every id in bottom_topics also appears in right_topics, and 'm_36' is
# listed in both right_topics and top_topics.
right_topics = [
    'm_07', 'm_08', 'm_09', 'm_10', 'm_11', 'm_14', 'm_15', 'm_16',
    'm_17', 'm_21', 'm_22', 'm_26', 'm_29', 'm_30', 'm_32', 'm_34',
    'm_35', 'm_36', 'm_40', 'm_41', 'm_42', 'm_43', 'm_45', 'm_46',
    'm_50', 'm_51', 'm_53', 'm_55', 'm_59', 'm_65', 'm_66', 'm_67',
]

left_topics = [
    'm_01', 'm_02', 'm_03', 'm_19', 'm_48', 'm_49', 'm_60', 'm_61',
    'm_62', 'm_63', 'm_64', 'm_68', 'm_69', 'm_70', 'm_71', 'm_72',
]

top_topics = [
    'm_20', 'm_23', 'm_31', 'm_36', 'm_37', 'm_38', 'm_47',
]

bottom_topics = [
    'm_07', 'm_14', 'm_15', 'm_17', 'm_21', 'm_22', 'm_50',
]
Code
from nltk.corpus import stopwords
# Spanish stop words plus the two punctuation tokens that are filtered
# out together with them.
stop_words = [*stopwords.words('spanish'), '.', ',']
Code
# Build a plot of word frequency for topics
def _content_words(topic_names):
    """Return the word_df rows whose 'name' is in topic_names and whose
    'lemma' is not one of stop_words."""
    in_topics = word_df['name'].isin(topic_names)
    is_stop_word = word_df['lemma'].isin(stop_words)
    return word_df[in_topics & ~is_stop_word]


# For each topic group: the filtered rows, then the per-lemma frequency.
right_topic_words = _content_words(right_topics)
left_topic_words = _content_words(left_topics)
top_topic_words = _content_words(top_topics)
bottom_topic_words = _content_words(bottom_topics)

right_topic_word_counts = right_topic_words['lemma'].value_counts()
left_topic_word_counts = left_topic_words['lemma'].value_counts()
top_topic_word_counts = top_topic_words['lemma'].value_counts()
bottom_topic_word_counts = bottom_topic_words['lemma'].value_counts()
Code
# NOTE(review): ku.jitter is an opaque helper here; presumably it perturbs
# the counts slightly so equal values do not overlap when plotted - confirm.
left_counts = ku.jitter(pd.Series(left_topic_word_counts))
right_counts = ku.jitter(pd.Series(right_topic_word_counts))

#Build left/right frequency
# Sorted union of every lemma seen in either group.
lr_words = sorted(
    set(left_topic_words['lemma'].tolist()).union(right_topic_words['lemma'].tolist())
)

left_topic_word_counts_df = pd.DataFrame({
    'word': left_topic_word_counts.index.tolist(),
    'count': left_counts,
})
right_topic_word_counts_df = pd.DataFrame({
    'word': right_topic_word_counts.index.tolist(),
    'count': right_counts,
})

# Outer merge on 'word': the LEFT counts land in 'count_x', the RIGHT counts
# in 'count_y'; a word missing from one side gets 0 for that side.
lr_wc_df = pd.merge(
    left_topic_word_counts_df,
    right_topic_word_counts_df,
    how='outer',
    on='word',
).fillna(0)
Code
# Initialize plotly
plotly.offline.init_notebook_mode(connected = False)

# Scatter plot of per-word frequency: RIGHT-topic count on x, LEFT-topic
# count on y, both on log axes, with a red x == y reference diagonal.
layout = dict(plot_bgcolor='white',width=1500, height=1500,
              title="<b>\nWords in both LEFT and RIGHT topics</b>",
              margin=dict(t=20, l=20, r=20, b=20),
              xaxis=dict(title='Right Topics',
                         #range=[0.9, 5.5],
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True),
              yaxis=dict(title='Left Topics',
                         #range=[95.5, 99.5],
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True))

# lr_wc_df comes from merging the LEFT table with the RIGHT table, so
# 'count_x' holds the LEFT counts and 'count_y' the RIGHT counts.
# NOTE(review): words missing from one side carry a 0 count, which has no
# logarithm - confirm how those points render on the log axes.
data = go.Scatter(x=lr_wc_df['count_y'],
                  y=lr_wc_df['count_x'],
                  text=lr_wc_df['word'],
                  textposition='top right',
                  textfont=dict(color='#000000'),
                  mode='markers+text',
                  marker=dict(color='#5D69B1', size=8),
                  name='word'
                  )

# Keep the Figure itself in `fig`: Figure.show() returns None, so chaining
# .show() into the assignment would leave `fig` unusable afterwards.
fig = (go.Figure(data=data, layout=layout)
         .update_xaxes(type="log")
         .update_yaxes(type="log")
         .add_shape(type='line',
                x0=1,
                y0=1,
                x1=500,
                y1=500,
                line=dict(color='Red',),
                xref='x',
                yref='y'))
fig.show()
Code
# Rebuild the LEFT frequency table straight from left_topic_word_counts.
# The Series index (the lemmas) becomes the row index and is also copied
# into the 'word' column.
left_topic_word_counts_df = pd.DataFrame({
    'word': left_topic_word_counts.index.tolist(),
    'count': left_topic_word_counts,
})

# Words that never occur in the RIGHT table: keep the 100 most frequent,
# then flip the order so the largest count is the last row.
_left_only = ~left_topic_word_counts_df['word'].isin(right_topic_word_counts_df['word'])
words_in_left_only = (
    left_topic_word_counts_df[_left_only]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_left_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in LEFT Topics</b>")
fig.show()
Code
# Echo the LEFT-only table so the notebook renders it as the cell output.
words_in_left_only
word count
maldonado maldonado 7
ventana ventana 7
bara bara 7
buhío buhío 7
cacra cacra 7
... ... ...
morada morada 71
fuente fuente 72
ceque ceque 88
ofrecer ofrecer 95
guaca guaca 195
Code
# Rebuild the RIGHT frequency table straight from right_topic_word_counts.
# The Series index (the lemmas) becomes the row index and is also copied
# into the 'word' column.
right_topic_word_counts_df = pd.DataFrame({
    'word': right_topic_word_counts.index.tolist(),
    'count': right_topic_word_counts,
})

# Words that never occur in the LEFT table: keep the 100 most frequent,
# then flip the order so the largest count is the last row.
_right_only = ~right_topic_word_counts_df['word'].isin(left_topic_word_counts_df['word'])
words_in_right_only = (
    right_topic_word_counts_df[_right_only]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
# Row count check; only echoed if the notebook displays every expression.
len(words_in_right_only)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_right_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in RIGHT Topics</b>")
fig.show()
100
Code
# Words present in both the LEFT and RIGHT tables. The rows (and therefore
# the plotted counts) come from the LEFT table only. Keep the 100 most
# frequent, then flip the order so the largest count is the last row.
_in_both = left_topic_word_counts_df['word'].isin(right_topic_word_counts_df['word'])
words_in_both_df = (
    left_topic_word_counts_df[_in_both]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
# Row count check; only echoed if the notebook displays every expression.
len(words_in_both_df)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_both_df, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words in BOTH Topics</b>")
fig.show()
100
Code
# NOTE(review): the left/right cell calls ku.jitter while this one calls
# jitter_series; confirm jitter_series is defined in this session.
top_vals = jitter_series(pd.Series(top_topic_word_counts))
bottom_vals = jitter_series(pd.Series(bottom_topic_word_counts))

# Sorted union of every lemma seen in either group.
tb_words = sorted(
    set(top_topic_words['lemma'].tolist()).union(bottom_topic_words['lemma'].tolist())
)

top_topic_word_counts_df = pd.DataFrame({
    'word': top_topic_word_counts.index.tolist(),
    'count': top_vals.values.tolist(),
})
bottom_topic_word_counts_df = pd.DataFrame({
    'word': bottom_topic_word_counts.index.tolist(),
    'count': bottom_vals.values.tolist(),
})

# Outer merge on 'word': the TOP counts land in 'count_x', the BOTTOM counts
# in 'count_y'; a word missing from one side gets 0 for that side.
tb_wc_df = pd.merge(
    top_topic_word_counts_df,
    bottom_topic_word_counts_df,
    how='outer',
    on='word',
).fillna(0)
Code
# Initialize plotly
plotly.offline.init_notebook_mode(connected = False)

# Scatter plot of per-word frequency: BOTTOM-topic count on x, TOP-topic
# count on y, both on log axes, with a red x == y reference diagonal.
layout = dict(plot_bgcolor='white',width=1500, height=1500,
              margin=dict(t=20, l=20, r=20, b=20),
              xaxis=dict(title='Bottom Topics',
                         #range=[0.9, 5.5],
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True),
              yaxis=dict(title='Top Topics',
                         #range=[95.5, 99.5],
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True))

# tb_wc_df comes from merging the TOP table with the BOTTOM table, so
# 'count_x' holds the TOP counts and 'count_y' the BOTTOM counts. The x
# axis is titled 'Bottom Topics' and the y axis 'Top Topics', so x must
# read 'count_y' and y must read 'count_x'; the previous assignment had
# them the other way round, contradicting the axis titles.
# NOTE(review): words missing from one side carry a 0 count, which has no
# logarithm - confirm how those points render on the log axes.
data = go.Scatter(x=tb_wc_df['count_y'],
                  y=tb_wc_df['count_x'],
                  text=tb_wc_df['word'],
                  textposition='top right',
                  textfont=dict(color='#000000'),
                  mode='markers+text',
                  marker=dict(color='#5D69B1', size=8),
                  name='word',
                  )

# Keep the Figure itself in `fig`: Figure.show() returns None, so chaining
# .show() into the assignment would leave `fig` unusable afterwards.
fig = (go.Figure(data=data, layout=layout)
         .update_xaxes(type="log")
         .update_yaxes(type="log")
         .add_shape(type='line',
                x0=1,
                y0=1,
                x1=500,
                y1=500,
                line=dict(color='Red',),
                xref='x',
                yref='y'))
fig.show()
Code
# Rebuild the TOP frequency table straight from top_topic_word_counts.
# The Series index (the lemmas) becomes the row index and is also copied
# into the 'word' column.
top_topic_word_counts_df = pd.DataFrame({
    'word': top_topic_word_counts.index.tolist(),
    'count': top_topic_word_counts,
})

# Words that never occur in the BOTTOM table: keep the 100 most frequent,
# then flip the order so the largest count is the last row.
_top_only = ~top_topic_word_counts_df['word'].isin(bottom_topic_word_counts_df['word'])
words_in_top_only = (
    top_topic_word_counts_df[_top_only]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_top_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in TOP Topics</b>")
fig.show()
Code
# Rebuild the BOTTOM frequency table straight from bottom_topic_word_counts.
# The Series index (the lemmas) becomes the row index and is also copied
# into the 'word' column.
bottom_topic_word_counts_df = pd.DataFrame({
    'word': bottom_topic_word_counts.index.tolist(),
    'count': bottom_topic_word_counts,
})

# Words that never occur in the TOP table: keep the 100 most frequent,
# then flip the order so the largest count is the last row.
_bottom_only = ~bottom_topic_word_counts_df['word'].isin(top_topic_word_counts_df['word'])
words_in_bottom_only = (
    bottom_topic_word_counts_df[_bottom_only]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_bottom_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in BOTTOM Topics</b>")
fig.show()
Code
# Words present in both the TOP and BOTTOM tables. The rows (and therefore
# the plotted counts) come from the TOP table only. Keep the 100 most
# frequent, then flip the order so the largest count is the last row.
_in_both = top_topic_word_counts_df['word'].isin(bottom_topic_word_counts_df['word'])
words_in_both_df = (
    top_topic_word_counts_df[_in_both]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
# Row count check; only echoed if the notebook displays every expression.
len(words_in_both_df)

# Keep the Figure in `fig`; Figure.show() returns None, so chaining it into
# the assignment would bind None instead of the chart.
fig = px.bar(words_in_both_df, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words in BOTH TOP & BOTTOM Topics</b>")
fig.show()
79
Code
# Scratch check: selecting a single column yields a pandas Series.
isinstance(words_in_both_df['count'], pd.Series)
True
Code
# Scratch check: a plain Python list is not a pandas Series.
foo = [1, 2, 3]
isinstance(foo, pd.Series)
False