Vocabulary Review


Code
# Load the five Textos Andinos CSV exports from the project's notebook data
# directory; each file is named TA_<stem>.csv.
csv_dir = f"{kq.project_directory()}/notebook/textos_andinos/data/CSV"
nouns_df, word_df, summary_df, metadata_df, LDA_df = (
    pd.read_csv(f"{csv_dir}/TA_{stem}.csv")
    for stem in ("measured_nouns", "words", "summary", "metadata", "LDA")
)
Code
# Topic groupings copied from the LDA Analysis page.
# NOTE: every entry of bottom_topics also appears in right_topics, and
# 'm_36' appears in both right_topics and top_topics.


def _topic_names(*numbers):
    """Return the zero-padded topic name (e.g. ``'m_07'``) for each number."""
    return [f"m_{number:02d}" for number in numbers]


right_topics = _topic_names(
    7, 8, 9, 10, 11, 14, 15, 16, 17, 21,
    22, 26, 29, 30, 32, 34, 35, 36, 40, 41,
    42, 43, 45, 46, 50, 51, 53, 55, 59, 65,
    66, 67,
)
left_topics = _topic_names(
    1, 2, 3, 19, 48, 49, 60, 61, 62, 63,
    64, 68, 69, 70, 71, 72,
)
top_topics = _topic_names(20, 23, 31, 36, 37, 38, 47)
bottom_topics = _topic_names(7, 14, 15, 17, 21, 22, 50)
Code
from nltk.corpus import stopwords
# NLTK's Spanish stop words plus the period and comma tokens.
# NOTE(review): ';' and ':' still show up as frequent "words" in the charts
# below — confirm whether other punctuation should be excluded as well.
stop_words = [*stopwords.words('spanish'), '.', ',']
Code
# For each topic group, keep the rows of word_df whose 'name' is one of the
# group's topics and whose 'lemma' is not a stop word, then count how many
# times each lemma occurs in those rows.
_is_content_lemma = ~word_df['lemma'].isin(stop_words)


def _rows_for_topics(topic_names):
    """Return the non-stop-word rows of word_df belonging to *topic_names*."""
    return word_df[word_df['name'].isin(topic_names) & _is_content_lemma]


right_topic_words = _rows_for_topics(right_topics)
left_topic_words = _rows_for_topics(left_topics)
top_topic_words = _rows_for_topics(top_topics)
bottom_topic_words = _rows_for_topics(bottom_topics)

right_topic_word_counts = right_topic_words['lemma'].value_counts()
left_topic_word_counts = left_topic_words['lemma'].value_counts()
top_topic_word_counts = top_topic_words['lemma'].value_counts()
bottom_topic_word_counts = bottom_topic_words['lemma'].value_counts()
Code
# Counts passed through ku.jitter before plotting.
# NOTE(review): ku.jitter is defined outside this section; presumably it adds
# small noise so overlapping scatter points separate — confirm.
left_counts = ku.jitter(pd.Series(left_topic_word_counts))
right_counts = ku.jitter(pd.Series(right_topic_word_counts))

# Sorted union of the lemmas seen in either the LEFT or the RIGHT group.
lr_words = sorted(
    set(left_topic_words['lemma'].tolist())
    | set(right_topic_words['lemma'].tolist())
)

# One (word, count) table per group.
left_topic_word_counts_df = pd.DataFrame({
    'word': left_topic_word_counts.index.tolist(),
    'count': left_counts,
})
right_topic_word_counts_df = pd.DataFrame({
    'word': right_topic_word_counts.index.tolist(),
    'count': right_counts,
})

# Outer merge on 'word': the clashing 'count' columns come out as count_x
# (LEFT) and count_y (RIGHT); a word missing from one group gets 0 there.
lr_wc_df = (
    left_topic_word_counts_df
    .merge(right_topic_word_counts_df, how='outer', on='word')
    .fillna(0)
)
Code
# Scatter plot of per-word counts: RIGHT topics on x, LEFT topics on y, both
# on log axes, with a red diagonal from (1, 1) to (500, 500) for reference.
plotly.offline.init_notebook_mode(connected=False)

# Both axes share the same frame styling; only the title differs.
layout = dict(plot_bgcolor='white', width=1500, height=1500,
              title="<b>\nWords in both LEFT and RIGHT topics</b>",
              margin=dict(t=20, l=20, r=20, b=20),
              xaxis=dict(title='Right Topics',
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True),
              yaxis=dict(title='Left Topics',
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True))

# In lr_wc_df, count_x holds the LEFT counts and count_y the RIGHT counts.
# NOTE(review): words missing from one group have a count of 0, which has no
# position on a log axis — confirm that leaving them off the plot is intended.
data = go.Scatter(x=lr_wc_df['count_y'],
                  y=lr_wc_df['count_x'],
                  text=lr_wc_df['word'],
                  textposition='top right',
                  textfont=dict(color='#000000'),
                  mode='markers+text',
                  marker=dict(color='#5D69B1', size=8),
                  name='word')

# Keep the Figure in `fig`: show() returns None, so chaining it into the
# assignment would discard the figure object.
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")
fig.add_shape(type='line',
              x0=1,
              y0=1,
              x1=500,
              y1=500,
              line=dict(color='Red'),
              xref='x',
              yref='y')
fig.show()
serindiodichohaberpuebloañodartenerdecir;casa2declarar43piedracerrocadapesodonllamar5hacerprincipal6primeromartínllamadoincacasado:8arribajuntomismocaminopedroniñotercerosegundo10fanegatierrapuestociertograndemaízdiego20alonsomediollanopotosíviejoasimismojuan1250ropameschácaramozoplataallítomínrepartimiento7caciqueprovinciairdespuésencomenderomaneratiempoorogonzalonombreaquel13mayordomocarneroyupanquifranciscosalirponerordinariomandarpiezasembrarcosademásluegomitadcocacercafortalezaciudadúltimoningunopartecargocarpinterocuartoedadtopatasamujerasídíalugarbiennombrararrobacesto11coger16pasarcuzcoseñalarmorircontar14descargarhernandogastargallinaoveja10015lópezsusodichoparecervezsiguientecorrienteadonde25pagarmantaaquívolverseñorcumbipoderabascacristóbal9cabezavenirtributolimatraer30tomartrigoaguapatobuenhallarsemanaminaahora22parcialidadllegarganadovisitarsolerllevarsacarperderquemarrealsalcaxa1000adelante27razónpescadotrataralguno18vallecantidadsi40caber21antoniopesarentrarplazatamboquerercaballoexcepto31juliochuñolanalorenzo2324procederollerosaberhembraacabar17huevoservirpuercodeclaradoayudarchicoguardarhermanoandespaztupapaucar48harinamitimaelopecostalparialunaasientovestidopersonaquispepedirguaman3533fuerzaguerra32montarenviarpreguntar19carga44españolprimeramentepizarromenosfrazadaguarda37tejuelo85bueyreino55prenderformahombre70pasadoguamangaensayadopapaperdizefectovecinovermar200comidaserviciomatarsegúnsolobatallahijo26tal140presentepartido68tantapargualcarodrigopaulochacayanallamapachaavedesherbarsemillacuyoleñapaloaudienciapropiovenderbeneficiarrecibirpomasanta28proveerreligiososeboclérigocausa150maltamañocondesuyo450valormitarepartirmandilsiempresustentaciónvencerpilco130sentar51coyabateacapiquinuamanteca41postreropresoguascamagueycomerayudaalgodónquedar3657camariconuncaojota240reyes3449finocharquisecochachapoyasvivo71quipotrabajarguarcaquitoríopaño56sobremesapobladomollemachaguayyauyosencomendado60corderoentendersierrasubirmachocantarillomitayochuquicabalgadu
ra10880xequepuxamuerto19339respetocollaodañopucara266sementeracapitáncinchacabestrojáquimahierba42sogamalqui64sucedirnaturalovejerocarlospadremielbajardejargentedesusoquipucamayoceracontenidopescuezopaja54lanza47526572sazón29beneficioajívueltacestilloatunxauxamanoansídoctrinartrillarcabuyaencerrado500ocuparbocamedianablancoycaguarache86bilcacolquedóndeausentartrujilloplatoachomangovichucoolla458859marco107plomoviudamollo43maray203117corregidoralzardefenderpresidentegasca84pascuanúmero192cuentavalersustentarvidaejemplodoctrinejustocómododiezmochichasalariodineromenestermejorpolicíaleycatólicofeencargarconcienciaescrúpulosueltalátigoalgarrobacastillaescogeryotala3000600090oficialdesgranadoespigacostamudargrano600fresco75panecilloperjuiciocabuyomaderapan300candelacriarpocchatoco4669angosbilcasmechapaco58chaquicocochincheagravio53nuevomesabeberpuentecarisumaaunquepunavenidaleguamuertepartirjornadaparedóntarapacaamboscharcasdestruirricotayaguanucosujetotitotoparpalpapuertoandaseguirlagunaandarquitararmadisminuirmandadofuegocontenergeneralquillachilacachucatonochinopicoquinocalachalaalcacancha1281332843505404000cántarochincha750poblezueloalláasentarlibracestasucesivodichanecesario210cuento3681251025100251000125102510025
Words in both LEFT and RIGHT topicsRight TopicsLeft Topics
Code
# Rebuild the LEFT table from the plain value counts (the earlier version of
# this table held the ku.jitter output used for the scatter plot).
left_topic_word_counts_df = pd.DataFrame({
    'word': left_topic_word_counts.index.tolist(),
    'count': pd.Series(left_topic_word_counts),
})

# The 100 most frequent words that never occur in RIGHT topics, reversed so
# the most frequent word is the last row.
_left_only_mask = ~left_topic_word_counts_df['word'].isin(right_topic_word_counts_df['word'])
words_in_left_only = (
    left_topic_word_counts_df[_left_only_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_left_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in LEFT Topics</b>")
fig.show()
050100150200maldonadoventanabarabuhíocacracollascaídolenguaantiguoxulcaguamanpuertahaciafronterovistacatalinafontezuelaleizeguicarmengamembillapachacaorejónmamarsetogerónimolargohechocaxallibiagrangastadosétimopayansepulturaveneraciónantón151testigouniversalordencayocachecerrillohachasolemnecabestrerofelipevillasanapostrerainclusoreysaludsantiagocaroallibiacollanacomunidadjoanconquistarmolidofiestadécimacumplimientopururaucassólocayaoguanacauricercadoadorarranchoídolomanantialtemploquintoxulcaayllunovenoadoratoriopleitoquebradaviudosétimadomingoquintasolabajoencimamuchachosacrificaroctavocuartapartidanombradoparedsextopequeñoconcharoblessacrificiomoradafuentecequeofrecerguaca
Words Only in LEFT Topicscountword
Code
# Display the LEFT-only table that feeds the bar chart (rows ascend by count).
words_in_left_only
word count
maldonado maldonado 7
ventana ventana 7
bara bara 7
buhío buhío 7
cacra cacra 7
... ... ...
morada morada 71
fuente fuente 72
ceque ceque 88
ofrecer ofrecer 95
guaca guaca 195
Code
# Rebuild the RIGHT table from the plain value counts (the earlier version of
# this table held the ku.jitter output used for the scatter plot).
right_topic_word_counts_df = pd.DataFrame({
    'word': right_topic_word_counts.index.tolist(),
    'count': pd.Series(right_topic_word_counts),
})

# The 100 most frequent words that never occur in LEFT topics, reversed so
# the most frequent word is the last row.
_right_only_mask = ~right_topic_word_counts_df['word'].isin(left_topic_word_counts_df['word'])
words_in_right_only = (
    right_topic_word_counts_df[_right_only_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
len(words_in_right_only)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_right_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in RIGHT Topics</b>")
fig.show()
100
020406080100porcocomúnmentetinajamercadillovellónmediaaderezocharcayungaalbaradocuero234taramaincasgómezcharco863chuquichanbicalaguana124121143mosqueracocidoyçangachuquisaca11062bancoçelissotomenesesdesbaratarpablohernándezcopacaguanadepositarcondori250tasar125huyabaharrieroarcabuzcarbóncargadoescudillamoromorocasoysangavacasalazartoldo400almagrocanchayacastrocobrealdeapescadormachahuirsobrecargaporongocargarninachuquiadobadoentregar120cabeceravasijadespobladopicaranchearcusichacalmudcaracaravilcapreciotesoreromenudomontemayorpelearalvaradomariscalfruta1razadochamelicoxauxacómovisitaciónmorocoalpargatecuántorobarsoldadovillaestanciamarqués
Words Only in RIGHT Topicscountword
Code
# The 100 words with the highest LEFT counts that also occur in RIGHT topics,
# reversed so the most frequent word is the last row. Only the LEFT count is
# kept in this table.
_shared_lr_mask = left_topic_word_counts_df['word'].isin(right_topic_word_counts_df['word'])
words_in_both_df = (
    left_topic_word_counts_df[_shared_lr_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
len(words_in_both_df)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_both_df, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words in BOTH Topics</b>")
fig.show()
100
0100200300400500parteningunoúltimociudadfortalezacocacercamitadluegodemáscosasembrarpiezamandarordinarioponercarneroyupanquimayordomosalirfrancisco13aquelnombregonzaloorotiempomaneraprovinciairencomenderodespuéscacique7repartimientoallítomínchácaramozoplatamesropa5012juanasimismoviejopotosíalonsollanomedio20diegomaízgrandeciertopuestotierrafanega10segundoterceroniñopedrocaminomismojuntoarriba8:casadoincallamadomartínprimero6principalhacer5llamardonpesocadacerropiedra34declarar2casa;decirtenerdarañopueblohaberdichoindioser
Words in BOTH Topicscountword
Code
# Counts passed through jitter_series before plotting.
# NOTE(review): jitter_series is not defined in this section, while the
# LEFT/RIGHT cell calls ku.jitter — confirm which helper is intended.
top_vals = jitter_series(pd.Series(top_topic_word_counts))
bottom_vals = jitter_series(pd.Series(bottom_topic_word_counts))

# Sorted union of the lemmas seen in either the TOP or the BOTTOM group.
tb_words = sorted(
    set(top_topic_words['lemma'].tolist())
    | set(bottom_topic_words['lemma'].tolist())
)

# One (word, count) table per group, built from plain lists.
top_topic_word_counts_df = pd.DataFrame({
    'word': top_topic_word_counts.index.tolist(),
    'count': top_vals.values.tolist(),
})
bottom_topic_word_counts_df = pd.DataFrame({
    'word': bottom_topic_word_counts.index.tolist(),
    'count': bottom_vals.values.tolist(),
})

# Outer merge on 'word': the clashing 'count' columns come out as count_x
# (TOP) and count_y (BOTTOM); a word missing from one group gets 0 there.
tb_wc_df = (
    top_topic_word_counts_df
    .merge(bottom_topic_word_counts_df, how='outer', on='word')
    .fillna(0)
)
Code
# Scatter plot of per-word counts: BOTTOM topics on x, TOP topics on y, both
# on log axes, with a red diagonal from (1, 1) to (500, 500) for reference.
plotly.offline.init_notebook_mode(connected=False)

# Both axes share the same frame styling; only the title differs.
layout = dict(plot_bgcolor='white', width=1500, height=1500,
              margin=dict(t=20, l=20, r=20, b=20),
              xaxis=dict(title='Bottom Topics',
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True),
              yaxis=dict(title='Top Topics',
                         linecolor='#d9d9d9',
                         showgrid=False,
                         mirror=True))

# tb_wc_df comes from merging the TOP table with the BOTTOM table, so count_x
# holds the TOP counts and count_y the BOTTOM counts. Plot count_y on x and
# count_x on y so the data agrees with the axis titles (and with the
# LEFT/RIGHT scatter); the columns were previously the other way round.
# NOTE(review): words missing from one group have a count of 0, which has no
# position on a log axis — confirm that leaving them off the plot is intended.
data = go.Scatter(x=tb_wc_df['count_y'],
                  y=tb_wc_df['count_x'],
                  text=tb_wc_df['word'],
                  textposition='top right',
                  textfont=dict(color='#000000'),
                  mode='markers+text',
                  marker=dict(color='#5D69B1', size=8),
                  name='word')

# Keep the Figure in `fig`: show() returns None, so chaining it into the
# assignment would discard the figure object.
fig = go.Figure(data=data, layout=layout)
fig.update_xaxes(type="log")
fig.update_yaxes(type="log")
fig.add_shape(type='line',
              x0=1,
              y0=1,
              x1=500,
              y1=500,
              line=dict(color='Red'),
              xref='x',
              yref='y')
fig.show()
indiopueblodecirllamardichoprincipalparcialidadsertenercasacacique151020660poblado18vallemitimaeyungassujeto10030estancia4provinciaasimismo50127incaseñor8200270:13012580manera41capi1persona65753coca28mitima4535yungatasaquinientoscercaciudadhabertiempo500gonzalopizarroallíovejero40puesto255gómez9despobladoalonso117mismo1101831208912345678910234567891002345125102510025
Bottom TopicsTop Topics
Code
# Rebuild the TOP table from the plain value counts (the earlier version of
# this table held the jitter_series output used for the scatter plot).
top_topic_word_counts_df = pd.DataFrame({
    'word': top_topic_word_counts.index.tolist(),
    'count': pd.Series(top_topic_word_counts),
})

# The 100 most frequent words that never occur in BOTTOM topics, reversed so
# the most frequent word is the last row.
_top_only_mask = ~top_topic_word_counts_df['word'].isin(bottom_topic_word_counts_df['word'])
words_in_top_only = (
    top_topic_word_counts_df[_top_only_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_top_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in TOP Topics</b>")
fig.show()
012345678sibahachupararribaestarcatacorcoychaumullo136ruyzyvcuramullo381ychocutiscatacohabahacachasaquinistacacoyobahatelacamolayticatacoasaahaha158256escaynabahaescochunabahacollabahachiquosacochaçecatarnaturalçama1804866nanavaha11121039yunguyo1341cano1663186guatarancamachaguay153vxicopuno1438chanbilla10701470hilauillangama1207440aimara12213471233perjuiciomaçuelasorejón378chondaçoco720xilinerynchusconrmachupaycobratariquepaycartaripamatarrarcorcoancapongocotayacochiribatocoyacopiscoguamanrorosachacolanaacorachicuitochinchasuyocochunaçepita55chuquichanbipomatalluctapescadoramboscayocaxuliurosdepositaraymarapartidoaymaraeslurinsayaanansayauro
Words Only in TOP Topicscountword
Code
# Rebuild the BOTTOM table from the plain value counts (the earlier version
# of this table held the jitter_series output used for the scatter plot).
bottom_topic_word_counts_df = pd.DataFrame({
    'word': bottom_topic_word_counts.index.tolist(),
    'count': pd.Series(bottom_topic_word_counts),
})

# The 100 most frequent words that never occur in TOP topics, reversed so
# the most frequent word is the last row.
_bottom_only_mask = ~bottom_topic_word_counts_df['word'].isin(top_topic_word_counts_df['word'])
words_in_bottom_only = (
    bottom_topic_word_counts_df[_bottom_only_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_bottom_only, x="count", y="word", orientation='h',
             width=1500, height=2000, title="<b>Words Only in BOTTOM Topics</b>")
fig.show()
050100150200250300350400450cabecerasazónharrierosiguientecuzcoojotacomersegundomayordomobatalla23viejoparjuntopelearchaquipagarvenirningunosalircosamedioganado21ovejamacha13entregarsemanavasijaxauxa;vestidoropacabezasebomorirpilcolátigoleñapatocántarohacermielpescuezollevarcerasallanamandilcabestrogallinacaracaramarquésmandarfrazadapiezacinchaviudacuentochuño11gualcahuevopreciojáquimamontemayorirvisitaciónserviciocorderotierradespuéspapaquinuavalercómoalgunosimorocodóndemaízdoncuántocarneroplatavillacorrienteencomenderofanegapotosíarrobaquipocargapreguntarcadaañovenderpesodar
Words Only in BOTTOM Topicscountword
Code
# Up to 100 words with the highest TOP counts that also occur in BOTTOM
# topics, reversed so the most frequent word is the last row. Only the TOP
# count is kept in this table. This rebinds words_in_both_df, which the
# LEFT/RIGHT cell also assigns.
_shared_tb_mask = top_topic_word_counts_df['word'].isin(bottom_topic_word_counts_df['word'])
words_in_both_df = (
    top_topic_word_counts_df[_shared_tb_mask]
    .sort_values(by=['count'], ascending=False)
    .head(100)
    .iloc[::-1]
)
len(words_in_both_df)

# Keep the Figure in `fig`; show() returns None, so it is called separately.
fig = px.bar(words_in_both_df, x="count", y="word", orientation='h',
             width=1500, height=2000,
             title="<b>Words in BOTH TOP & BOTTOM Topics</b>")
fig.show()
79
01020304050607080120tasayunga3545mitima28coca3quinientos75persona1capi41manera80130:65cerca125haber183110mismo117despoblado9gómez5alonsopuesto25500gonzalopizarrotiempoallíovejero40ciudad7022008señorinca4provinciaasimismo50712estancia30100sujetoyungasmitimaevalle18poblado606152010caciquecasatenerserparcialidadprincipaldichollamardecirpuebloindio
Words in BOTH TOP & BOTTOM Topicscountword
Code
# Scratch check: a single DataFrame column is a pandas Series.
isinstance(words_in_both_df['count'], pd.Series)
True
Code
# Scratch check: a plain Python list is not a pandas Series.
foo = list(range(1, 4))
isinstance(foo, pd.Series)
False