Part 3 - Themes Over Time and Party Affiliation - Final Project

What are the most common themes in inaugural addresses?
How has this changed over time?
Does a certain party have more polarizing speech?

import pandas as pd
import numpy as np
import spacy
from collections import Counter

# Load the inaugural-address CSV. `.iloc[:, 1:]` discards the first column
# (presumably a saved row-index column -- TODO confirm against the CSV).
inaugural = pd.read_csv('data/inaugural_address.csv').iloc[:, 1:]
# Parse 'date' into datetimes so the `.dt` accessor can be used on it later.
inaugural['date'] = pd.to_datetime(inaugural['date'])
# Preview the first rows.
inaugural.head()

What are the most common themes in inaugural addresses?

from gensim.corpora import Dictionary
from gensim.models import LdaModel
import pyLDAvis
import pyLDAvis.gensim_models
from spacy import displacy

# If the environment was not set up via the Makefile, run:
!python -m spacy download en_core_web_sm

# Load spaCy's small English pipeline once; preprocess_text() below calls it
# for every document.
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    """Return the lower-cased lemmas of *text* that are worth modelling.

    The text is run through the module-level spaCy pipeline ``nlp``. A token
    is dropped when spaCy flags it as a stop word, punctuation, or whitespace,
    or when its lemma is three characters or shorter.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        if len(tok.lemma_) <= 3:
            continue
        kept_lemmas.append(tok.lemma_.lower())
    return kept_lemmas

# Tokenise and lemmatise every address.
processed_docs = inaugural["text"].apply(preprocess_text)

# Build the vocabulary, then prune it with filter_extremes (per the gensim
# docs: keep tokens found in at least 5 documents and in no more than 50% of
# all documents).
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_above=0.5, no_below=5)
# Bag-of-words vector per address (original author's note: approx 25 seconds).
corpus = list(map(dictionary.doc2bow, processed_docs))

# Fit a 5-topic LDA model; random_state is fixed for reproducibility.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=5,
    passes=10,
    random_state=42,
)

print("Inaugural Addresses LDA Themes:")
all_themes = lda_model.print_topics(-1)
for i, topic in all_themes:
    print(f"Theme: {i}")
    # end="\n\n" emits the blank separator line after each theme.
    print(f"Words: {topic}", end="\n\n")

Inaugural Addresses LDA Themes:
Theme: 0
Words: 0.005*"method" + 0.004*"establish" + 0.004*"race" + 0.004*"community" + 0.004*"rest" + 0.004*"commercial" + 0.004*"officer" + 0.004*"local" + 0.004*"relation" + 0.004*"election"

Theme: 1
Words: 0.010*"business" + 0.006*"federal" + 0.006*"increase" + 0.005*"ought" + 0.005*"legislation" + 0.004*"tariff" + 0.004*"revenue" + 0.004*"proper" + 0.004*"race" + 0.004*"trade"

Theme: 2
Words: 0.009*"thank" + 0.008*"today" + 0.007*"like" + 0.006*"task" + 0.006*"face" + 0.005*"child" + 0.005*"civilization" + 0.004*"industrial" + 0.004*"help" + 0.004*"wish"

Theme: 3
Words: 0.013*"today" + 0.012*"americans" + 0.009*"century" + 0.007*"democracy" + 0.007*"child" + 0.006*"generation" + 0.006*"earth" + 0.006*"promise" + 0.006*"challenge" + 0.006*"help"

Theme: 4
Words: 0.005*"opinion" + 0.005*"object" + 0.004*"general" + 0.004*"exist" + 0.004*"revenue" + 0.004*"limit" + 0.004*"regard" + 0.003*"circumstance" + 0.003*"effect" + 0.003*"experience"

# Switch pyLDAvis to notebook rendering, then build the interactive topic
# view for the full-corpus model (displayed as the cell's last expression).
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda_model, corpus, dictionary)

How have the major themes changed over time?

# Split the addresses into calendar-year buckets. Every bucket is half-open,
# [start, end), so each year lands in exactly one bucket. The last bucket
# previously used `2000 < year`, which left year 2000 in no bucket at all.
address_year = inaugural['date'].dt.year  # computed once, reused by every mask
inaugural_1700s = inaugural[address_year < 1800]
inaugural_1800s = inaugural[(address_year >= 1800) & (address_year < 1900)]
inaugural_1900s = inaugural[(address_year >= 1900) & (address_year < 2000)]
inaugural_2000s = inaugural[address_year >= 2000]

# Reuse the tokens already computed in `processed_docs` instead of running the
# spaCy pipeline over every address a second time. `processed_docs` was built
# from inaugural["text"], so it shares the index labels of `inaugural` and of
# each per-era slice; selecting by those labels yields the same token lists.
processed_docs_1700s = processed_docs.loc[inaugural_1700s.index]
processed_docs_1800s = processed_docs.loc[inaugural_1800s.index]
processed_docs_1900s = processed_docs.loc[inaugural_1900s.index]
processed_docs_2000s = processed_docs.loc[inaugural_2000s.index]

# One shared vocabulary, built from all addresses, so token ids mean the same
# thing in every per-era corpus.
dictionary = Dictionary(processed_docs)
dictionary.filter_extremes(no_above=0.5, no_below=5)

# Bag-of-words vectors for each era, all encoded with the shared vocabulary.
corpus_1700s = list(map(dictionary.doc2bow, processed_docs_1700s))
corpus_1800s = list(map(dictionary.doc2bow, processed_docs_1800s))
corpus_1900s = list(map(dictionary.doc2bow, processed_docs_1900s))
corpus_2000s = list(map(dictionary.doc2bow, processed_docs_2000s))

# Every era's model is fitted with identical settings; only the corpus differs.
lda_settings = dict(id2word=dictionary, num_topics=5, random_state=42, passes=10)

lda_model_1700s = LdaModel(corpus=corpus_1700s, **lda_settings)
lda_model_1800s = LdaModel(corpus=corpus_1800s, **lda_settings)
lda_model_1900s = LdaModel(corpus=corpus_1900s, **lda_settings)
lda_model_2000s = LdaModel(corpus=corpus_2000s, **lda_settings)

# Print every topic of the pre-1800 model, one blank line after each.
print("Inaugural Addresses 18th Century LDA Themes:")
themes_1700s = lda_model_1700s.print_topics(-1)
for i, topic in themes_1700s:
    print(f"Theme: {i}")
    print(f"Words: {topic}", end="\n\n")

Inaugural Addresses 18th Century LDA Themes:
Theme: 0
Words: 0.003*"voice" + 0.003*"execute" + 0.003*"official" + 0.003*"entertain" + 0.003*"instance" + 0.003*"constitutional" + 0.003*"function" + 0.003*"occasion" + 0.003*"endeavor" + 0.003*"presence"

Theme: 1
Words: 0.001*"knowledge" + 0.001*"general" + 0.001*"legislature" + 0.001*"choice" + 0.001*"establish" + 0.001*"resolution" + 0.001*"virtuous" + 0.001*"wish" + 0.001*"feel" + 0.001*"idea"

Theme: 2
Words: 0.001*"happiness" + 0.001*"ought" + 0.001*"establish" + 0.001*"blessing" + 0.001*"humble" + 0.001*"nature" + 0.001*"influence" + 0.001*"particular" + 0.001*"decide" + 0.001*"expedient"

Theme: 3
Words: 0.007*"ought" + 0.005*"happiness" + 0.005*"establish" + 0.005*"circumstance" + 0.005*"nature" + 0.004*"private" + 0.004*"influence" + 0.004*"blessing" + 0.004*"execute" + 0.004*"particular"

Theme: 4
Words: 0.008*"legislature" + 0.006*"virtuous" + 0.006*"general" + 0.006*"choice" + 0.006*"knowledge" + 0.004*"happiness" + 0.004*"consider" + 0.004*"attachment" + 0.004*"establish" + 0.004*"little"

# Print every topic of the 1800s model, one blank line after each.
print("Inaugural Addresses 19th Century LDA Themes:")
themes_1800s = lda_model_1800s.print_topics(-1)
for i, topic in themes_1800s:
    print(f"Theme: {i}")
    print(f"Words: {topic}", end="\n\n")

Inaugural Addresses 19th Century LDA Themes:
Theme: 0
Words: 0.006*"protection" + 0.005*"object" + 0.005*"revenue" + 0.005*"extend" + 0.004*"importance" + 0.004*"blessing" + 0.004*"territory" + 0.004*"regard" + 0.004*"opinion" + 0.004*"happy"

Theme: 1
Words: 0.008*"revenue" + 0.007*"business" + 0.006*"legislation" + 0.005*"countryman" + 0.005*"constant" + 0.004*"patriotic" + 0.004*"enterprise" + 0.004*"partisan" + 0.004*"federal" + 0.004*"benefit"

Theme: 2
Words: 0.006*"officer" + 0.006*"revenue" + 0.005*"community" + 0.005*"method" + 0.005*"increase" + 0.005*"expect" + 0.004*"occasion" + 0.004*"debt" + 0.004*"reason" + 0.004*"opinion"

Theme: 3
Words: 0.006*"opinion" + 0.005*"experience" + 0.005*"happiness" + 0.004*"feel" + 0.004*"circumstance" + 0.004*"constitutional" + 0.004*"countryman" + 0.003*"position" + 0.003*"general" + 0.003*"result"

Theme: 4
Words: 0.005*"object" + 0.005*"general" + 0.005*"effect" + 0.005*"exist" + 0.005*"case" + 0.005*"opinion" + 0.005*"grant" + 0.004*"proper" + 0.004*"term" + 0.004*"appear"

# Print every topic of the 1900s model, one blank line after each.
print("Inaugural Addresses 20th Century LDA Themes:")
themes_1900s = lda_model_1900s.print_topics(-1)
for i, topic in themes_1900s:
    print(f"Theme: {i}")
    print(f"Words: {topic}", end="\n\n")

Inaugural Addresses 20th Century LDA Themes:
Theme: 0
Words: 0.007*"island" + 0.007*"rest" + 0.005*"inhabitant" + 0.005*"million" + 0.005*"faithful" + 0.004*"problem" + 0.004*"relation" + 0.004*"treaty" + 0.004*"solve" + 0.004*"establish"

Theme: 1
Words: 0.010*"business" + 0.007*"increase" + 0.006*"ought" + 0.006*"race" + 0.005*"proper" + 0.005*"amendment" + 0.005*"federal" + 0.005*"international" + 0.005*"tariff" + 0.005*"trade"

Theme: 2
Words: 0.008*"task" + 0.006*"civilization" + 0.006*"opportunity" + 0.006*"face" + 0.005*"thought" + 0.005*"industrial" + 0.005*"leadership" + 0.005*"problem" + 0.005*"wish" + 0.005*"today"

Theme: 3
Words: 0.014*"today" + 0.012*"century" + 0.012*"americans" + 0.008*"help" + 0.007*"earth" + 0.007*"democracy" + 0.006*"friend" + 0.006*"child" + 0.006*"begin" + 0.006*"challenge"

Theme: 4
Words: 0.001*"today" + 0.001*"federal" + 0.001*"economic" + 0.001*"business" + 0.001*"opportunity" + 0.001*"increase" + 0.001*"self" + 0.001*"help" + 0.001*"problem" + 0.001*"democracy"

# Print every topic of the 2000s model, one blank line after each.
print("Inaugural Addresses 21st Century LDA Themes:")
themes_2000s = lda_model_2000s.print_topics(-1)
for i, topic in themes_2000s:
    print(f"Theme: {i}")
    print(f"Words: {topic}", end="\n\n")

Inaugural Addresses 21st Century LDA Themes:
Theme: 0
Words: 0.001*"thank" + 0.001*"today" + 0.001*"generation" + 0.001*"child" + 0.001*"americans" + 0.001*"like" + 0.001*"woman" + 0.001*"build" + 0.001*"hard" + 0.001*"begin"

Theme: 1
Words: 0.006*"permanent" + 0.006*"excuse" + 0.006*"feel" + 0.005*"tyranny" + 0.005*"deep" + 0.005*"goal" + 0.004*"fulfill" + 0.004*"tradition" + 0.004*"oppression" + 0.004*"idealism"

Theme: 2
Words: 0.018*"thank" + 0.014*"today" + 0.014*"americans" + 0.009*"child" + 0.009*"democracy" + 0.009*"like" + 0.007*"challenge" + 0.007*"promise" + 0.007*"dream" + 0.007*"unity"

Theme: 3
Words: 0.011*"today" + 0.010*"generation" + 0.009*"americans" + 0.007*"woman" + 0.007*"child" + 0.006*"word" + 0.006*"face" + 0.006*"journey" + 0.005*"ideal" + 0.005*"hard"

Theme: 4
Words: 0.001*"today" + 0.001*"child" + 0.001*"generation" + 0.001*"thank" + 0.001*"americans" + 0.001*"like" + 0.001*"word" + 0.001*"forward" + 0.001*"build" + 0.001*"dream"

# 18th century topics visualization
pyLDAvis.gensim_models.prepare(lda_model_1700s, corpus_1700s, dictionary)

# 19th century topics visualization
pyLDAvis.gensim_models.prepare(lda_model_1800s, corpus_1800s, dictionary)

# 20th century topics visualization
pyLDAvis.gensim_models.prepare(lda_model_1900s, corpus_1900s, dictionary)

# 21st century topics visualization
pyLDAvis.gensim_models.prepare(lda_model_2000s, corpus_2000s, dictionary)

Does a certain party have more polarizing speech?

# Attach a party label to every address. The 55 labels below are assigned
# positionally, so they must match the row count and row order of `inaugural`.
# NOTE(review): presumably the rows are in chronological order and each label
# is the party of the president giving that address -- verify against the CSV,
# since a reordered file would be mislabelled without any error.
inaugural['party'] = np.array([
    'None',
    'None',
    'Federalist',
    'Democratic-Republican',
    'Democratic-Republican',
    'Democratic-Republican',
    'Democratic-Republican',
    'Democratic-Republican',
    'Democratic-Republican',
    'National Republican',
    'Democratic',
    'Democratic',
    'Democratic',
    'Whig',
    'Democratic',
    'Whig',
    'Democratic',
    'Democratic',
    'Republican',
    'Republican',
    'Republican',
    'Republican',
    'Republican',
    'Republican',
    'Democratic',
    'Republican',
    'Democratic',
    'Republican',
    'Republican',
    'Republican',
    'Republican',
    'Democratic',
    'Democratic',
    'Republican',
    'Republican',
    'Republican',
    'Democratic',
    'Democratic',
    'Democratic',
    'Democratic',
    'Republican',
    'Democratic',
    'Republican',
    'Republican',
    'Republican',
    'Republican',
    'Democratic',
    'Democratic',
    'Republican',
    'Republican',
    'Democratic',
    'Democratic',
    'Republican',
    'Democratic',
    'Republican'
])
# Keep only these five columns, in this order ('party' placed after the name).
inaugural = inaugural[['president_name', 'party', 'president_number', 'date', 'text']]
# Preview the first rows.
inaugural.head()

from afinn import Afinn
import warnings
# ignore groupby warnings:
# NOTE(review): no category or module is passed, so this filter silences every
# warning raised after this point, not only the pandas groupby ones.
warnings.filterwarnings("ignore")

# AFINN scorer instance shared by calculate_sentiment_metrics() below.
afinn = Afinn()

def calculate_sentiment_metrics(tokens):
    """Summarise the AFINN sentiment of a list of tokens.

    Each token is scored individually with the module-level ``afinn`` scorer.

    Returns a dict with:
        sentiment_score    -- sum of all token scores
        positive_words     -- number of tokens scoring above zero
        negative_words     -- number of tokens scoring below zero
        sentiment_variance -- variance of the non-zero scores, or 0 when no
                              token has a non-zero score
    """
    token_scores = list(map(afinn.score, tokens))

    positive_count = 0
    negative_count = 0
    charged_scores = []  # neutral (zero) scores are left out of the variance
    for value in token_scores:
        if value > 0:
            positive_count += 1
        elif value < 0:
            negative_count += 1
        if value != 0:
            charged_scores.append(value)

    if charged_scores:
        variance = np.var(charged_scores)
    else:
        variance = 0

    return {
        'sentiment_score': sum(token_scores),
        'positive_words': positive_count,
        'negative_words': negative_count,
        'sentiment_variance': variance,
    }

# Score every tokenised address, then expand the per-address metric dicts into
# one column per metric.
sentiment_metrics = processed_docs.apply(calculate_sentiment_metrics)
# Carry over the source index: concat(axis=1) aligns rows by index label, so a
# fresh 0..n-1 index here would mis-pair (or NaN-pad) rows whenever `inaugural`
# is not on a default RangeIndex.
sentiment_df = pd.DataFrame(sentiment_metrics.tolist(), index=sentiment_metrics.index)

# Combine with original data
inaugural_with_sentiment = pd.concat([inaugural, sentiment_df], axis=1)

def calculate_polarization(group):
    """Aggregate per-speech sentiment metrics for one group of speeches.

    Intended as the callable for ``groupby(...).apply``. ``group`` must have
    the columns 'sentiment_variance', 'positive_words', 'negative_words' and
    'sentiment_score'.

    Returns a Series holding the mean sentiment variance, the summed positive
    and negative word counts, the mean sentiment score ('net_sentiment') and
    the number of rows in the group ('num_speeches').
    """
    labels = [
        'mean_sentiment_variance',
        'total_positive_words',
        'total_negative_words',
        'net_sentiment',
        'num_speeches',
    ]
    values = [
        group['sentiment_variance'].mean(),
        group['positive_words'].sum(),
        group['negative_words'].sum(),
        group['sentiment_score'].mean(),
        len(group),
    ]
    return pd.Series(values, index=labels)

# One row of aggregated sentiment metrics per party.
polarization_by_party = inaugural_with_sentiment.groupby('party').apply(calculate_polarization)

# Normalise the word totals by the number of speeches each party gave.
speech_counts = polarization_by_party['num_speeches']
polarization_by_party['avg_positive_per_speech'] = polarization_by_party['total_positive_words'].div(speech_counts)
polarization_by_party['avg_negative_per_speech'] = polarization_by_party['total_negative_words'].div(speech_counts)

/Users/calvin/miniconda3/envs/inaugural-address/lib/python3.11/site-packages/afinn/afinn.py:97: DeprecationWarning: invalid escape sequence '\w'
  self._word_pattern = re.compile('\w+', flags=re.UNICODE)

# Parties ranked from highest to lowest mean sentiment score, showing the
# word totals and speech counts.
ranked_by_net_sentiment = polarization_by_party.sort_values(by='net_sentiment', ascending=False)
ranked_by_net_sentiment.loc[
    :, ['total_positive_words', 'total_negative_words', 'net_sentiment', 'num_speeches']
]

# Parties ranked by average positive words per speech, showing the per-speech
# averages and the mean sentiment variance.
ranked_by_avg_positive = polarization_by_party.sort_values(by='avg_positive_per_speech', ascending=False)
ranked_by_avg_positive.loc[
    :, ['avg_positive_per_speech', 'avg_negative_per_speech', 'mean_sentiment_variance']
]