Exploring State of the Union Speeches with Data Science - Part 1

Can we use data science to explore presidents' State of the Union speeches?

Cover image

The Tutorial Video

If you enjoyed this video or found it helpful in any way, I would love you forever if you passed along a dollar or two to help fund my machine learning education and research! Every dollar helps me get a little closer, and I’m forever grateful.

The Notebook

class SOTUFile:
    """One State of the Union transcript loaded from disk.

    The file name must contain exactly one underscore. The part before it
    becomes ``_name``; the part after it, up to the first dot, becomes
    ``_year`` (kept as a string). The full file body is stored in ``_text``.
    """

    def __init__(self, directory, filename):
        """Parse *filename* and read its contents from *directory*.

        Raises AssertionError when *filename* does not split into exactly
        two pieces on ``'_'``.
        """
        pieces = filename.split('_')
        assert len(pieces) == 2

        president, remainder = pieces
        self._name = president
        # Everything before the first dot is taken as the year.
        self._year = remainder.partition('.')[0]

        path = '/'.join((directory, filename))
        with open(path) as handle:
            self._text = handle.read()

    def __str__(self):
        """Return a "name -- year" header plus the first 100 characters."""
        preview = self._text[:100]
        return ''.join((self._name, ' -- ', self._year, '\n\n', preview))

    def __repr__(self):
        """Use the same representation as ``str()``."""
        return str(self)

import os

# Load every entry of the corpus directory as a SOTUFile.
directory = 'SOTU_corpus'

SOTUs = [SOTUFile(directory, entry) for entry in os.listdir(directory)]

# Keep only the speeches whose parsed name is 'Washington'.
washington = list(filter(lambda speech: speech._name == 'Washington', SOTUs))

# Group the loaded speeches by president name, keeping load order
# within each group.
presidents = {}

for speech in SOTUs:
    presidents.setdefault(speech._name, []).append(speech)

from collections import Counter

class President:
    """Aggregate view over all speech files belonging to one president."""

    def __init__(self, files):
        """Collect years, texts and word counts from *files*.

        *files* must be non-empty and hold objects exposing ``_name``,
        ``_year`` and ``_text``; the name is taken from the first element.
        """
        assert len(files)

        first = files[0]
        self._name = first._name

        years = [int(speech._year) for speech in files]
        years.sort()
        self._years = years

        self._texts = [speech._text for speech in files]

        # Whitespace tokenisation only: punctuation stays attached to the
        # word, so "union," and "union" are counted as different tokens.
        combined = '\n'.join(self._texts)
        self._counter = Counter(token.lower() for token in combined.split())

    def __str__(self):
        """Return the name followed by the sorted list of years."""
        return '{}: {}'.format(self._name, str(self._years))

    def __repr__(self):
        """Use the same representation as ``str()``."""
        return str(self)

# Most frequent lower-cased tokens for two individual presidents.
washington = President(presidents['Washington'])
washington._counter.most_common(20)

harding = President(presidents['Harding'])
harding._counter.most_common(20)

# One (distinct-token count, name) pair per president, in dict order.
vocab_cnts = [
    (len(President(speeches)._counter), name)
    for name, speeches in presidents.items()
]

%matplotlib inline

import matplotlib
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [20, 15]

# Split the (count, name) pairs into bar labels and bar heights.
x = [label for _, label in vocab_cnts]
y = [count for count, _ in vocab_cnts]

plt.xticks(rotation=90)
plt.bar(x, y)

# Build one President per group of speeches.
pres_list = [President(speeches) for speeches in presidents.values()]

# Order presidents by the earliest year among their speeches.
by_year = sorted(pres_list, key=lambda pres: min(pres._years))

# Same (distinct-token count, name) pairs as before, now in by_year order.
vocab_cnts = [(len(pres._counter), pres._name) for pres in by_year]

# Split the (count, name) pairs into bar labels and bar heights.
x = [label for _, label in vocab_cnts]
y = [count for count, _ in vocab_cnts]

plt.xticks(rotation=90)
plt.bar(x, y)