Opener Project Part 2

Table of contents

fetch-and-unpack-twitter-data.py

This script performs the unglorious task of downloading a zip file from a URL and unzipping its contents to a folder. Once it has finished, inspect the contents of the data-hold/ folder.

import requests
import shutil
import os.path
from glob import glob
from tempfile import NamedTemporaryFile

DATA_URL = 'http://stash.compjour.org/data/sunlight-twitter-opener.zip'
DATA_DIR = 'data-hold'

# Download the zip archive. raise_for_status() stops the script on an HTTP
# error instead of letting an error page be handed to the unzip step.
print("Downloading", DATA_URL)
r = requests.get(DATA_URL)
r.raise_for_status()

# The zip is only needed long enough to unpack it, so it goes into a
# temporary file. delete=False plus an explicit os.remove() is used because
# NamedTemporaryFile misbehaves on Windows when the file is reopened by name:
# https://bugs.python.org/issue14243
with NamedTemporaryFile(delete=False) as tzip:
    tzip.write(r.content)

try:
    print("Unzipping to", DATA_DIR)
    shutil.unpack_archive(tzip.name, DATA_DIR, format='zip')
finally:
    # Remove the temporary zip even when unpacking fails
    os.remove(tzip.name)

# Report what was unpacked; the paths are built from DATA_DIR so they
# follow the constant above.
csvname = os.path.join(DATA_DIR, 'sunlight_legislators.csv')
print(csvname, 'has', os.path.getsize(csvname), 'bytes')
print("Tweet files:", len(glob(os.path.join(DATA_DIR, 'tweets', '*.json'))))
print("Profile files:", len(glob(os.path.join(DATA_DIR, 'profiles', '*.json'))))
        
File found at: /files/code/opener-project/fetch-and-unpack-twitter-data.py

read-sunlight-csv.py

The first data file comes courtesy of the Sunlight Foundation: it's a spreadsheet (CSV file) that contains information on every current and recent Congressmember, and more helpfully, their unique identifiers to other data sources, such as FEC filings and social networks.

In this script, we use the csv module to create a collection of Dict objects, so that we can more easily access the attributes of each row (i.e. each Congressmember).

import csv
import os.path
DATA_DIR = 'data-hold'
csvname = os.path.join(DATA_DIR, 'sunlight_legislators.csv')

# The encoding is spelled out because open() otherwise uses a
# platform-dependent default; newline='' is what the csv module asks for
# when it is handed a file object.
with open(csvname, encoding='utf-8', newline='') as csvfile:
    # DictReader yields one dict per row; list() collects them all into a
    # list we can easily traverse before the with-block closes the file.
    congressmembers = list(csv.DictReader(csvfile))
print("There are {} Congressmembers listed".format(len(congressmembers)))

# filter for active congressmembers
# The list comprehension below is the Pythonic shorthand for this loop:
# active_members = []
# for m in congressmembers:
#     if m['in_office'] == '1':
#         active_members.append(m)
active_members = [m for m in congressmembers if m['in_office'] == '1']
print("There are {} active Congressmembers".format(len(active_members)))

# Now we want active members who are in California
ca_active_members = [m for m in active_members if m['state'] == 'CA']
print("There are {} active Congressmembers from CA".format(len(ca_active_members)))

# ...and, of those, the ones with a non-empty twitter_id
ca_tweeters = [m for m in ca_active_members if m['twitter_id'] != '']
print("There are {} active CA Congressmembers from CA on Twitter".format(len(ca_tweeters)))
# yes, the list comprehension syntax can be hard to understand at first
        
File found at: /files/code/opener-project/read-sunlight-csv.py

The output:

There are 894 Congressmembers listed
There are 539 active Congressmembers
There are 55 active Congressmembers from CA
There are 54 active CA Congressmembers from CA on Twitter

read-twitter-json.py

import json
import os.path
from glob import glob
from datetime import datetime
from operator import itemgetter

DATA_DIR = 'data-hold'
PROFILES_DIR = os.path.join(DATA_DIR, 'profiles')
TWEETS_DIR = os.path.join(DATA_DIR, 'tweets')
screen_name = 'repmikehonda'

# check out his Twitter profile
pfname = os.path.join(PROFILES_DIR, screen_name + '.json')
# explicit utf-8 so the result does not depend on the platform default
with open(pfname, encoding='utf-8') as pfile:
    profile = json.load(pfile)
print("{} goes by the screen_name of {}:".format(profile['name'], screen_name))
print("* Has {} followers and follows {}".format(profile['followers_count'], profile['friends_count']))

# the created_at format looks like this: Fri Oct 03 20:18:31 +0000 2008
# converting strings to dates is always a real pain
# for more information on strptime
# http://stackoverflow.com/questions/7703865/going-from-twitter-date-to-python-datetime-date
cr_date = datetime.strptime(profile['created_at'], '%a %b %d %H:%M:%S +0000 %Y')
# NOTE: created_at carries a +0000 offset while datetime.today() is local
# time, so the day count can be off by one close to midnight.
days_since_created_at = (datetime.today() - cr_date).days
print("* Has been on Twitter for {} days".format(days_since_created_at))
# Treat an account created today as one day old so we never divide by zero
daily_tweet_rate = round(profile['statuses_count'] / max(days_since_created_at, 1), 2)
print("* Overall Tweet rate of {} tweets/day".format(daily_tweet_rate) )


# Now we open his corresponding tweets JSON file, which contains his
# last 200 tweets
tfname = os.path.join(TWEETS_DIR, screen_name + '.json')
with open(tfname, encoding='utf-8') as tfile:
    tweets = json.load(tfile)

# Get tweets that are not retweets, i.e. those without a retweeted_status
original_tweets = [t for t in tweets if t.get('retweeted_status') is None]
print("* Has {} original tweets in his last {}".format(len(original_tweets), len(tweets)))

# Get most retweeted Tweet; skipped when there are no original tweets at all
if original_tweets:
    # max() returns the first tweet with the highest count, which is the same
    # tweet a descending sort would put first
    rt = max(original_tweets, key=itemgetter('retweet_count'))
    print("* Most retweeted tweet has {} RTs and said: {}".format(rt['retweet_count'], rt['text']))
        
File found at: /files/code/opener-project/read-twitter-json.py

The output will look like this:

Rep. Mike Honda goes by the screen_name of repmikehonda:
* Has 16985 followers and follows 827
* Has been on Twitter for 2375 days
* Overall Tweet rate of 0.69 tweets/day
* Has 128 original tweets in his last 200
* Most retweeted tweet has 10608 RTs and said: As the proud grandpa of a transgender grandchild, I hope she can feel safe at school without fear of being bullied. http://t.co/NDIfOdW9sk

twitter_foo.py

import json
import csv
import os.path
from glob import glob
from datetime import datetime
from operator import itemgetter
# Root folder that holds the data files read by the functions below
DATA_DIR = 'data-hold'
# Folder of profile JSON files, one per account, named <screen_name>.json
PROFILES_DIR = os.path.join(DATA_DIR, 'profiles')
# Folder of tweet JSON files, one per account, named <screen_name>.json
TWEETS_DIR = os.path.join(DATA_DIR, 'tweets')


def get_ca_tweeters():
    """Return the active California Congressmembers who have a Twitter id.

    Reads sunlight_legislators.csv from DATA_DIR and keeps the rows whose
    in_office is '1', whose state is 'CA' and whose twitter_id is not empty.

    Returns:
        A list of dicts, one per matching CSV row, keyed by column name.
    """
    csvname = os.path.join(DATA_DIR, 'sunlight_legislators.csv')
    # The with-block closes the file as soon as every row has been read;
    # newline='' is what the csv module asks for when given a file object.
    with open(csvname, encoding='utf-8', newline='') as csvfile:
        return [
            row for row in csv.DictReader(csvfile)
            if row['in_office'] == '1'
            and row['state'] == 'CA'
            and row['twitter_id'] != ''
        ]


def get_profile(screen_name):
    """Load the saved Twitter profile for screen_name.

    Args:
        screen_name: name of the account; the file read is
            PROFILES_DIR/<screen_name>.json.

    Returns:
        The parsed JSON content of that file.
    """
    pfname = os.path.join(PROFILES_DIR, screen_name + '.json')
    # The with-block closes the file once it has been parsed
    with open(pfname, encoding='utf-8') as pfile:
        return json.load(pfile)


def get_tweets(screen_name):
    """Load the saved tweets for screen_name.

    Args:
        screen_name: name of the account; the file read is
            TWEETS_DIR/<screen_name>.json.

    Returns:
        The parsed JSON content of that file, which the other functions
        here treat as a list of tweet dicts.
    """
    tfname = os.path.join(TWEETS_DIR, screen_name + '.json')
    # The with-block closes the file once it has been parsed
    with open(tfname, encoding='utf-8') as tfile:
        return json.load(tfile)


def get_original_tweets(screen_name):
    """Return the saved tweets of screen_name that are not retweets.

    A tweet counts as original when it has no 'retweeted_status' entry,
    or when that entry is None.

    Args:
        screen_name: name of the account, passed on to get_tweets().

    Returns:
        A list of the tweet dicts that are not retweets.
    """
    tweets = get_tweets(screen_name)
    # "is None" is the idiomatic identity test for None
    return [t for t in tweets if t.get('retweeted_status') is None]


def convert_twitter_timestamp(cstr):
    """Parse a Twitter timestamp string into a naive datetime.

    cstr is expected to look like "Fri Oct 03 20:18:31 +0000 2008". The
    "+0000" part is matched literally, so the result carries no tzinfo.
    """
    twitter_format = '%a %b %d %H:%M:%S +0000 %Y'
    parsed = datetime.strptime(cstr, twitter_format)
    return parsed


def get_tweets_with_word(screen_name, some_word):
    """Return the original tweets of screen_name whose text contains some_word.

    The match is a case-insensitive substring test against each tweet's
    'text' value; retweets are excluded via get_original_tweets().
    """
    needle = some_word.lower()
    return [
        tweet for tweet in get_original_tweets(screen_name)
        if needle in tweet['text'].lower()
    ]
        
File found at: /files/code/opener-project/twitter_foo.py

twitter_foo_fun.py

from twitter_foo import get_ca_tweeters, get_profile, get_tweets, get_original_tweets
from twitter_foo import convert_twitter_timestamp, get_tweets_with_word
from datetime import datetime

ca_tweeters = get_ca_tweeters()
print("There are {} CA tweeters".format(len(ca_tweeters)))

# The search word and the report line are the same for every member,
# so they are set up once, outside the loop.
wrd = 'obama'
istr = "{} has {} followers and tweets {} times per day since joining Twitter {} days ago"

for member in ca_tweeters:
    # the twitter_id is lowercased before it is used to look up data files
    tid = member['twitter_id'].lower()
    profile = get_profile(tid)

    days_ago = (datetime.today() - convert_twitter_timestamp(profile['created_at'])).days
    # Treat an account created today as one day old so we never divide by zero
    tweet_rate = round(profile['statuses_count'] / max(days_ago, 1), 2)

    print("----------------")
    print(istr.format(tid, profile['followers_count'], tweet_rate, days_ago))
    # get_tweets_with_word() loads the tweets file itself, so there is no
    # need to load it separately here
    wrd_tweets = get_tweets_with_word(tid, wrd)
    print("Number of original tweets with '{}' in last 200 tweets: {}".format(
                                            wrd, len(wrd_tweets)))

        
File found at: /files/code/opener-project/twitter_foo_fun.py

The output will look like this:

There are 54 CA tweeters
----------------
reppeteaguilar has 312 followers and tweets 1.15 times per day since joining Twitter 60 days ago
Number of original tweets with 'obama' in last 200 tweets: 0
----------------
repbecerra has 17686 followers and tweets 0.85 times per day since joining Twitter 1815 days ago
Number of original tweets with 'obama' in last 200 tweets: 8
----------------
senatorboxer has 57927 followers and tweets 0.7 times per day since joining Twitter 2455 days ago
Number of original tweets with 'obama' in last 200 tweets: 14

twitter-tablemaker.py

from twitter_foo import get_ca_tweeters, get_profile, get_tweets, get_original_tweets
from twitter_foo import convert_twitter_timestamp, get_tweets_with_word
from datetime import datetime
from jinja2 import Template


# Static top of the page: everything up to and including the opening <tbody>
page_head = """
<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />

   <title>My CA Twitter Table</title>
   <link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
</head>
<body>
    <div class="container">
        <h1>Congress folks on Twitter</h1>
        <table class="table table-condensed table-striped">
          <thead>
             <tr>
               <th>Congressmember</th>
               <th>Twitter name</th>
               <th>Party</th>
               <th>Followers</th>
               <th>Tweets</th>
               <th>Days on Twitter</th>
               <th>Daily Twitter Rate</th>
             </tr>
           </thead>
           <tbody>
"""

# Closes every element that page_head leaves open
page_foot = "</tbody></table></div></body></html>"

# One table row per Congressmember
row_str = """
<tr>
  <td>
    <img src="{{ profile_pic_url }}" style="width: 60px">
    {{ title }} {{ full_name }}
  </td>
  <td>
    <a href="https://twitter.com/{{ twitter_id }}">{{ twitter_id }}</a>
  </td>
  <td>
    {{ party }}
  </td>
  <td>
    {{ followers_count }}
  </td>
  <td>
    {{ tweets_count }}
  </td>
  <td>
    {{ days_ago }}
  </td>
  <td>
    {{ daily_tweet_rate }}
  </td>
</tr>
"""

# autoescape=True because the values come from the CSV and the Twitter
# profile files; characters such as & or < must not end up as raw HTML.
row_template = Template(row_str, autoescape=True)

# The with-block closes the output file even if a row fails to render
with open("data-hold/table.html", "w", encoding="utf-8") as html_file:
    html_file.write(page_head)

    for member in get_ca_tweeters():
        # the twitter_id is lowercased before it is used to look up data files
        tid = member['twitter_id'].lower()
        profile = get_profile(tid)

        days_ago = (datetime.today() - convert_twitter_timestamp(profile['created_at'])).days
        # Treat an account created today as one day old so we never divide by zero
        tweet_rate = round(profile['statuses_count'] / max(days_ago, 1), 2)

        data = {
            "profile_pic_url": profile['profile_image_url'],
            "title": member['title'],
            "party": member['party'],
            "full_name": " ".join([member['firstname'], member['lastname']]),
            "twitter_id": tid,
            "followers_count": profile['followers_count'],
            "tweets_count": profile['statuses_count'],
            "days_ago": days_ago,
            "daily_tweet_rate": tweet_rate,
        }
        html_file.write(row_template.render(data))

    html_file.write(page_foot)
        
File found at: /files/code/opener-project/twitter-tablemaker.py

twitter-word-tweets.py

from twitter_foo import get_ca_tweeters, get_profile, get_tweets_with_word
from datetime import datetime
from jinja2 import Template
# The word to look for in each member's original tweets
the_word = "kenya"

# Page skeleton with one column per party. d_tweets and r_tweets receive
# ready-made HTML, so this template is deliberately left without autoescaping.
body_template = Template(
"""
<html>
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
   <title>My CA Twitter words</title>
   <link rel="stylesheet" href="http://maxcdn.bootstrapcdn.com/bootstrap/3.3.4/css/bootstrap.min.css">
</head>
<body>
    <section class="container">
        <h1>Congress folks who tweeted about "{{ word }}"</h1>
        <div class="row">
           <div class="col-sm-6">
              <h2>Democrats</h2>
              {{ d_tweets }}
           </div>
           <div class="col-sm-6">
              <h2>Republicans</h2>
              {{ r_tweets }}
           </div>
        </div>
     </section>
  </body>
  </html>
""")

# Documentation on Twitter's embed tweet code:
# https://dev.twitter.com/web/embedded-tweets
# autoescape=True because screen_name and tweet_id come from the data files.
embed_tweet = Template("""
<blockquote class="twitter-tweet" lang="en"><p>
<a href="https://twitter.com/{{screen_name}}/status/{{tweet_id}}">X</a>
</p></blockquote>
<script async src="https://platform.twitter.com/widgets.js" charset="utf-8"></script>
""", autoescape=True)


# Rendered embed snippets, collected per column and joined once at the end
d_parts = []
r_parts = []

for member in get_ca_tweeters():
    # the twitter_id is lowercased before it is used to look up data files
    screen_name = member['twitter_id'].lower()
    word_tweets = get_tweets_with_word(screen_name, the_word)
    # if there is at least one tweet with the word, embed the first one
    if word_tweets:
        tweet = word_tweets[0]

        t = embed_tweet.render(screen_name=screen_name, tweet_id=tweet['id'])
        if member['party'] == 'D':
            d_parts.append(t)
        else:
            # NOTE(review): every party other than 'D' lands in the
            # Republicans column -- confirm that is intended for members
            # who belong to neither party.
            r_parts.append(t)

d_str = "".join(d_parts)
r_str = "".join(r_parts)

# Now add the d_str and r_str strings, which contain a bunch of HTML, to the main
# body template. The with-block closes the file once the page is written.
with open("data-hold/word-" + the_word + ".html", "w", encoding="utf-8") as html_file:
    html_file.write(body_template.render(word=the_word, d_tweets=d_str, r_tweets=r_str))
        
File found at: /files/code/opener-project/twitter-word-tweets.py