# Twitter data collection

Note that you will need an API key from Twitter to use this notebook. You can create one here: https://apps.twitter.com

## Imports
- `pandas` is a package for handling and manipulating data.
- `tweepy` is a package that helps you use the Twitter API in Python.
- `time` is a package that lets you work with time in Python.
- `json` is a package for handling data in JSON format.

In [None]:
import json
import os
import time

import pandas as pd
import tweepy

## Keys
- You'll need API keys from twitter to use their API. Once you have them, they can go here.

In [None]:
# placeholder credentials: put your own consumer key and secret in place of 'xxx'
keys = dict(CONSUMER_KEY='xxx', CONSUMER_SECRET='xxx')

### Loading a key from a file
- I already have a key saved in a file called `twitter_auth`. 
- This code reads that file and uses the key.

In [None]:
# read the saved credentials; the file is expected to contain JSON
with open('twitter_auth', 'r') as f:
    keys = json.load(f)

# show only which entries were loaded: echoing `keys` itself would store the
# secret values in the notebook output, where they are easy to share by accident
sorted(keys)

### Using keys
- Once you have keys, you'll need to give them to `tweepy` so that it can use them to get the data you request.

In [None]:
# Creating the handler is the step that can fail (for example with bad keys),
# and it fails by raising an error. A tweepy.API object is always truthy, so
# checking `if not api` afterwards would never report a problem.
try:
    auth = tweepy.AppAuthHandler(keys['CONSUMER_KEY'],
                                 keys['CONSUMER_SECRET'])
except Exception:
    print ("Can't Authenticate :(")
    # re-raise so the original error is still visible and later cells don't
    # run with a missing `api`
    raise

# wait_on_rate_limit makes tweepy pause instead of failing when the rate limit
# is reached; wait_on_rate_limit_notify makes it print a message when it does
api = tweepy.API(auth, wait_on_rate_limit=True,
                 wait_on_rate_limit_notify=True)

print('Authenticated successfully!')

## User account names we want to get tweets for

In [None]:
# screen names of the accounts to collect data for
users = [
    'nyctaxi',
    'NYTWA',
    'NYC_TMODA',
    'YellowCabNYC',
    'NYC_DOT',
    'NYCTSubway',
]

## Some functions to help us
- Normally we wouldn't write code this repetitive, but it is written out in full here as an example.

In [None]:
# create a function that takes a username and the tweepy API as arguments
def get_user_tweets(name, api):
    """Collect the tweets tweepy can page through for one account.

    Args:
        name: Screen name of the account whose timeline is requested.
        api: The tweepy API object used to make the requests.

    Returns:
        A list holding the ``_json`` value of each tweet, in the order tweepy
        yielded them. If the request fails (for example because the user does
        not exist), the reason is printed and whatever was collected before
        the failure is returned, which may be an empty list.
    """
    # make an empty list to save tweets in
    tweets = []

    # ask python to try and do the following code.
    # if the code breaks, python will instead do what's under "except"
    try:
        # loop over each tweet tweepy finds
        for status in tweepy.Cursor(api.user_timeline,  # the kind of data we want
                                    screen_name=name,   # the user we want data for
                                    count=200).items(): # the maximum number to get at once
            try:
                # try to get this tweet's json and add it to our list
                tweets.append(status._json)
            except AttributeError:
                # sometimes a tweet is broken or missing, so we can't get its json
                # here we just skip those tweets
                continue
    except Exception as err:
        # sometimes we can't get anything for a user, perhaps because the user doesn't exist.
        # we still skip them, but say why, so a failure is not mistaken for "no tweets".
        # "except Exception" (rather than a bare "except") lets you stop a long
        # download with the interrupt button: KeyboardInterrupt is not caught here.
        print("Could not get tweets for", name, "-", repr(err))

    # take the list of tweets we made, and give it back to whatever code asked for it
    return tweets


def get_user_followers(name, api):
    """Collect up to 5,000 follower profiles for one account.

    Args:
        name: Screen name of the account whose followers are requested.
        api: The tweepy API object used to make the requests.

    Returns:
        A list holding the ``_json`` value of each follower. Collection stops
        once 5,000 have been gathered. If the request fails, the reason is
        printed and whatever was collected so far is returned.
    """
    followers = []
    try:
        for user in tweepy.Cursor(api.followers,
                                  screen_name=name,
                                  count=200).items():
            try:
                followers.append(user._json)
            except AttributeError:
                # this entry has no json to save; skip it
                pass
            if len(followers) >= 5000:
                print(name, "has more than 5,000 followers. Stopping at 5,000.")
                break
    except Exception as err:
        # keep going with the next user, but report the problem instead of
        # hiding it; KeyboardInterrupt is not caught, so the download can
        # still be interrupted
        print("Could not get followers for", name, "-", repr(err))
    return followers

def get_user_followers_ids(name, api):
    """Collect up to 5,000 follower IDs for one account.

    Args:
        name: Screen name of the account whose follower IDs are requested.
        api: The tweepy API object used to make the requests.

    Returns:
        A dict ``{'name': name, 'followers': [...]}`` where the list holds
        each item yielded by the ``followers_ids`` cursor. Collection stops
        once 5,000 have been gathered. If the request fails, the reason is
        printed and the list holds whatever was collected so far.
    """
    followers = []
    try:
        for follower_id in tweepy.Cursor(api.followers_ids,
                                         screen_name=name,
                                         count=5000).items():
            # appending to a list cannot fail, so no inner try is needed here
            followers.append(follower_id)
            if len(followers) >= 5000:
                print(name, "has more than 5,000 followers. Stopping at 5,000.")
                break
    except Exception as err:
        # keep going with the next user, but report the problem instead of
        # hiding it; KeyboardInterrupt is not caught, so the download can
        # still be interrupted
        print("Could not get follower IDs for", name, "-", repr(err))
    return {'name': name, 'followers': followers}


def get_user_friends(name, api):
    """Collect up to 5,000 friend profiles for one account.

    Args:
        name: Screen name of the account whose friends are requested.
        api: The tweepy API object used to make the requests.

    Returns:
        A list holding the ``_json`` value of each friend. Collection stops
        once 5,000 have been gathered. If the request fails, the reason is
        printed and whatever was collected so far is returned.
    """
    friends = []
    try:
        for user in tweepy.Cursor(api.friends,
                                  screen_name=name,
                                  count=200).items():
            try:
                friends.append(user._json)
            except AttributeError:
                # this entry has no json to save; skip it
                pass
            if len(friends) >= 5000:
                print(name, "has more than 5,000 friends. Stopping at 5,000.")
                break
    except Exception as err:
        # keep going with the next user, but report the problem instead of
        # hiding it; KeyboardInterrupt is not caught, so the download can
        # still be interrupted
        print("Could not get friends for", name, "-", repr(err))
    return friends

def get_user_friends_ids(name, api):
    """Collect up to 5,000 friend IDs for one account.

    Args:
        name: Screen name of the account whose friend IDs are requested.
        api: The tweepy API object used to make the requests.

    Returns:
        A dict ``{'name': name, 'friends': [...]}`` where the list holds each
        item yielded by the ``friends_ids`` cursor. Collection stops once
        5,000 have been gathered. If the request fails, the reason is printed
        and the list holds whatever was collected so far.
    """
    friends = []
    try:
        for friend_id in tweepy.Cursor(api.friends_ids,
                                       screen_name=name,
                                       count=5000).items():
            # appending to a list cannot fail, so no inner try is needed here
            friends.append(friend_id)
            if len(friends) >= 5000:
                print(name, "has more than 5,000 friends. Stopping at 5,000.")
                break
    except Exception as err:
        # keep going with the next user, but report the problem instead of
        # hiding it; KeyboardInterrupt is not caught, so the download can
        # still be interrupted
        print("Could not get friend IDs for", name, "-", repr(err))
    return {'name': name, 'friends': friends}

In [None]:
# gather every account's tweets into one flat list
all_tweets = []

for u in users:
    print("Getting tweets for user {}".format(u))
    tweets = get_user_tweets(u, api)
    print("Found {} tweets.".format(len(tweets)))
    # add this user's tweets to the end of the combined list
    all_tweets += tweets

print('Done!')

## What does the data look like?

In [None]:
# look at the first collected tweet (raises IndexError if no tweets were collected)
all_tweets[0]

## Save our data

In [None]:
# make sure the output folder exists first: open() does not create folders,
# and would fail with FileNotFoundError if 'data' were missing
os.makedirs('data', exist_ok=True)

# open a file to save our tweets into
with open('data/tweets_raw.json', 'w') as out_file:
    #loop through all tweets
    for t in all_tweets:
        #save each tweet to the file as json
        json.dump(t, out_file)
        #write each tweet on its own line
        out_file.write('\n')

## Convert it to pandas

In [None]:
# turn the list of tweets into a table, then preview its first five rows
df = pd.DataFrame(data=all_tweets)
df.head(n=5)

## Friends

In [None]:
# gather every account's friends into one flat list
all_friends = []

for u in users:
    print("Getting friends for user {}".format(u))
    friends = get_user_friends(u, api)
    print("Found {} friends.".format(len(friends)))
    # add this user's friends to the end of the combined list
    all_friends += friends

print('Done!')

In [None]:
# look at the first collected friend (raises IndexError if no friends were collected)
all_friends[0]

In [None]:
# make sure the output folder exists first: open() does not create folders,
# and would fail with FileNotFoundError if 'data' were missing
os.makedirs('data', exist_ok=True)

# save each friend as json, one per line
with open('data/friends_raw.json', 'w') as out_file:
    for t in all_friends:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers

In [None]:
# gather every account's followers into one flat list
all_followers = []

for u in users:
    print("Getting followers for user {}".format(u))
    followers = get_user_followers(u, api)
    print("Found {} followers.".format(len(followers)))
    # add this user's followers to the end of the combined list
    all_followers += followers

print('Done!')

In [None]:
# look at the first collected follower (raises IndexError if no followers were collected)
all_followers[0]

In [None]:
# make sure the output folder exists first: open() does not create folders,
# and would fail with FileNotFoundError if 'data' were missing
os.makedirs('data', exist_ok=True)

# save each follower as json, one per line
with open('data/followers_raw.json', 'w') as out_file:
    for t in all_followers:
        json.dump(t, out_file)
        out_file.write('\n')

## Followers IDs only

In [None]:
# build a list with one {'name': ..., 'followers': [...]} entry per account
all_followers_ids = []

for u in users:
    print("Getting follower IDs for user {}".format(u))
    followers = get_user_followers_ids(u, api)
    print("Found {} followers.".format(len(followers['followers'])))
    # keep each user's result as its own entry rather than merging the lists
    all_followers_ids.append(followers)

print('Done!')

In [None]:
# look at the third entry, i.e. the result for the third account in `users`
all_followers_ids[2]

In [None]:
# make sure the output folder exists first: open() does not create folders,
# and would fail with FileNotFoundError if 'data' were missing
os.makedirs('data', exist_ok=True)

# save each user's follower-ID entry as json, one per line
with open('data/followers_ids_raw.json', 'w') as out_file:
    for t in all_followers_ids:
        json.dump(t, out_file)
        out_file.write('\n')