From: Benjamin Mako Hill
Date: Mon, 11 May 2015 23:25:22 +0000 (-0700)
Subject: made a collection of twitter api solutions
X-Git-Url: https://projects.mako.cc/source/twitter-api-cdsw-solutions/commitdiff_plain/d4653b5f599083dc7631ff1a215096ac58b626d8?hp=1e2d406ccc7bfaa9c73ab809367ce5ae2ab6fc79

made a collection of twitter api solutions
---

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f3e0a25
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+## Followers
+
+1. Write a program to find out how many people a particular user follows.
+2. For each of your followers, find out how many followers they have.
+3. Make a "famous ratio" for a given user, which I'll define as the number of followers a person has divided by the number of people they follow. Try out @makoshark and @pontifex (the Pope). Who is higher?
+4. [SKIPPED] Identify the follower you have who also follows the most of your followers.
+5. [SKIPPED] How many users follow you but follow none of your followers?
+6. [SKIPPED] Repeat these analyses for people you follow, rather than people who follow you.
+7. Identify the "famous ratio" for every one of your followers or friends. Who has the highest one?
+
+## Topics and Trends
+
+1. Modify twitter3.py to produce a list of 1000 tweets about a topic of your choice.
+2. Look at those tweets. How does Twitter interpret a two-word query like "data science"?
+3. Do the previous step but eliminate retweets [hint: look at the tweet object!]
+4. For each original tweet, list the number of times you see it retweeted.
+5. Get a list of the URLs that are associated with your topic using Twitter.
+
+## Geolocation
+
+1. Alter the streaming code to include a "locations" filter. You need to use the order sw_lng, sw_lat, ne_lng, ne_lat for the four coordinates.
+2. What are people tweeting about in Times Square today?
+3. Set up a bounding box around Times Square and around NYC as a whole.
+4. Do "static" (i.e., not using the streaming API) geolocation search using code like this: d = api.search(geocode='37.781157,-122.398720,1mi')
diff --git a/twitter2.py b/solution-followers-1.py
similarity index 61%
rename from twitter2.py
rename to solution-followers-1.py
index e88fe95..d200ed2 100644
--- a/twitter2.py
+++ b/solution-followers-1.py
@@ -1,3 +1,5 @@
+# Q: Write a program to find out how many people a particular user follows.
+
 import encoding_fix
 import tweepy
 from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
@@ -9,10 +11,6 @@ api = tweepy.API(auth)
 
 user = api.get_user('makoshark')
 
-print(user.screen_name + " has " + str(user.followers_count) + " followers.")
-
-print("They include these 100 people:")
+print(user.screen_name + " follows " + str(user.friends_count) + " accounts.")
 
-for follower in user.followers(count=100):
-    print(follower.screen_name)
diff --git a/solution-followers-2-cursor.py b/solution-followers-2-cursor.py
new file mode 100644
index 0000000..d5c6772
--- /dev/null
+++ b/solution-followers-2-cursor.py
@@ -0,0 +1,53 @@
+# For each of your followers, find out how many followers they have.
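+
+# A sketch of the plan (the helper name is hypothetical, not a Tweepy
+# function): page through follower IDs with a Cursor, then hydrate them
+# in batches of 100 with api.lookup_users(), roughly:
+#
+#     for batch in batches_of_100(follower_ids):
+#         users.extend(api.lookup_users(user_ids=batch))
+#
+# The code below does the same thing with an explicit counter.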
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+
+user = api.get_user("makoshark")
+
+# I found the list of functions in Tweepy here:
+# https://tweepy.readthedocs.org/en/v3.2.0/api.html
+
+# I found the idea of how to use the Cursor here:
+# https://tweepy.readthedocs.org/en/v3.2.0/cursor_tutorial.html
+
+follower_ids = []
+for page in tweepy.Cursor(api.followers_ids, screen_name="makoshark").pages():
+    for follower in page:
+        follower_ids.append(follower)
+
+
+# The answer uses api.lookup_users(). Unfortunately, it only accepts 100
+# user IDs at a time, so the following code batches the IDs into groups
+# of 100.
+counter = 0
+tmp_ids = []
+users = []
+for follower in follower_ids:
+    tmp_ids.append(follower)
+    counter = counter + 1
+
+    # if we've hit 100, we grab data and then reset things and keep going
+    if counter == 100:
+        tmp_users = api.lookup_users(user_ids=tmp_ids)
+        users = users + tmp_users
+
+        counter = 0
+        tmp_ids = []
+
+# run once more for the final, possibly partial, batch (skip the call if
+# the follower count was an exact multiple of 100 and the list is empty)
+if tmp_ids:
+    tmp_users = api.lookup_users(user_ids=tmp_ids)
+    users = users + tmp_users
+
+# run through and print each follower and their follower count
+for user in users:
+    print("%s : %s" % (user.screen_name, user.followers_count))
diff --git a/solution-followers-2.py b/solution-followers-2.py
new file mode 100644
index 0000000..a0a87ef
--- /dev/null
+++ b/solution-followers-2.py
@@ -0,0 +1,23 @@
+# For each of your followers, find out how many followers they have.
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+
+user = api.get_user("makoshark")
+
+for follower in user.followers():
+    print("%s : %s" % (follower.screen_name, follower.followers_count))
+
+    # According to this page, we can make 180 requests for user
+    # information each 15-minute period, or one every 5 seconds:
+    #
+    # https://dev.twitter.com/rest/reference/get/users/show
+    time.sleep(5)
diff --git a/solution-followers-3.py b/solution-followers-3.py
new file mode 100644
index 0000000..8c0ad55
--- /dev/null
+++ b/solution-followers-3.py
@@ -0,0 +1,20 @@
+# Make a "famous ratio" for a given user, which I'll define as the number
+# of followers a person has divided by the number of people they follow.
+# Try out @makoshark and @pontifex (the Pope). Who is higher?
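+
+# Caveat (mine, not part of the assignment): friends_count can be zero for
+# an account that follows nobody, and the division below would then raise
+# ZeroDivisionError. A more defensive version might look like:
+#
+#     def famous_ratio(username):
+#         user = api.get_user(username)
+#         if user.friends_count == 0:
+#             return float('inf')  # arbitrary stand-in for "follows nobody"
+#         return user.followers_count / user.friends_count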
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+
+def famous_ratio(username):
+    user = api.get_user(username)
+    return(user.followers_count / user.friends_count)
+
+print("mako: %s" % famous_ratio('makoshark'))
+print("the pope: %s" % famous_ratio('pontifex'))
diff --git a/solution-followers-7.py b/solution-followers-7.py
new file mode 100644
index 0000000..817fbcf
--- /dev/null
+++ b/solution-followers-7.py
@@ -0,0 +1,59 @@
+# Identify the "famous ratio" (the number of followers a person has
+# divided by the number of people they follow) for every one of your
+# followers. Who has the highest one?
+#
+# This works for all users in my follower list.
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)
+
+user = api.get_user("makoshark")
+
+# I found the list of functions in Tweepy here:
+# https://tweepy.readthedocs.org/en/v3.2.0/api.html
+
+# I found the idea of how to use the Cursor here:
+# https://tweepy.readthedocs.org/en/v3.2.0/cursor_tutorial.html
+
+follower_ids = []
+for page in tweepy.Cursor(api.followers_ids, screen_name="makoshark").pages():
+    for follower in page:
+        follower_ids.append(follower)
+
+
+# The answer uses api.lookup_users(). Unfortunately, it only accepts 100
+# user IDs at a time, so the following code batches the IDs into groups
+# of 100.
+counter = 0
+tmp_ids = []
+users = []
+for follower in follower_ids:
+    tmp_ids.append(follower)
+    counter = counter + 1
+
+    # if we've hit 100, we grab data and then reset things and keep going
+    if counter == 100:
+        tmp_users = api.lookup_users(user_ids=tmp_ids)
+        users = users + tmp_users
+
+        counter = 0
+        tmp_ids = []
+
+# run once more for the final, possibly partial, batch
+if tmp_ids:
+    tmp_users = api.lookup_users(user_ids=tmp_ids)
+    users = users + tmp_users
+
+# print out the famous ratios for users
+famous_ratios = {}
+for user in users:
+    # skip accounts that follow nobody to avoid dividing by zero
+    if user.friends_count == 0:
+        continue
+    famous_ratios[user.screen_name] = user.followers_count / user.friends_count
+
+for user in sorted(famous_ratios, key=famous_ratios.get, reverse=True):
+    print(user, famous_ratios[user])
diff --git a/twitter-stream2.py b/solution-geo-1.py
similarity index 56%
rename from twitter-stream2.py
rename to solution-geo-1.py
index 47855d8..3ba56d3 100644
--- a/twitter-stream2.py
+++ b/solution-geo-1.py
@@ -1,3 +1,10 @@
+# Alter the streaming code to include a "locations" filter. You need
+# to use the order sw_lng, sw_lat, ne_lng, ne_lat for the four
+# coordinates.
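+
+# To keep the coordinate order straight, it can help to name the corners
+# first (a sketch; the variable names are mine):
+#
+#     sw_lng, sw_lat = -122.459696, 47.481002   # southwest corner
+#     ne_lng, ne_lat = -122.224433, 47.734136   # northeast corner
+#     streamer.filter(locations=[sw_lng, sw_lat, ne_lng, ne_lat])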
+
+# Note: to answer this, I used this website to find a good box:
+# http://boundingbox.klokantech.com/
+
 import encoding_fix
 import tweepy
 from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
@@ -12,11 +19,12 @@ class StreamListener(tweepy.StreamListener):
         print(tweet.author.screen_name + "\t" + tweet.text)
 
     def on_error(self, status_code):
-        print( 'Error: ' + repr(status_code))
+        print('Error: ' + repr(status_code))
         return False
 
 l = StreamListener()
 streamer = tweepy.Stream(auth=auth, listener=l)
 
-keywords = ['python', 'perl']
-streamer.filter(track = keywords)
+# This should grab tweets within Seattle:
+streamer.filter(locations=[-122.459696, 47.481002, -122.224433, 47.734136])
+
diff --git a/twitter-stream1.py b/solution-geo-2.py
similarity index 67%
rename from twitter-stream1.py
rename to solution-geo-2.py
index e94422d..c7830ff 100644
--- a/twitter-stream1.py
+++ b/solution-geo-2.py
@@ -1,3 +1,8 @@
+# What are people tweeting about in Times Square today?
+
+# Note: to answer this, I used this website to find a good box:
+# http://boundingbox.klokantech.com/
+
 import encoding_fix
 import tweepy
 from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
@@ -18,4 +23,6 @@
 
 l = StreamListener()
 streamer = tweepy.Stream(auth=auth, listener=l)
 
-streamer.sample()
+# This should grab tweets in Times Square:
+streamer.filter(locations=[-73.9864799803, 40.7575460197, -73.9837820197, 40.7602439803])
+
diff --git a/solution-geo-3.py b/solution-geo-3.py
new file mode 100644
index 0000000..da2a31e
--- /dev/null
+++ b/solution-geo-3.py
@@ -0,0 +1,33 @@
+# Set up a bounding box around Times Square and around NYC as a whole.
+
+# Alter the streaming code to include a "locations" filter. You need
+# to use the order sw_lng, sw_lat, ne_lng, ne_lat for the four
+# coordinates.
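+
+# The "locations" filter reads four numbers per box, so two boxes are
+# passed as eight numbers. One way to keep that readable (a sketch; the
+# names are mine):
+#
+#     times_square = [-73.9864799803, 40.7575460197, -73.9837820197, 40.7602439803]
+#     nyc = [-74.25909, 40.477399, -73.700171, 40.917577]
+#     streamer.filter(locations=times_square + nyc)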
+
+# Note: to answer this, I used this website to find a good box:
+# http://boundingbox.klokantech.com/
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+class StreamListener(tweepy.StreamListener):
+    def on_status(self, tweet):
+        print(tweet.author.screen_name + "\t" + tweet.text)
+
+    def on_error(self, status_code):
+        print('Error: ' + repr(status_code))
+        return False
+
+l = StreamListener()
+streamer = tweepy.Stream(auth=auth, listener=l)
+
+# This should grab tweets in Times Square /and/ NYC as a whole
+streamer.filter(locations=[-73.9864799803, 40.7575460197, -73.9837820197, 40.7602439803,
+                           -74.25909, 40.477399, -73.700171, 40.917577])
+
diff --git a/solution-geo-4.py b/solution-geo-4.py
new file mode 100644
index 0000000..05a6f9b
--- /dev/null
+++ b/solution-geo-4.py
@@ -0,0 +1,28 @@
+# Do "static" (i.e., not using the streaming API) geolocation search
+# using code like this: d = api.search(geocode='37.781157,-122.398720,1mi')
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+# 100 is the maximum number that can be returned according to:
+# https://dev.twitter.com/rest/reference/get/search/tweets
+
+counter = 0
+for page in tweepy.Cursor(api.search, "party", geocode='37.781157,-122.398720,1mi', count=100).pages():
+    counter = counter + len(page)
+    for tweet in page:
+        print(tweet.user.screen_name + "\t" + str(tweet.created_at) + "\t" + tweet.text)
+
+    # end this loop once we've gotten at least 1000 (pages can come back
+    # short, so testing with == could skip past 1000 and never stop)
+    if counter >= 1000:
+        break
+
+    # This page suggests we can do one request every 5 seconds:
+    # https://dev.twitter.com/rest/reference/get/search/tweets
+    time.sleep(5)
diff --git a/solution-topics-1.py b/solution-topics-1.py
new file mode 100644
index 0000000..b2c8503
--- /dev/null
+++ b/solution-topics-1.py
@@ -0,0 +1,31 @@
+# Modify twitter3.py to produce a list of 1000 tweets about a topic of
+# your choice.
+
+# Note: I've changed it to search for "community data" instead of "data science."
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+# 100 is the maximum number that can be returned according to:
+# https://dev.twitter.com/rest/reference/get/search/tweets
+
+counter = 0
+for page in tweepy.Cursor(api.search, "community data", count=100).pages():
+    counter = counter + len(page)
+    for tweet in page:
+        print(tweet.user.screen_name + "\t" + str(tweet.created_at) + "\t" + tweet.text)
+
+    # end this loop once we've gotten at least 1000
+    if counter >= 1000:
+        break
+
+    # This page suggests we can do one request every 5 seconds:
+    # https://dev.twitter.com/rest/reference/get/search/tweets
+    time.sleep(5)
+
diff --git a/solution-topics-2.py b/solution-topics-2.py
new file mode 100644
index 0000000..9405fd1
--- /dev/null
+++ b/solution-topics-2.py
@@ -0,0 +1,4 @@
+# 2. Look at those tweets. How does Twitter interpret a two-word query
+# like "data science"?
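+
+# One way to check (a sketch, reusing the auth setup from the other
+# solutions): compare an unquoted query against an exact-phrase query and
+# see how many of the loose results actually contain both words.
+#
+#     loose = api.search('community data', count=100)
+#     exact = api.search('"community data"', count=100)
+#     print(len(loose), len(exact))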
+
+# For two-word searches, it seems to search for community *or* data.
diff --git a/solution-topics-3.py b/solution-topics-3.py
new file mode 100644
index 0000000..7a08746
--- /dev/null
+++ b/solution-topics-3.py
@@ -0,0 +1,30 @@
+# Do the previous step but eliminate retweets [hint: look at the tweet object!]
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+counter = 0
+for page in tweepy.Cursor(api.search, "community data", count=100).pages():
+    counter = counter + len(page)
+    for tweet in page:
+        # use the hasattr() function to determine if a tweet is a retweet:
+        # only retweets carry a retweeted_status attribute
+        if not hasattr(tweet, 'retweeted_status'):
+            print(tweet.user.screen_name + "\t" + str(tweet.created_at) + "\t" + tweet.text)
+
+    # end this loop once we've gotten at least 1000
+    if counter >= 1000:
+        break
+
+    # This page suggests we can do one request every 5 seconds:
+    # https://dev.twitter.com/rest/reference/get/search/tweets
+    time.sleep(5)
+
diff --git a/solution-topics-4.py b/solution-topics-4.py
new file mode 100644
index 0000000..8eea9c2
--- /dev/null
+++ b/solution-topics-4.py
@@ -0,0 +1,30 @@
+# For each original tweet, list the number of times you see it retweeted.
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+counter = 0
+for page in tweepy.Cursor(api.search, "community data", count=100).pages():
+    counter = counter + len(page)
+    for tweet in page:
+        # use the hasattr() function to determine if a tweet is a retweet
+        if not hasattr(tweet, 'retweeted_status'):
+            print("%s : %s " % (tweet.text, tweet.retweet_count))
+
+    # end this loop once we've gotten at least 1000
+    if counter >= 1000:
+        break
+
+    # This page suggests we can do one request every 5 seconds:
+    # https://dev.twitter.com/rest/reference/get/search/tweets
+    time.sleep(5)
+
diff --git a/solution-topics-5.py b/solution-topics-5.py
new file mode 100644
index 0000000..0371026
--- /dev/null
+++ b/solution-topics-5.py
@@ -0,0 +1,28 @@
+# Get a list of the URLs that are associated with your topic using Twitter.
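+
+# Background, paraphrasing the REST API docs as I understand them: each
+# tweet carries an "entities" attribute, and entities["urls"] is a list
+# of dicts with keys like "url" (the shortened t.co link) and
+# "expanded_url" (the original link). The loop below prints "expanded_url".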
+
+import encoding_fix
+import tweepy
+from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
+import time
+
+auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
+auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
+
+api = tweepy.API(auth)
+
+counter = 0
+for page in tweepy.Cursor(api.search, "community data", count=100).pages():
+    counter = counter + len(page)
+    for tweet in page:
+
+        # urls seem to be stored in tweet.entities["urls"]
+        for url in tweet.entities["urls"]:
+            print(url["expanded_url"])
+
+    # end this loop once we've gotten at least 1000
+    if counter >= 1000:
+        break
+
+    # This page suggests we can do one request every 5 seconds:
+    # https://dev.twitter.com/rest/reference/get/search/tweets
+    time.sleep(5)
diff --git a/twitter-stream-raw1.py b/twitter-stream-raw1.py
deleted file mode 100644
index b6ea13b..0000000
--- a/twitter-stream-raw1.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import encoding_fix
-import json
-import tweepy
-from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
-
-auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
-auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
-
-api = tweepy.API(auth, parser=tweepy.parsers.RawParser())
-
-@classmethod
-def parse(cls, api, raw):
-    status = cls.first_parse(api, raw)
-    setattr(status, 'json', json.dumps(raw))
-    return status
-
-tweepy.models.Status.first_parse = tweepy.models.Status.parse
-tweepy.models.Status.parse = parse
-
-class StreamListener(tweepy.StreamListener):
-    def on_status(self, tweet):
-        print(tweet.json)
-
-    def on_error(self, status_code):
-        print('Error: ' + repr(status_code))
-        return False
-
-l = StreamListener()
-streamer = tweepy.Stream(auth=auth, listener=l)
-
-streamer.sample()
diff --git a/twitter1-cursor.py b/twitter1-cursor.py
deleted file mode 100644
index dfb1188..0000000
--- a/twitter1-cursor.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import encoding_fix
-import tweepy
-from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
-import time
-
-auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
-auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
-
-api = tweepy.API(auth)
-
-# I found the idea of how to the user the Cursor here:
-# https://tweepy.readthedocs.org/en/v3.2.0/cursor_tutorial.html
-for page in tweepy.Cursor(api.home_timeline, count=200).pages():
-    for tweet in page:
-        print(tweet.text)
-    time.sleep(1)
diff --git a/twitter1.py b/twitter1.py
deleted file mode 100644
index 26b6337..0000000
--- a/twitter1.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import encoding_fix
-import tweepy
-from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
-
-auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
-auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
-
-api = tweepy.API(auth)
-
-public_tweets = api.home_timeline(count=100)
-
-for tweet in public_tweets:
-    print(tweet.text)
diff --git a/twitter3.py b/twitter3.py
deleted file mode 100644
index 16329e3..0000000
--- a/twitter3.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import encoding_fix
-import tweepy
-from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
-
-auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
-auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
-
-api = tweepy.API(auth)
-
-public_tweets = api.search("data science", count=20)
-
-for tweet in public_tweets:
-    print(tweet.user.screen_name + "\t"
-          + str(tweet.created_at) + "\t" + tweet.text)
diff --git a/twitter4.py b/twitter4.py
deleted file mode 100644
index 3030edf..0000000
--- a/twitter4.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import encoding_fix
-import tweepy
-from twitter_authentication import CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET
-
-auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
-auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
-
-api = tweepy.API(auth)
-
-# code to write the file
-output_file = open("MY_DATA.tsv", "w", encoding="utf-8")
-
-public_tweets = api.search("data science", count=10)
-
-for tweet in public_tweets:
-    print(tweet.user.screen_name + "\t" + str(tweet.created_at) + "\t" + tweet.text, file=output_file)
-