Getting reTweets from Python
In this snippet I will show how to use the Tweepy library to get statistics from Twitter.
The script will read a sample feed from the Twitter Streaming API (https://web.archive.org/web/20100723015215/http://dev.twitter.com/pages/streaming_api) and perform the following actions:
- Read the sample feed
- Note the number of retweets seen
- Track the number of times posts have been retweeted, and
- Produce an hourly report of the 20 most frequently retweeted posts
1#!/usr/bin/env python
2
3"""
4Tweepy library proof of concept
5
6@author: Adrian Deccico
7"""
8
9from getpass import getpass
10import hashlib
11import time
12import re
13import operator
14import logging
15import codecs
16import tweepy
17
# Configure the root logger once at import time so that INFO-level (and
# higher) records emitted through the module-level logging.* calls are shown.
logging.basicConfig(level = logging.INFO)
19
class TweetListener(tweepy.StreamListener):
    """Stream listener that counts retweets and ranks them per hour.

    Every status received increments ``count``.  Statuses whose text looks
    like a retweet increment ``found``, are tallied globally in ``retweets``
    (text -> times seen) and per hour in ``hour_ranking``
    (``"YYYYMMDDHH"`` -> {text -> times seen}).  After each retweet the
    report file ``results.txt`` is rewritten from scratch.
    """

    # We use this pattern to decide if a post is a retweet or not, given that
    # the retweet fields of the feed don't work.  Compiled once because it is
    # applied to every incoming status.
    __retweet_pattern = re.compile(r"^(rt|retweet).*$", re.IGNORECASE)

    # statistics (immutable class-level defaults; rebinding through ``self``
    # creates per-instance values)
    count = 0
    found = 0

    # constants
    TOP_TWEETS = 20  # number of tweets to display in each hour

    def __init__(self, *args, **kwargs):
        """Create a listener with its own, empty statistics tables.

        Positional and keyword arguments are passed through unchanged to the
        base listener's constructor.
        """
        super(TweetListener, self).__init__(*args, **kwargs)
        # Per-instance dicts: as class attributes they would be shared (and
        # mutated) by every listener instance.
        self.hour_ranking = {}
        self.retweets = {}

    def on_status(self, status):
        """Callback that will process new tweets.

        Reads ``status.text`` and ``status.created_at``.  Non-retweets are
        only counted; retweets update the global and hourly tallies and
        trigger a rewrite of the report file.  Errors are logged rather than
        propagated so that one bad status does not stop the stream.
        """
        try:
            self.count += 1
            text = status.text
            # check whether we got a retweet or not
            logging.debug('count %s found %s - %s', self.count, self.found, text)
            if self.__retweet_pattern.match(text) is None:
                return

            self.found += 1

            twitt_times = self.retweets.get(text, 0) + 1
            self.retweets[text] = twitt_times

            # Bucket key with hour resolution, e.g. "2010072301".
            hour = status.created_at.strftime("%Y%m%d%H")

            logging.info("hour: %s - times: %s - %s", hour, twitt_times, text)
            logging.info("Number of retweets found:%s", self.found)

            hour_counts = self.hour_ranking.setdefault(hour, {})
            number = hour_counts.get(text, 0) + 1

            logging.debug("adding %s to key %s", number, text)
            hour_counts[text] = number
            self.printHourlyReport()

        except Exception:
            # Deliberately not a bare ``except``: KeyboardInterrupt and
            # SystemExit must still be able to stop the program.
            logging.exception("error while analyzing tweets")

    def printHourlyReport(self):
        """Print an hourly statistic file in results.txt.

        The file is overwritten on every call.  For each hour (in ascending
        order) it lists at most ``TOP_TWEETS`` retweets, most frequent first,
        with ties broken by the tweet text.
        """
        logging.debug("updating statistics file")
        with codecs.open("results.txt", "w", "utf-8") as f:
            for h in sorted(self.hour_ranking):
                logging.debug("%s %s", h, type(h))
                f.write("Top %s tweets at: %s \n" % (self.TOP_TWEETS, h))
                h_dict = self.hour_ranking[h]
                # sort retweets by times (descending) and then by text
                ranked = sorted(h_dict, key=lambda k: (-h_dict[k], k))
                for t in ranked[:self.TOP_TWEETS]:
                    times = h_dict[t]
                    plural = "s" if times > 1 else ""
                    f.write("%s time%s - %s \n" % (times, plural, t))
                f.write("-------------------------------\n\n")

    def on_error(self, status_code):
        """Log the error status code and ask the stream to keep running."""
        logging.error('An error has occured! Status code = %s', status_code)
        return True  # keep stream alive

    def on_timeout(self):
        """Log that the stream reported a timeout."""
        logging.info('Time out event')
101
def main():
    """Prompt for Twitter credentials and analyze the sample stream.

    Builds a ``tweepy.Stream`` with a ``TweetListener`` and starts reading
    the sample feed; the listener keeps results.txt up to date.
    """
    # ``raw_input`` only exists on Python 2; fall back to ``input`` elsewhere.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    # Prompt for login credentials and setup stream object
    username = read_line('Twitter username: ')
    password = getpass('Twitter password: ')
    # NOTE(review): username/password construction matches the Tweepy API
    # this script was written against -- confirm against the installed version.
    stream = tweepy.Stream(username, password, TweetListener(), timeout=None)

    logging.info("Analyzing Tweeter sample feed. Results.txt will be updated in order to reflect the top 20 retweets of each hour.\n")
    stream.sample()
110
# Script entry point: run until the user interrupts the stream.
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is handled here so the script exits with a message
        # instead of a traceback.
        logging.info('\nExecution finished!')