Getting reTweets from Python

In this snippet I will show how to use the Tweepy library to get statistics from Twitter.

The script will read a sample feed from the Twitter Streaming API (https://web.archive.org/web/20100723015215/http://dev.twitter.com/pages/streaming_api) and perform the following actions:

  • Read the sample feed
  • Note the number of retweets seen
  • Track the number of times posts have been retweeted, and
  • Produce an hourly report of the 20 most frequently retweeted posts
  1#!/usr/bin/env python
  2 
  3"""
  4Tweepy library proof of concept
  5 
  6@author: Adrian Deccico
  7"""
  8 
  9from getpass import getpass
 10import hashlib
 11import time
 12import re
 13import operator
 14import logging
 15import codecs
 16import tweepy
 17 
 18logging.basicConfig(level = logging.INFO)
 19 
 20class TweetListener(tweepy.StreamListener):
 21 
 22    #we use this pattern to decide if a post is a retweet or not, given that retweet fields of the feed don't work
 23    __retweet_pattern = "^(rt|retweet).*$"
 24 
 25 
 26    #statistics
 27    count = 0
 28    found = 0
 29    hour_ranking = {}
 30    retweets = {}
 31 
 32    #constants
 33    TOP_TWEETS = 20 #number of tweets to display in each hour
 34 
 35 
 36    def on_status(self, status):
 37        """callback that will process new tweets"""
 38        try:
 39            self.count += 1
 40            text = status.text
 41            #check wether we got a retweet or not
 42            logging.debug('count %s found %s - %s' % (self.count, self.found, text))
 43            if re.match(self.__retweet_pattern, text, re.IGNORECASE) == None:
 44                return
 45 
 46            self.found += 1
 47 
 48            if text not in self.retweets.keys():
 49                twitt_times = 1
 50            else:
 51                twitt_times = self.retweets[text] + 1
 52 
 53            self.retweets[text] = twitt_times
 54 
 55            hour = status.created_at.strftime("%Y%m%d%H")
 56 
 57            logging.info("hour: %s - times: %s - %s" % (hour, twitt_times, text))
 58            logging.info("Number of retweets found:%s" % self.found)
 59 
 60            if hour in self.hour_ranking.keys():
 61                if text in self.hour_ranking[hour].keys():
 62                    number = self.hour_ranking[hour][text] + 1
 63                else:
 64                    number = 1
 65            else:
 66                number = 1
 67                self.hour_ranking[hour] = {}
 68 
 69            logging.debug("adding %s to key %s" %(number,text))
 70            self.hour_ranking[hour][text] = number
 71            self.printHourlyReport()
 72 
 73        except:
 74            logging.exception("error while analyzing tweets")
 75 
 76    def printHourlyReport(self):
 77        """Print an hourly statistic file in results.txt"""
 78        logging.debug("updating statistics file")
 79        with codecs.open("results.txt", "w", "utf-8") as f:
 80            for h in sorted(self.hour_ranking):
 81                logging.debug(h + " " + str(type(h)))
 82                f.write("Top %s tweets at: %s n" % (self.TOP_TWEETS, h))
 83                count = self.TOP_TWEETS
 84                h_dict = self.hour_ranking[h]
 85                #sort retweets by times and then by text
 86                for t in sorted(h_dict, key=lambda k: (-h_dict[k], k)):
 87                    line = "%s time%s - %s n" % (h_dict[t],"s" if h_dict[t]>1 else "",t)
 88                    f.write(line)
 89                    count -= 1
 90                    if count == 0:
 91                        break
 92                f.write("-------------------------------nn")
 93 
 94    def on_error(self, status_code):
 95        logging.error('An error has occured! Status code = %s' % status_code)
 96        return True  # keep stream alive
 97 
 98    def on_timeout(self):
 99        logging.info('Time out event')
100 
101 
def main():
    """Prompt for Twitter credentials and analyze the sample feed.

    Builds a ``tweepy.Stream`` around a fresh ``TweetListener`` and calls
    ``sample()`` on it.
    """
    # Prompt for login credentials and setup stream object
    username = raw_input('Twitter username: ')
    password = getpass('Twitter password: ')
    stream = tweepy.Stream(username, password, TweetListener(), timeout=None)

    # The report size comes from the listener's constant so the message
    # cannot drift from what is actually written to the file.
    logging.info(
        "Analyzing Tweeter sample feed. Results.txt will be updated in order "
        "to reflect the top %s retweets of each hour.\n",
        TweetListener.TOP_TWEETS)
    stream.sample()
110 
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C is how the user stops the script; report a clean exit
        # instead of a traceback.
        logging.info('\nExecution finished!')