-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweet_dumper.py
135 lines (123 loc) · 4.97 KB
/
tweet_dumper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import tweepy #https://github.com/tweepy/tweepy
import sys
import os
import re
# update the id of the lasttweet stored in the given filename
# if the file doesn't exist, one is created
def updateLastTweet(filename, screen_name, newLastTweet):
tmpLine = ""
newData = ""
if not os.path.isfile(filename):
newFile = open(filename, 'w')
newFile.close()
with open(filename, 'r+b') as f:
for line in f:
tmpLine = line.split(':')
if tmpLine[0] == screen_name:
newData += tmpLine[0] + ":" + str(newLastTweet) + "\n"
else:
newData += line
newF = open(filename + ".swp", 'w')
newF.write(newData)
newF.close()
os.remove(filename)
os.rename(filename + ".swp", filename)
# get the id of the most recent tweet recorded from the file
# specified by filename
def getMostRecentTweet(filename, screen_name):
if not os.path.isfile(filename):
newFile = open(filename, 'w')
newFile.close()
with open(filename, 'r+b') as f:
for line in f:
tmpLine = line.split(":")
if tmpLine[0] == screen_name:
MostRecentTweetPulled = tmpLine[1].strip()
f.close()
return MostRecentTweetPulled
f = open(filename, 'a')
newEntry = screen_name + ":0"
f.write(newEntry)
f.close()
return 0
def get_all_tweets3args(screen_name, api, amount):
filename = "%s_tweets.txt" % screen_name
lastTweetId = "lastTweetScraped.txt"
get_all_tweets5args(screen_name, api, amount, filename, lastTweetId)
def get_all_tweets4args(screen_name, api, amount, filename):
lastTweetId = "lastTweetScraped.txt"
get_all_tweets5args(screen_name, api, amount, filename, lastTweetId)
def writeTweets(filename, mode, tweets):
if filename.find("saved_tweets") == -1:
if os.path.isdir("./saved_tweets"):
f = open("./saved_tweets/" + filename, mode)
else:
os.makedirs("./saved_tweets")
f = open("./saved_tweets/" + filename, mode)
elif os.path.isdir("./saved_tweets"):
f = open(filename, mode)
else:
os.makedirs("./saved_tweets")
f = open(filename, mode)
for tweet in tweets:
if tweet != "\n" or tweet != "":
f.write(tweet)
f.write("\n")
f.close()
def get_all_tweets5args(screen_name, api, amount, filename, lastTweetScrapedFile):
#initialize a list to hold all the tweepy Tweets
alltweets = []
# make request for most recent tweets
# you can only pull 200 tweets from twitter at a time,
# hence count=200
new_tweets = api.user_timeline(screen_name = screen_name,count=200, include_rts = True)
numTweets = len(new_tweets)
MostRecentTweetPulled = getMostRecentTweet(lastTweetScrapedFile, screen_name)
print "Most Recent Tweet ID Pulled for", screen_name, "=", MostRecentTweetPulled
# record most recent tweet id.
# The id of the last tweet the user tweeted.
if len(new_tweets) > 0:
# most recent tweet will replace MostRecentTweetPulled
# after all new tweets are scraped
most_recent_tweet = new_tweets[0].id
else:
most_recent_tweet = 0
# loop over first request and keep requesting until amount
# is reached or the 3240 threshold is reached
while len(new_tweets) > 0:
for tweet in new_tweets:
# break if you see the id of a tweet you have scraped
if int(MostRecentTweetPulled) >= int(tweet.id):
print "Most Recent Tweet Pulled =", MostRecentTweetPulled
print "tweet.id = ", tweet.id
numTweets = amount
break
try:
tweet = tweet.text.decode('string_escape', 'ignore').encode('ascii', 'ignore')
except UnicodeEncodeError:
strippedTweet = ""
for character in tweet.text:
if ord(character) > 126 or ord(character) < 32:
continue
else:
strippedTweet+=character
tweet = strippedTweet
# strip URLS, Ampersands, retweets, and newlines
tweet = re.sub(r'(www|https?)[^\s]+', '', tweet)
tweet = re.sub(r'&', '&', tweet, flags=re.MULTILINE)
tweet = re.sub(r'^RT.*:+ ', '', tweet, flags=re.MULTILINE)
tweet = tweet.replace('\n', ' ')
tweet = tweet.strip()
alltweets.append(tweet)
if len(alltweets) >= amount:
numTweets = len(alltweets)
break
oldest = new_tweets[-1].id - 1
print "...%s tweets downloaded so far" % (len(alltweets))
new_tweets = api.user_timeline(screen_name = screen_name,count=200,max_id=oldest)
if numTweets >= amount:
break
numTweets += len(new_tweets)
writeTweets(filename, 'a', alltweets)
# update Last Tweet
updateLastTweet(lastTweetScrapedFile, screen_name, most_recent_tweet)