-
Notifications
You must be signed in to change notification settings - Fork 3
/
index.py
135 lines (113 loc) · 3.59 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# https://mahrtayyab.github.io/tweety_docs/
from tweety import Twitter
from tweety.filters import SearchFilters
import datetime
import time
import requests
import sys
# Proxy handed to the Twitter client constructor in login().
# Example for a local proxy: PROXY_SERVER = 'http://127.0.0.1:7890'
# NOTE(review): None presumably means "no proxy" -- confirm against tweety.
PROXY_SERVER = None
# Set to True by create_file() once the output file has been created.
isCreatedFile = False
# Name of the output file; filled in by create_file() on its first call.
outputFileName = ''
def create_file():
    """Return the output file name, creating the file on the first call.

    The first call builds a name from the current local time
    (``YYYY-MM-DD_HH-MM-SS.txt``), creates an empty file with that name and
    records both facts in the module globals ``outputFileName`` and
    ``isCreatedFile``.  Later calls only return the remembered name.
    """
    global isCreatedFile
    global outputFileName
    if not isCreatedFile:
        stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        outputFileName = stamp + '.txt'
        # Opening in "w" mode and closing right away leaves an empty file.
        open(outputFileName, "w").close()
        isCreatedFile = True
        print('created file:' + outputFileName)
    return outputFileName
def resolve_short_url(short_url, timeout=10):
    """Follow the redirects of ``short_url`` and return the final URL.

    A HEAD request is issued so that no response body is downloaded.

    Args:
        short_url: The (possibly shortened) URL to resolve.
        timeout: Seconds to wait for the remote host before giving up.
            Without a timeout ``requests`` can block indefinitely on an
            unresponsive host, which would stall the whole crawl.

    Returns:
        The URL of the last response in the redirect chain.

    Raises:
        requests.exceptions.RequestException: On connection errors,
            timeouts or redirect loops.
    """
    response = requests.head(short_url, allow_redirects=True, timeout=timeout)
    return response.url
def login():
    """Create the Twitter client, publish it as the global ``app`` and sign in.

    The client is built with the session name ``"session"`` and the
    module-level ``PROXY_SERVER`` value.  The credentials below are
    placeholders that have to be edited before running the script.
    """
    global app
    print('logining...')
    client = Twitter("session", PROXY_SERVER)
    # Publish the client before signing in so other functions can reach it.
    app = client
    # input your twitter username and password
    client.sign_in('account', 'password')
def genDate(start_date_str, end_date_str):
    """Split ``[start_date_str, end_date_str)`` into consecutive one-day ranges.

    Both arguments are parsed with the ``%Y-%m-%d`` format.

    Returns:
        A list of ``{'start_date': ..., 'end_date': ...}`` dicts, one per
        day, each formatted as ``YYYY-MM-DD``.  The list is empty when the
        start date is not earlier than the end date.
    """
    fmt = '%Y-%m-%d'
    first_day = datetime.datetime.strptime(start_date_str, fmt)
    last_day = datetime.datetime.strptime(end_date_str, fmt)
    one_day = datetime.timedelta(days=1)
    # Both values are midnight timestamps, so the difference is whole days;
    # a non-positive difference makes range() empty.
    day_count = (last_day - first_day).days
    return [
        {
            'start_date': (first_day + one_day * offset).strftime(fmt),
            'end_date': (first_day + one_day * (offset + 1)).strftime(fmt),
        }
        for offset in range(day_count)
    ]
# Prefix shared by every GPT share link; the text after it is the GPT id.
_GPT_URL_PREFIX = 'https://chat.openai.com/g/'


def _extract_gpt_id(url):
    """Return the GPT id that follows the share-link prefix in ``url``, or None."""
    if not isinstance(url, str) or _GPT_URL_PREFIX not in url:
        return None
    # Take only what follows the first occurrence of the prefix, so a URL
    # that merely embeds a share link still yields a clean id.
    tail = url.split(_GPT_URL_PREFIX, 1)[1]
    gpt_id = tail.strip().split('?')[0]
    # An empty id (bare prefix) is treated as "no GPT link".
    return gpt_id or None


def get_short_url(urlItem):
    """Extract the GPT id referenced by a tweet URL entity.

    The entity's ``expanded_url`` is inspected first.  When it does not
    contain a usable GPT share link (including when it is missing or not a
    string), the entity's short ``url`` is resolved over the network and the
    final URL is inspected instead.

    Args:
        urlItem: Object exposing ``expanded_url`` and ``url`` attributes.

    Returns:
        The part of the share link after ``https://chat.openai.com/g/`` with
        any query string removed, or ``None`` when no GPT link is found or
        the short URL cannot be resolved.
    """
    gpt_id = _extract_gpt_id(getattr(urlItem, 'expanded_url', None))
    if gpt_id is not None:
        return gpt_id
    try:
        real_url = resolve_short_url(urlItem.url)
    except Exception as e:
        # Best effort: one unreachable link must not stop the crawl, but the
        # failure is reported instead of being swallowed silently.
        print('resolve failed:', e)
        return None
    return _extract_gpt_id(real_url)
def insert_list(tweets):
    """Append every GPT share link found in ``tweets`` to the output file.

    Each tweet's ``'urls'`` entry is scanned; every entity for which
    ``get_short_url`` returns an id is written as a full
    ``https://chat.openai.com/g/<id>`` line to the file named by
    ``create_file()`` and echoed to stdout.
    """
    prefix = 'https://chat.openai.com/g/'
    for tweet in tweets:
        for url_item in tweet['urls']:
            gpt_id = get_short_url(url_item)
            if gpt_id is None:
                continue
            full_url = prefix + gpt_id
            # The file is reopened for each link, so every hit is written
            # out and closed before moving on.
            with open(create_file(), "a") as out:
                out.seek(0, 2)
                out.write(full_url + '\n')
            print('wrote:' + full_url)
# Search has 50 requests per 15 minutes limit , slow down your requests
_SEARCH_DELAY_SECONDS = 23


def scroll_page(keyword, next_cursor):
    """Page through the search results for ``keyword`` and store found links.

    Starting at ``next_cursor``, each page of latest results is fetched with
    the global ``app`` client and handed to ``insert_list``; paging stops at
    the first empty page.  A failed request is reported and retried with the
    same cursor after a pause.

    This is a loop rather than self-recursion: one call frame per page or
    retry would eventually hit the interpreter's recursion limit on a long
    crawl or during a persistent outage.

    Args:
        keyword: Search query string.
        next_cursor: Cursor of the page to start from, or ``None`` to start
            at the first page.
    """
    cursor = next_cursor
    while True:
        print('start...................:', keyword, ' cursor:', cursor)
        try:
            tweets = app.search(
                keyword=keyword,
                wait_time=2,
                cursor=cursor,
                filter_=SearchFilters.Latest()
            )
        except Exception as e:
            print("error cursor:", cursor)
            print("error:", e)
            print("try again........")
            time.sleep(_SEARCH_DELAY_SECONDS)
            # Retry the same page.
            continue
        if len(tweets) == 0:
            return
        insert_list(tweets)
        # Stay under the search rate limit before requesting the next page.
        time.sleep(_SEARCH_DELAY_SECONDS)
        cursor = tweets.cursor
def startDateRange(start_date, end_date, initialCursor):
    """Crawl the search results one day at a time between two dates.

    The interval is split into one-day ranges by ``genDate`` and each range
    is searched with a ``since:``/``until:`` query.

    Args:
        start_date: First day to crawl, in ``%Y-%m-%d`` form.
        end_date: Day at which crawling stops (exclusive), same format.
        initialCursor: Cursor used to resume the first day's search, or
            ``None`` to start that day from its first page.  All later days
            start from their first page.
    """
    for index, date_range in enumerate(genDate(start_date, end_date)):
        # Pick the first range by position rather than by comparing strings:
        # genDate re-formats dates, so an input such as '2023-1-5' would never
        # equal the normalised '2023-01-05' and the cursor would be dropped.
        cursor = initialCursor if index == 0 else None
        keyword = "(chat.openai.com/g/) until:" + date_range['end_date'] + " since:" + date_range['start_date']
        scroll_page(keyword, cursor)
        print('all success.....:', keyword)
def startLatest(cursor):
    """Crawl the latest results of the plain ``chat.openai.com/g/`` search.

    Args:
        cursor: Cursor to resume from, or ``None`` to start at the first page.
    """
    scroll_page("chat.openai.com/g/", cursor)
    print('all success............')
def startCrawler():
    """Dispatch to the right crawl mode based on the command-line arguments.

    * no arguments: crawl the latest results from the first page;
    * one argument longer than 10 characters: treat it as a cursor and
      resume the latest-results crawl from it (a ``YYYY-MM-DD`` date is at
      most 10 characters, which is how the two cases are told apart);
    * otherwise: ``<start date> <end date> [cursor]`` crawls day by day.

    Raises:
        SystemExit: When date mode is selected but the end date is missing.
    """
    args = sys.argv[1:]
    if not args:
        startLatest(None)
        return
    if len(args[0]) > 10:
        startLatest(args[0])
        return
    if len(args) < 2:
        # Previously this fell through to sys.argv[2] and died with IndexError.
        sys.exit(
            'usage: ' + sys.argv[0] + ' [cursor] | '
            + sys.argv[0] + ' <start YYYY-MM-DD> <end YYYY-MM-DD> [cursor]'
        )
    cur = args[2] if len(args) >= 3 else None
    startDateRange(args[0], args[1], cur)
# Run the crawler only when executed as a script, so importing this module
# does not trigger a network login and crawl as a side effect.
if __name__ == "__main__":
    login()
    startCrawler()