#!/usr/bin/env python
# this program rescrapes pages held in minerva that were created from the
# hermes import -- that is, pages that are mostly stripped down to just story
# text. The firstGoodId was determined through human query.
#
# Optionally a url pattern can be given on the command line:
#   ./rescrapeOld.py '%archiveofourown.org%'
# this limits the run to urls that LIKE-match the argument. It is used to run
# one instance of this script per domain in minerva for simple parallelism
# gains.
#
# This script scrapes slowly and delays since the _last_ time the domain was
# hit. If hermes hits a domain while this script is paused, this script will
# wait longer to avoid excessive requests.
import random
import sys
import time
from typing import Any, List, Sequence

import scrape

firstGoodId = 68830
batchSize = 100  # somewhere around an hour's worth...
globalPattern = "%"
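

# Fetch up to batchSize (id, url) rows for pages imported before firstGoodId
# that have no successful (status 200) rescrape at or above firstGoodId yet.
# Rows come back in random order so parallel per-domain runs don't collide.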
def getBatch(firstGoodId: int, batchSize: int, pattern: str) -> List[Sequence[Any]]:
    conn = scrape.openMinerva()
    curs = conn.cursor()
    curs.execute(
        """
        select w.id, w.url
        --, r.id, w.id, w.created, w.url, w.status, octet_length(w.response)
        from web w
        left join web r
            on (r.url = trim(trailing '/' from w.url))
            and r.status = 200 and r.id >= %s
        where w.id < %s and r.id is null and w.url like %s
        order by random() -- w.id asc
        limit %s
        """,
        (firstGoodId, firstGoodId, pattern, batchSize),
    )
    res = curs.fetchall()
    curs.close()
    scrape.closeMinerva()
    return list(res)
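

# A page is still "old" if getBatch would return it. Note the url itself
# serves as the LIKE pattern here, so any % or _ characters in a url match
# loosely rather than literally.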
def isOld(firstGoodId: int, url: str) -> bool:
    matching = getBatch(firstGoodId, 1, url)
    return len(matching) > 0
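

# Return the time of the most recent request against urls matching pattern,
# made either by this source or by hermes (whose rows carry a null source).
# w.created is compared against time.time() below, so it is taken to hold
# epoch seconds. Falls back to five minutes ago when nothing matches, and a
# most-recent 429 response earns an extra minute of backoff.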
def getLastScrapeTime(pattern: str, source: str) -> int:
    conn = scrape.openMinerva()
    curs = conn.cursor()
    curs.execute(
        """
        select w.status, w.created
        from web w
        where w.url like %s and w.created is not null
            and (w.source = %s or w.source is null)
        order by w.created desc
        limit 1
        """,
        (pattern, source),
    )
    res = curs.fetchone()
    curs.close()
    scrape.closeMinerva()
    if res is None:
        return int(time.time()) - 300
    if int(res[0]) == 429:
        # add extra delay for too many requests
        return int(res[1]) + 60
    return int(res[1])
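

# Reduce a url to a coarse domain key by keeping the last two dot-separated
# labels of the host, so www.example.com and example.com both map to
# "example.com". (Two-label public suffixes like .co.uk collapse to the
# suffix, which only makes the rate limiting stricter.)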
def getDomain(url: str) -> str:
    strip = ["http://", "https://"]
    for s in strip:
        if url.startswith(s):
            url = url[len(s) :]
    p = url.split("/")
    d = p[0].split(".")
    base = ".".join(d[-2:])
    return base
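

# A command-line pattern (if any) narrows the run to matching urls; see the
# header comment. Batches repeat until no old pages remain.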
if len(sys.argv) > 1:
    globalPattern = sys.argv[1]

scrape.importEnvironment()
print(f"source: {scrape.__scrapeSource}")
assert scrape.__scrapeSource is not None

while True:
    batch = getBatch(firstGoodId, batchSize, globalPattern)
    if len(batch) == 0:
        print("it seems we are done?")
        break
    for r in batch:
        wid = r[0]
        url = r[1]
        s = 15 + random.randint(0, 5)  # seconds to leave between hits on a domain
        patt = f"%{getDomain(url)}%"
        while True:
            if not isOld(firstGoodId, url):
                print("not old anymore")
                break  # has since been rescraped
            ls = getLastScrapeTime(patt, scrape.__scrapeSource)
            diff = (ls + s) - int(time.time())
            print((patt, int(time.time()), (ls + s), diff))
            if diff < 0:
                break
            else:
                time.sleep(diff + 1)
        if not isOld(firstGoodId, url):
            print("not old anymore")
            continue  # has since been rescraped
        time.sleep(3 * random.random())
        print(f"refetching {wid}: {url}")
        try:
            if not isOld(firstGoodId, url):
                print("not old anymore")
                continue  # has since been rescraped
            res = scrape.scrape(url)
            print(len(res["raw"]))
        except Exception:
            # any scrape failure gets a flat one-minute backoff before the
            # next url; Exception (not bare except) so Ctrl-C still works
            time.sleep(60)