get_dataset.py
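"""
Download the Data Commons fact-check dump, strip the markup wrapping each
record, then scrape the full article text for every claim URL and write the
combined records to a JSONL file (see DATASET_WITH_ARTICLES).
"""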
import json
import math
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor

import requests
from newspaper import Article
from tqdm import tqdm

from remove_non_english_articles import is_english

# Base, unformatted dataset
DATASET_FILENAME = "data/dataset.txt"
# Output dataset with article text
DATASET_WITH_ARTICLES = "data/dataset_with_articles.jsonl"
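# Each output line is one JSON record from the source dataset with an added
# "article" field holding the scraped body text, roughly:
#   {"url": "https://...", ..., "article": "full article text"}
# (all other fields are passed through from the source dump unchanged).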
def get_and_reformat_dataset():
    # Download the public dataset and strip the markup wrapping each record
    # so that every line of DATASET_FILENAME is bare JSON.
    if os.path.exists(DATASET_FILENAME):
        os.remove(DATASET_FILENAME)
    res = requests.get("https://datacommons.org/data/factcheck/fact_checks_20190605.txt.gz")
    dataset = res.text
    with open(DATASET_FILENAME, "a") as reformatted_f:
        for line in dataset.split("\n"):
            reformatted_f.write(remove_html_tags(line) + "\n")
def scrape_article(line, tries=0):
    """
    Scrape the main article text from a URL, retrying up to 5 times on failure.
    """
    article_url = None
    try:
        dict_line = json.loads(line)
        article_url = dict_line["url"]
        # The Weekly Standard, a common source in this dataset, has since been shut down
        # and absorbed by the Washington Examiner.
        if "weeklystandard" in article_url:
            return None
        article = Article(article_url)
        article.download()
        article.parse()
        if not is_english(article.text):
            print(f"Skipping non-English article: {article_url}")
            return None
        dict_line["article"] = article.text
        return dict_line
    except Exception as e:
        print(f"Error processing {article_url or line.strip()}: {e}")
        if tries >= 5:
            print(f"Too many retries for {article_url}, skipping")
            return None
        # Cool down in case of too many requests
        time.sleep(30)
        return scrape_article(line, tries + 1)
def remove_html_tags(text):
    return re.sub(r"\<(.*?)\>", "", text)
if __name__ == "__main__":
    get_and_reformat_dataset()
    with open(DATASET_FILENAME, "r") as f:
        # Skip blank lines so they are not submitted as scraping jobs.
        lines = [line for line in f if line.strip()]
    # Roughly two batches; max(1, ...) guards against a division by zero on tiny inputs.
    BATCH_SIZE = max(1, len(lines) // 2)
    num_batches = math.ceil(len(lines) / BATCH_SIZE)
    # Multithreading is necessary here: sequential HTTP calls + I/O operations
    # can take up to 10 hours on this dataset.
    with ThreadPoolExecutor(max_workers=10) as executor:
        print("Adding scraping jobs to pool")
        if os.path.exists(DATASET_WITH_ARTICLES):
            print("Existing dataset detected. Deleting...")
            os.remove(DATASET_WITH_ARTICLES)
        with open(DATASET_WITH_ARTICLES, "w") as f:
            for batch_num in range(num_batches):
                start = batch_num * BATCH_SIZE
                end = min(start + BATCH_SIZE, len(lines))
                batch_lines = lines[start:end]
                print(f"Processing batch {batch_num + 1} of {num_batches}")
                futures = [executor.submit(scrape_article, line) for line in tqdm(batch_lines)]
                # The next batch is only submitted after every future in this one resolves.
                for future in tqdm(futures):
                    dict_line = future.result()
                    if dict_line:
                        f.write(json.dumps(dict_line) + "\n")
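# Example usage (assumes the data/ directory already exists and that
# remove_non_english_articles.py is importable from the working directory):
#   python get_dataset.py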