# extract.py

import csv
import json
import os
import pandas as pd
from datetime import datetime
from multiprocessing import Process
import concurrent.futures
import itertools
from pathlib import Path
from dotenv import load_dotenv
from reddit_api.api import RedditScraper
from reddit_scraper.spiders.redspider import RedspiderSpider
from scrapy.crawler import CrawlerProcess
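

# Pipeline overview: for each region, a Scrapy spider (RedspiderSpider) first
# dumps links scraped from r/popular into a temporary CSV next to this script;
# each link is then fetched through the Reddit API wrapper (RedditScraper) in a
# thread pool, flattened by transform_data, and written to a dated parquet file
# under local_data/.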


def transform_data(json_data: list, cur_date: str) -> dict:
    # Flatten the nested Reddit API response for a single post into a flat
    # dict of the fields we keep.
    post = json_data[0]["data"]["children"][0]["data"]
    details_dict = {
        "title": post["title"],
        "subreddit": post["subreddit_name_prefixed"],
        "author": post["author"],
        "url": post["permalink"],
        "nsfw": post["over_18"],
        "score": post["score"],
        "self_text": post["selftext"],
        "upvote_ratio": post["upvote_ratio"],
        "awards": post["total_awards_received"],
        "time": post["created"],
        "date_popular": cur_date,
    }
    return details_dict
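

# For reference, transform_data assumes the listing Reddit returns for a single
# post permalink: a two-element array whose first Listing holds the post (kind
# "t3") and whose second holds the comment tree. A rough sketch, with
# illustrative values only:
#
#   [
#       {"kind": "Listing", "data": {"children": [
#           {"kind": "t3", "data": {"title": "...", "subreddit_name_prefixed": "r/...",
#                                   "author": "...", "permalink": "/r/.../comments/...",
#                                   "over_18": False, "score": 0, "selftext": "",
#                                   "upvote_ratio": 1.0, "total_awards_received": 0,
#                                   "created": 0.0}}]}},
#       {"kind": "Listing", "data": {"children": []}},  # comments
#   ]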


def retrieve_data(post_url: list, cur_date: str, app: RedditScraper) -> dict:
    # csv.reader yields each line as a list; the post link is its first column.
    post_url = post_url[0]
    post_details = transform_data(app.get_post_details(post_url), cur_date)
    return post_details


def save_data(transformed_data: list, filename: Path):
    df = pd.DataFrame(transformed_data)
    df.to_parquet(filename)
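

# Note (assumption about the environment): DataFrame.to_parquet requires a
# parquet engine such as pyarrow or fastparquet to be installed. The
# `transformed_data` passed in below is the lazy iterator returned by
# executor.map; recent pandas versions accept an iterator of dicts and
# materialise it when building the DataFrame.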


def execute_crawling(country: str, cur_date: str, filepath: Path, app: RedditScraper):
    settings = {
        "FEEDS": {
            f"{filepath.parent}/{country}.csv": {"format": "csv", "overwrite": True},
        },
        "CLOSESPIDER_PAGECOUNT": 4,
        "DOWNLOAD_DELAY": 5,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 16,
    }
    process = CrawlerProcess(settings)
    process.crawl(RedspiderSpider, country=country)
    process.start()

    # Create the output directory next to this script rather than in the
    # current working directory, so it matches the parquet path used below.
    local_data_dir = filepath.parent / "local_data"
    if not os.path.exists(local_data_dir):
        os.makedirs(local_data_dir)

    with open(filepath.parent / f"{country}.csv", "r", encoding="utf8") as f:
        links = csv.reader(f)
        next(links, None)  # skip the CSV header row
        iter_date = itertools.repeat(cur_date)
        iter_app = itertools.repeat(app)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            results = executor.map(retrieve_data, links, iter_date, iter_app)
            filename = local_data_dir / f"{country}_{cur_date}.parquet"
            save_data(results, filename)
    os.remove(filepath.parent / f"{country}.csv")
if __name__ == "__main__":
filepath = Path(__file__).resolve()
env_path = filepath.parent.parent / ".env"
env_file = load_dotenv(env_path)
credential = {
"CLIENT_ID": os.environ.get("CLIENT_ID"),
"SECRET_TOKEN": os.environ.get("SECRET_TOKEN"),
"REDDIT_USER": os.environ.get("REDDIT_USER"),
"REDDIT_PASS": os.environ.get("REDDIT_PASS"),
}
app = RedditScraper(**credential)
# scrape links from r/popular using scraper
countries = ("PH", "global", "MY", "SG", "TH")
cur_date = datetime.today().strftime("%Y-%m-%d")
for country in countries:
p = Process(target=execute_crawling, args=(country, cur_date, filepath, app))
p.start()
p.join()
# p = Process(target=execute_crawling, args=("PH", cur_date, filepath, app))
# p.start()
# p.join()