#!/usr/bin/env python
# coding: utf-8
"""Download Liputan6 news articles listed in url.json into per-split JSON files.

Forked from fajri91/sum_liputan6.
"""
import json
import os
import threading

import requests
from bs4 import BeautifulSoup

def get_id(url):
    # The article ID is the second-to-last segment of the canonical URL path.
    return url.split('/')[-2]

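# Example (hypothetical URL): get_id('https://www.liputan6.com/news/read/123456/judul-berita')
# returns '123456', assuming the usual Liputan6 pattern .../read/<article id>/<slug>.
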
def get_summary(text):
    # The summary is embedded in an inline script; the matched line looks
    # roughly like (shape inferred from the parsing below):
    #   window.kmklabs.channel = {...}; window.kmklabs.article = {...};
    target = ''
    for line in text.split('\n'):
        if 'window.kmklabs.channel =' in line:
            target = line
            break
    # Take the JSON object assigned to window.kmklabs.article, up to its
    # terminating semicolon, and read the shortDescription field.
    temp = target.split('window.kmklabs.article = ')[1]
    temp = temp.split(';')[0]
    data = json.loads(temp)
    return data['shortDescription']

def extract_data(text):
    # Pass an explicit parser so bs4 does not emit a GuessedAtParserWarning.
    soup = BeautifulSoup(text, 'html.parser')
    title = soup.findAll('title')[0].getText().replace(' - News Liputan6.com', '')
    date = soup.findAll('time', {'class': 'read-page--header--author__datetime updated'})[0].getText()
    article = []
    contents = soup.findAll('div', {'class': 'article-content-body__item-content'})
    for content in contents:
        article.append(content.getText())
    summary = get_summary(text)
    return title, date, article, summary

def write_file(article_id, url, title, date, content, summary, target_path):
    # One JSON record per article; paragraphs are joined with newlines.
    json_dict = {
        'id': article_id,
        'url': url,
        'title': title,
        'date': date,
        'content': '\n'.join(content),
        'summary': summary,
    }
    with open(f"{target_path}/{article_id}.json", 'w') as json_file:
        json.dump(json_dict, json_file)

def proceed_one(url, path):
    response = requests.get(url)
    url = response.url  # use the final URL after redirects; it carries the article ID
    article_id = get_id(url)
    title, date, article, summary = extract_data(response.text)
    write_file(article_id, url, title, date, article, summary, path)

def proceed(urls, path):
    for url in urls:
        try:
            proceed_one(url, path)
        except Exception:
            print(f'Failed to process {url}. The article may have been deleted from Liputan6.')

def thread_func(urls, path, num_thread=1):
    os.makedirs(path, exist_ok=True)
    threads = []
    for i in range(num_thread):
        # Slice each shard by its own start/end index so that no URL is
        # dropped when len(urls) is not divisible by num_thread.
        start = i * len(urls) // num_thread
        end = (i + 1) * len(urls) // num_thread
        t = threading.Thread(target=proceed, args=(urls[start:end], path))
        threads.append(t)
        t.start()
    for t in threads:
        t.join()  # wait for this split to finish before starting the next

THREAD = 10

with open('url.json') as f:
    urls = json.load(f)

thread_func(urls['dev_urls'], 'data/raw/dev', THREAD)
thread_func(urls['test_urls'], 'data/raw/test', THREAD)
thread_func(urls['train_urls'], 'data/raw/train', THREAD)
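
# Expected url.json layout (inferred from the lookups above):
#   {"dev_urls": [...], "test_urls": [...], "train_urls": [...]}
# where each list holds Liputan6 article URLs. Each successfully downloaded
# article is written to data/raw/<split>/<article id>.json.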