-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathreuters.py
80 lines (57 loc) · 2.04 KB
/
reuters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import urllib.request
from bs4 import BeautifulSoup
import re
import datetime
import openpyxl
import os
# Archive day to fetch first, encoded as YYYYMMDD; the crawl loop below parses
# it with "%Y%m%d" and moves it one day back per iteration.
date_string = "20180825"
# Article links gathered from the archive pages, in discovery order.
tasty_urls = []
def strip_formatting(string):
    """Return *string* normalized for storage as training text.

    The text is lower-cased, newlines become single spaces, apostrophes are
    removed, and each of the characters ``. ! ? , ' / ( )`` is surrounded by
    one space on either side so that it stands apart from adjacent words.
    """
    flattened = string.lower().replace("\n", " ").replace("'", "")
    return re.sub(r"([.!?,'/()])", r" \1 ", flattened)
# Walk the Reuters daily archive backwards, one day per iteration, starting at
# date_string (YYYYMMDD), until at least 2000 distinct article links have been
# appended to tasty_urls. A failed archive request is not caught here and
# aborts the script.
seen_urls = set(tasty_urls)  # O(1) duplicate check; tasty_urls keeps discovery order
while len(tasty_urls) < 2000:
    news_page = "https://www.reuters.com/resources/archive/us/" + date_string + ".html"
    # Parse inside the context manager so the HTTP response is closed as soon
    # as the page has been read.
    with urllib.request.urlopen(news_page) as page:
        soup = BeautifulSoup(page, "html.parser")
    # href=True skips anchors that have no href attribute, which would
    # otherwise raise KeyError on a['href'].
    for a in soup.find_all('a', href=True):
        href = a['href']
        if "article" in href and href not in seen_urls:
            seen_urls.add(href)
            tasty_urls.append(href)
    # Step back one calendar day and re-encode as YYYYMMDD for the next URL.
    previous_day = datetime.datetime.strptime(date_string, "%Y%m%d") - datetime.timedelta(days=1)
    date_string = previous_day.strftime("%Y%m%d")
    # Progress: number of links collected after this archive day.
    print(len(tasty_urls))
# Download every collected article, extract its body text, and write one row
# per article to the first worksheet of training_data.xlsx:
#   column 1: the source tag "reuters"
#   column 2: the article URL
#   column 3: str() of the list of cleaned paragraph strings
#   column 4: the constant 0  (NOTE(review): presumably a label - confirm)
excel_document = openpyxl.load_workbook('training_data.xlsx')
ws = excel_document.worksheets[0]
# NOTE(review): the first output row is hard-coded; presumably it is the first
# free row of the existing sheet - confirm before running on another workbook.
row = 2007
try:
    for tasty_link in tasty_urls:
        try:
            with urllib.request.urlopen(tasty_link) as page:
                html = page.read()
        except Exception as exc:
            # Best-effort scrape: an article that cannot be fetched is skipped.
            # Unlike a bare except, this lets KeyboardInterrupt and SystemExit
            # propagate so the run can be stopped.
            print("skipping", tasty_link, exc)
            continue
        soup = BeautifulSoup(html, "html.parser")
        name_box = soup.find("div", attrs={"class": "StandardArticleBody_body"})
        if name_box is None:
            # Page has no article body container; nothing to record.
            continue
        # Keep only <p> tags that carry no class attribute.
        text = [
            strip_formatting(p.text.strip())
            for p in name_box.find_all("p")
            if not p.has_attr("class")
        ]
        print(tasty_link)
        for column, value in enumerate(("reuters", tasty_link, str(text), 0), start=1):
            ws.cell(row=row, column=column).value = value
        row += 1
        print(row)
finally:
    # Save even when the loop is interrupted so rows written so far are kept.
    excel_document.save('training_data.xlsx')