-
Notifications
You must be signed in to change notification settings - Fork 0
/
del_repeat.py
executable file
·70 lines (56 loc) · 1.89 KB
/
del_repeat.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python3
"""
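Roughly 600 companies are listed under multiple tickers, and the same news may be published
under each of them, which can be confusing.
To deal with that, if a piece of news is published under two tickers, we only consider the primary one.
The logic: when a company has several tickers, every ticker except the first one in sorted order
goes into the filter list, and 'topStory' news for a ticker in that list is relabeled 'repeated'.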
"""
import os
import sys
import json
def generate_list(path='./input/tickerList.csv'):
    """Return the set of secondary tickers for companies with several tickers.

    Each line of the CSV at *path* is split on commas; the first field is
    read as the ticker and the second as the company name. Tickers are
    grouped by company name. For every name that has more than one ticker,
    the tickers are sorted and all but the first are added to the result.

    Args:
        path: Location of the ticker list CSV. Defaults to
            './input/tickerList.csv'.

    Returns:
        A set of ticker strings. It is empty when no company name appears
        with more than one ticker.
    """
    tickers_by_name = {}
    # The context manager closes the file even if a row fails to parse.
    with open(path) as ticker_file:
        for line in ticker_file:
            fields = line.strip().split(',')
            if len(fields) < 2:
                # Blank or truncated row (e.g. a trailing empty line):
                # there is no company name to group by, so skip it.
                continue
            tickers_by_name.setdefault(fields[1], []).append(fields[0])

    filterlist = set()
    for tickers in tickers_by_name.values():
        # Sort once per company; the slice is empty for a single ticker,
        # so companies with only one ticker contribute nothing.
        filterlist.update(sorted(tickers)[1:])
    return filterlist
"""
Change the news type from topStory to repeated for tickers in the filter list
"""
def modify_news(date, filterlist, news_dir='input/news/2018'):
    """Relabel 'topStory' rows of one news file as 'repeated' for filtered tickers.

    Reads '<news_dir>/news_<date>.csv' line by line, splitting each stripped
    line on commas. Rows with 6 fields (ticker, company, timestamp, title,
    body, news_type) or 7 fields (the same plus a trailing suggestion) are
    written to a '.csv_bak' file next to the original; when the news type is
    'topStory' and the ticker is in *filterlist*, the news type is written as
    'repeated'. The backup file then replaces the original.

    NOTE(review): rows whose comma split yields any other field count are
    not written to the output, so they disappear from the file. This matches
    the previous behaviour; confirm it is intended for rows whose title or
    body contains commas.

    Args:
        date: Date string used to build the file name 'news_<date>.csv'.
        filterlist: Container of ticker strings whose top stories should be
            relabeled.
        news_dir: Directory holding the news files. Defaults to
            'input/news/2018'.
    """
    news_path = os.path.join(news_dir, 'news_' + date + '.csv')
    backup_path = news_path + '_bak'

    # Both handles are closed on every exit path, including errors.
    with open(news_path) as src, open(backup_path, 'w') as dst:
        for line in src:
            fields = line.strip().split(',')
            if len(fields) not in (6, 7):
                continue
            # Index 0 is the ticker and index 5 the news type in both layouts.
            if fields[5] == 'topStory' and fields[0] in filterlist:
                print(fields[5])
                fields[5] = 'repeated'
            dst.write(','.join(fields) + '\n')

    # Rename in-process instead of shelling out to `mv`: no shell parsing of
    # the date argument, and it works on platforms without `mv`.
    os.replace(backup_path, news_path)
def main():
    """Run the relabeling for the date given as the first command-line argument.

    Builds the filter list with generate_list() and passes it, together with
    the date, to modify_news(). Exits with a usage message when no date
    argument was supplied.
    """
    if len(sys.argv) < 2:
        # Fail with a readable message instead of an IndexError traceback.
        sys.exit('usage: del_repeat.py DATE')
    date = sys.argv[1]
    filterlist = generate_list()
    modify_news(date, filterlist)


if __name__ == '__main__':
    main()