-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathopml_parser_2.py
61 lines (45 loc) · 1.53 KB
/
opml_parser_2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/home/dd/anaconda3/bin/python
import re, requests, json, sys
from more_itertools import unique_everseen
from tqdm import tqdm
def main(ifilename):
file_name = ifilename
file_name_wo_end = file_name[:-4]
f = open(file_name)
file_content = f.read()
m3 = re.findall('xmlUrl=\"(.+?)\"', file_content)
m4 = re.findall('text=\"(.+?)\"', file_content)
m4.pop(0)
result = []
for i in range(len(m4)):
try:
print(m4[i], m3[i])
result.append((m4[i], m3[i]))
except:
print(f"failed to show an item {m4[i]}")
rss_dict = {}
for name, url in tqdm(result):
try:
r = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:58.0) Gecko/20100101 Firefox/58.0'}, timeout=15)
except:
print("failed: " + url)
rss_dict[name] = r.text
tqdm.write('Downloaded ' + url + '\n')
json_filename = file_name_wo_end + ".json"
with open(json_filename,'w') as outfile:
json.dump(rss_dict, outfile)
all_urls_3 = []
for content in rss_dict.values():
m = re.findall(r'\"(http\S+?\.(?:mp3|mp4))[\"\?]', content, re.IGNORECASE)
if m:
for item in m:
all_urls_3.append(item)
txt_filename = file_name_wo_end + ".txt"
f = open(txt_filename,'w')
unique_urls = list(unique_everseen(all_urls_3))
for url in unique_urls:
f.write(url)
f.write("\n")
f.close()
if __name__ == '__main__':
main(sys.argv[1])