# forked from gh0std4ncer/geeks-pdf
# links.py -- scrapes GeeksforGeeks topic/question links into JSON files.
import json
from collections import OrderedDict
from bs4 import BeautifulSoup
import requests
def print_titles(content):
    """Print the stripped text of every <strong> tag found in *content*.

    *content* is expected to be a BeautifulSoup tag supporting find_all().
    """
    strong_tags = content.find_all('strong')
    for tag in strong_tags:
        print(tag.text.strip())
def save_links(content, filename):
    """Collect topic links from *content* and dump them to *filename* as JSON.

    *content* is a BeautifulSoup tag (e.g. the page's entry-content div).
    Each <ul> after the first (the first is site navigation, not topic
    content on the scraped pages) becomes one OrderedDict mapping link
    text to href; non-empty topics are written as a JSON list.

    Anchors without an href are skipped -- the original code crashed with
    ``TypeError: argument of type 'NoneType' is not iterable`` on
    ``'geeksquiz' not in None``.  Links pointing at geeksquiz are skipped
    deliberately.
    """
    links = []
    # Skip the first <ul>: it is navigation, not a topic list.
    for ul in content.find_all('ul')[1:]:
        topic = OrderedDict()
        for link in ul.find_all('a'):
            href = link.get('href')
            # Guard against <a> tags with no href before substring test.
            if href and 'geeksquiz' not in href:
                topic[link.text.strip()] = href.strip()
        if topic:
            links.append(topic)
    with open(filename, "w") as out:
        json.dump(links, out, indent=4)
def grab_links(urls, filename=None, combined=False):
    """Scrape question links from one or more GeeksforGeeks listing pages.

    urls     -- a single URL string or an iterable of URL strings.
    filename -- if given, dump the collected links there as indented JSON;
                otherwise print the JSON to stdout.
    combined -- if True, merge every page into one flat mapping of
                question title -> href; otherwise group each page's links
                under a topic name derived from the URL's second-to-last
                path segment (e.g. ".../dynamic-programming/" ->
                "Dynamic-Programming").
    """
    # isinstance (not `type(...) is str`) so str subclasses are accepted.
    if isinstance(urls, str):
        urls = [urls]
    links = OrderedDict()
    for url in urls:
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps parsing consistent across environments.
        soup = BeautifulSoup(requests.get(url).text, "html.parser")
        content = soup.find('div', id="content")
        topic = OrderedDict()
        for ques in content.find_all("h2", class_="entry-title"):
            link = ques.find("a")
            topic[link.text.strip()] = link['href'].strip()
        if combined:
            links.update(topic)
        else:
            topic_name = url.split('/')[-2].title()
            links[topic_name] = topic
    if not filename:
        print(json.dumps(links, indent=4))
    else:
        with open(filename, "w") as out:
            json.dump(links, out, indent=4)
def unique_links(filename):
    """Remove duplicate links from the JSON mapping stored in *filename*.

    Loads the file as an ordered title -> link mapping, keeps only the
    first title seen for each distinct link value, and rewrites the same
    file with the deduplicated mapping, indented.
    """
    with open(filename) as inp:
        data = json.load(inp, object_pairs_hook=OrderedDict)
    deduped = OrderedDict()
    for title, link in data.items():
        # Membership over values() (not a set): values may be nested
        # mappings, which are unhashable.
        if link in deduped.values():
            continue  # already kept under an earlier title
        deduped[title] = link
    with open(filename, "w") as out:
        json.dump(deduped, out, indent=4)