crawl.py

import requests
import urllib.request
import os
import shutil
import argparse
from bs4 import BeautifulSoup
from conf.constants import *
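
# TEXT_DIR and PROCESSED_DIR are expected to come from the wildcard import of
# conf.constants above; their concrete values are project configuration and are
# assumed, not defined, in this script.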


def relevant_content_links(body):
    relevant_links = []
    soup = BeautifulSoup(body, 'html.parser')
    # The navigation panel: <div class="nav-panel-menu is-active">
    navigation_block = soup.find("div", class_="nav-panel-menu")
    children = navigation_block.findChildren("a", recursive=True)
    for link in children:
        relevant_links.append(link.attrs['href'])
    return relevant_links
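
# Minimal sketch of what relevant_content_links() returns; the HTML below is
# illustrative, modelled on the nav-panel-menu block the function targets:
#
#   <div class="nav-panel-menu is-active">
#     <a href="activemq.html">ActiveMQ</a>
#     <a href="amqp.html">AMQP</a>
#   </div>
#
#   -> ["activemq.html", "amqp.html"]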


def crawl(url, collection):
    print("Parsing ... ", url)  # for debugging and to see the progress
    # Try extracting the text from the link; if that fails, proceed with the next item
    try:
        # Save the text from the url to a <url>.txt file
        # (the leading "https://" is stripped and slashes become underscores)
        with open(TEXT_DIR + collection + '/' + url[8:].replace("/", "_") + ".txt", "w", encoding="UTF-8") as f:
            # Get the text from the URL using BeautifulSoup
            soup = BeautifulSoup(requests.get(url).text, "html.parser")
            # Get the text but remove the tags
            el = soup.find("div", class_="content")
            text = None
            if el is not None:
                text = el.get_text()
                # If the crawler gets to a page that requires JavaScript, it cannot extract the content
                if "You need to enable JavaScript to run this app." in text:
                    print("Unable to parse page " + url + " due to JavaScript being required")
            else:
                print("Couldn't find target element on page: ", url)
            if text is None:
                text = "No contents"
            # Write the text to the file in the text directory
            f.write("Article source: " + url + "\n\n")
            f.write(text)
    except Exception as e:
        print("Unable to parse page " + url)
        print(e)
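
# Illustrative example of the filename mapping used in crawl() (the URL and the
# collection name "camel" are hypothetical, based on the FULL_URL example below):
#   crawl("https://rhaetor.github.io/rh-camel/components/next/kafka.html", "camel")
#   writes TEXT_DIR + "camel/rhaetor.github.io_rh-camel_components_next_kafka.html.txt"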


def remove_data_dir(domain):
    # Delete all files (and nested directories) left over from a previous run
    dir_path = TEXT_DIR + domain
    if os.path.exists(dir_path):
        for the_file in os.listdir(dir_path):
            file_path = os.path.join(dir_path, the_file)
            try:
                if os.path.isfile(file_path):
                    os.unlink(file_path)
                else:
                    # Remove nested directories and their contents
                    shutil.rmtree(file_path)
            except Exception as e:
                print(e)


def create_data_dir(domain):
    # Create a directory to store the text files
    if not os.path.exists(TEXT_DIR):
        os.mkdir(TEXT_DIR)
    if not os.path.exists(TEXT_DIR + domain + "/"):
        os.mkdir(TEXT_DIR + domain + "/")
    # Create a directory to store the csv files
    if not os.path.exists(PROCESSED_DIR):
        os.mkdir(PROCESSED_DIR)
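
# Directories created above (the collection name "camel" is illustrative):
#   TEXT_DIR + "camel/"  <- one .txt file per crawled page
#   PROCESSED_DIR        <- csv files produced by downstream processing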


##
# Main execution
##

# DOMAIN = "rhaetor.github.io_components"
# FULL_URL = "https://rhaetor.github.io/rh-camel/components/next/"

parser = argparse.ArgumentParser(description='Crawl documentation pages and store their text content')
parser.add_argument('-c', '--collection', help='The target collection name (local storage)', required=True)
parser.add_argument('-url', '--url', help='The full URL to scrape', required=True)
args = parser.parse_args()

# Clean up data from previous runs
remove_data_dir(args.collection)

# Create a new data dir
create_data_dir(args.collection)

# Grab the relevant links from the entry page
entry_point = urllib.request.urlopen(args.url).read()
content_links = relevant_content_links(entry_point)
print('Found {0} links to relevant content'.format(len(content_links)))

# Parse each link one by one
for path in content_links:
    crawl(
        url=args.url + "/" + path,
        collection=args.collection
    )
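
# Example invocation (hypothetical; the URL matches the commented-out FULL_URL above):
#   python crawl.py -c rhaetor.github.io_components -url "https://rhaetor.github.io/rh-camel/components/next/"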