-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathService.py
59 lines (52 loc) · 2.12 KB
/
Service.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import html2text
import re
import string
import requests
from bs4 import BeautifulSoup as soup
from urllib.parse import urlparse
class Service:
def get_html_text(self, url):
r = requests.get(url)
return r.text
def extract_text_from_html(self, html):
h = html2text.HTML2Text()
h.wrap_links = False
h.skip_internal_links = True
h.inline_links = True
h.ignore_anchors = True
h.ignore_images = True
h.ignore_emphasis = True
h.ignore_links = True
output_text = h.handle(html)
output = re.sub(r'\w*\d\w*', ' ', output_text)
urls_removed = re.sub(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))
+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''', '', output)
exclude = set(string.punctuation)
clear_text = ''.join(ch for ch in urls_removed if ch not in exclude)
clear_text = re.sub('\s+' ,' ' ,clear_text )
return clear_text.strip()
def get_images_urls(self, html):
soup_page = soup(html, features="html.parser")
img_tags = soup_page.find_all('img')
img_urls = []
for img in img_tags:
url = img.get('src', img.get('data-src'))
if url:
img_urls.append(url)
return img_urls
def download_images_content(self, site_url, img_urls):
netloc = urlparse(site_url).netloc
scheme = urlparse(site_url).scheme
site = "{}://{}".format(scheme, netloc)
images = []
for url in img_urls:
filename = url[url.rfind("/")+1:]
if any(x in filename for x in ['jpg', 'png', 'jpeg', 'tiff']):
filename = re.sub("[^(A-Za-z.\-_0-9)]", "", filename)
if 'http' not in url:
url = '{}{}'.format(site, url)
img_response = requests.get(url)
filename_image = (filename, img_response.content)
images.append(filename_image)
return images