-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathscrapy_novatec.py
131 lines (108 loc) · 4.65 KB
/
scrapy_novatec.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
class ScrapyNovatec:
    """Scraper for the Novatec publisher website.

    Every ``get_*`` listing method downloads a page, parses it with
    BeautifulSoup and returns a list of plain dictionaries. Page elements
    that do not carry the expected tags/attributes are skipped instead of
    aborting the whole scrape.
    """

    # Errors raised when an expected tag, attribute or text fragment is
    # missing: attribute access on None, subscripting None, a missing tag
    # attribute, or a short list after splitting text.
    _PARSE_ERRORS = (AttributeError, IndexError, KeyError, TypeError)

    def __init__(self):
        """Set the site root and clear the last requested URL."""
        self.base_url = "https://novatec.com.br/"
        # Last URL handed to get_soap().
        self.url = ""

    @staticmethod
    def _field_value(line):
        """Return the stripped text after the first ':' of a 'label: value' line."""
        return line.split(":")[1].strip("\t\n\r ")

    def get_soap(self, url):
        """Download *url* and return it parsed as a BeautifulSoup document.

        The URL is remembered in ``self.url`` before the request is made.
        Network errors raised by ``urlopen`` propagate to the caller.
        """
        self.url = url
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(url) as response:
            return BeautifulSoup(response.read(), "html.parser")

    def get_full_image(self, source_image, href):
        """Build the enlarged-cover URL for a book.

        Args:
            source_image: path of the thumbnail; must contain a ``/`` and end
                in ``<id>.gif`` so the image id can be extracted.
            href: relative link of the book page.

        Returns:
            ``base_url + href + '/capa_ampliada' + <id> + '.jpg'``.

        Raises:
            AttributeError: if *source_image* does not match the expected
                ``.../<id>.gif`` shape.
        """
        id_image = re.search(r'.+/(.+)\.gif', source_image).group(1)
        return self.base_url + href + '/capa_ampliada' + id_image + '.jpg'

    def get_launch_books(self):
        """Return the books found in ``<tr valign="center">`` rows of the home page.

        Each entry has the keys ``name``, ``image`` and ``description``.
        """
        soap_html = self.get_soap(url=self.base_url)
        books = []
        for row in soap_html.find_all("tr", valign="center"):
            try:
                book = {
                    "name": row.h1.text,
                    "image": self.get_full_image(row.img['src'], row.a['href']),
                    "description": row.h2.text,
                }
            except self._PARSE_ERRORS:
                # Row without the expected h1/img/a/h2 markup: not a book.
                continue
            books.append(book)
        return books

    def get_next_launch(self):
        """Return the entries found in centered table rows of the home page.

        Every ``<td>`` inside a ``<tr valign="center" align="center">`` that
        holds an image yields a dict with the keys ``name`` (the image
        ``alt`` text) and ``image`` (absolute image URL).
        """
        soap_html = self.get_soap(self.base_url)
        books = []
        for block in soap_html.find_all("tr", valign="center", align="center"):
            for cell in block.find_all("td"):
                try:
                    book = {
                        "name": cell.img['alt'],
                        "image": self.base_url + cell.img['src'],
                    }
                except self._PARSE_ERRORS:
                    # Cell without an <img alt=... src=...>: skip it.
                    continue
                books.append(book)
        return books

    def get_category(self):
        """Return the categories linked from ``<td align="left">`` cells.

        Each entry has the keys ``id`` (digits following ``id=`` in the link,
        as a string) and ``title`` (the cell text).
        """
        soap_html = self.get_soap(self.base_url)
        category = []
        for cell in soap_html.find_all("td", align="left"):
            try:
                id_category = re.search(r'.+id=([0-9]+)', cell.a["href"]).group(1)
            except self._PARSE_ERRORS:
                # No link, no href, or the href carries no numeric id.
                continue
            category.append({"id": id_category, "title": cell.text})
        return category

    def get_by_category(self, id, page):
        """Return the books listed on one page of a category.

        Args:
            id: category id (string or integer).
            page: page number (string or integer).

        Returns:
            A list of dicts with the keys ``image``, ``name``, ``author``,
            ``year``, ``pages`` and ``price``; all values are strings.
        """
        # str() lets callers pass integers as well as strings.
        soap_html = self.get_soap(
            url=self.base_url + "lista.php?id=" + str(id) + "&pag=" + str(page))
        books = []
        # NOTE: an earlier <td>-based implementation was dropped from here
        # because, per its author's note, it duplicated results.
        for row in soap_html.find_all("tr"):
            try:
                details = row.find("font", face="Arial", size="2")
                # Text after the second <br>, split into lines; the values
                # read below sit on lines 0, 2 and 4 as 'label: value'
                # (presumably separated by blank lines - confirm on the site).
                info_lines = details.br.br.text.strip("\t\n\r ").split("\n")
                book = {
                    "image": self.get_full_image(
                        row.a.find("img", hspace="6")["src"], row.a['href']),
                    "name": details.a.text,
                    "author": details.br.a.text,
                    "year": self._field_value(info_lines[0]),
                    "pages": self._field_value(info_lines[2]),
                    "price": self._field_value(info_lines[4]),
                }
            except self._PARSE_ERRORS:
                # Row that is not a book entry (layout rows, headers, ...).
                continue
            books.append(book)
        return books