-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path habr.py
91 lines (71 loc) · 2.64 KB
/
habr.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
from dataclasses import dataclass, field
from functools import partialmethod
from logging import getLogger
from typing import ClassVar, Iterator, List
import requests
from bs4 import BeautifulSoup
from tenacity import TryAgain, retry, stop_after_attempt, wait_incrementing
# Module-level logger, named after this module (standard logging convention).
log = getLogger(__name__)
class TooManyIterations(Exception):
    """Raised when pagination keeps yielding data past the hard page limit."""
    pass
@dataclass
class Habr:
    """Minimal habr.com client: rate-limit-aware HTTP plus post scraping.

    Uses a pooled ``requests`` session; 429 responses are retried with
    incremental backoff via ``tenacity``.
    """

    session: requests.Session = field(default_factory=requests.Session)
    # default_factory so each instance gets its OWN adapter; the original
    # built one HTTPAdapter at class-definition time, silently sharing that
    # mutable object across every Habr instance.
    adapter: requests.adapters.HTTPAdapter = field(
        default_factory=lambda: requests.adapters.HTTPAdapter(pool_maxsize=32),
    )

    SITE_URL: ClassVar[str] = 'https://habr.com'
    API_URL: ClassVar[str] = 'https://habr.com/kek/v2'
    TIMEOUT: ClassVar[int] = 10  # seconds, default per-request timeout

    def __post_init__(self):
        # Route all https traffic through the pooled adapter.
        self.session.mount('https://', self.adapter)

    @retry(
        reraise=True,
        wait=wait_incrementing(start=1, increment=2),
        stop=stop_after_attempt(20),
    )
    def request(self, method: str, path: str, **kwargs) -> requests.Response:
        """Issue an HTTP request, retrying on HTTP 429 (up to 20 attempts).

        A leading-slash *path* is resolved against ``SITE_URL``.  (Bug fix:
        the original referenced a non-existent ``self.BASE_URL`` here, so any
        relative path raised AttributeError.)
        """
        if path.startswith('/'):
            path = self.SITE_URL + path
        kwargs.setdefault('timeout', self.TIMEOUT)
        response = self.session.request(method, path, **kwargs)
        if response.status_code == requests.codes.too_many_requests:
            raise TryAgain()  # picked up by @retry above
        return response

    get = partialmethod(request, 'get')
    post = partialmethod(request, 'post')

    def iter_posts(self, flow: str = 'develop') -> Iterator[dict]:
        """Yield post refs page by page until an empty page is returned."""
        max_page = 10000
        for page in range(1, max_page):
            log.debug(f'Scraping posts page {page}')
            posts = self.get_posts(flow=flow, page=page)
            if not posts:
                return
            yield from posts
        else:
            # Safety valve: never loop forever if the API keeps returning data.
            raise TooManyIterations()

    def get_posts(self, flow: str, page: int = 1) -> List[dict]:
        """Return one page of article refs; ``[]`` once past the last page (404)."""
        response = self.get(
            f'{self.API_URL}/articles/',
            params={
                'flow': flow,
                'sort': 'all',
                'page': page,
                'fl': 'ru',
                'hl': 'ru',
            },
        )
        if not response.ok:
            if response.status_code == requests.codes.not_found:
                return []
            response.raise_for_status()
        result = response.json()
        # list(...) so the declared List[dict] return type actually holds;
        # the original returned a dict_values view.
        return list(result['articleRefs'].values())

    def get_post(self, id_: int | str) -> requests.Response:
        """Fetch a post page and return the raw Response.

        Annotation fixed: the original claimed ``-> str`` but returned a
        Response (``get_post_content`` below relies on ``.text``).
        """
        return self.get(f'{self.SITE_URL}/ru/post/{id_}')

    def get_post_content(self, id_: int | str) -> BeautifulSoup:
        """Return the parsed ``post-content-body`` div of a post."""
        html = self.get_post(id_).text
        soup = BeautifulSoup(html, features='html.parser')
        content = soup.find('div', {'id': 'post-content-body'})
        # NOTE(review): assert is stripped under -O; consider raising instead.
        assert content
        return content