Merge pull request #5 from yutkin/dev
Dev
yutkin authored Dec 14, 2019
2 parents 8d3714e + d62bf8b commit 4ba9584
Showing 4 changed files with 48 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
 .idea
 env
 lenta-ru-news.csv
+venv/
14 changes: 7 additions & 7 deletions README.md
@@ -1,14 +1,14 @@
-## Corpus of news articles of Lenta.Ru
-* Size: 502 Mb (1.8 Gb uncompressed)
-* News articles: ~ 739K
-* Dates: 1999-08-30 - 2018-12-15
+# Corpus of news articles of Lenta.Ru
+* Size: 337 Mb (2 Gb uncompressed)
+* News articles: 800K+
+* Dates: 30/08/1999 - 14/12/2019
 
 + [Script](../master/download_lenta.py) for news downloading (Python **3.7**+ is required).
 
 
-## Download
+# Download
 * [GitHub](https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/tag/v1.0)
 * [Kaggle](https://www.kaggle.com/yutkin/corpus-of-russian-news-articles-from-lenta/)
 
-## Decompression
-`gzip -d lenta-ru-news.csv.gz`
+# Decompression
+`bzip2 -d lenta-ru-news.csv.bz2`
66 changes: 39 additions & 27 deletions download_lenta.py
@@ -22,29 +22,29 @@ class LentaParser:
     # lxml is much faster but error prone
     default_parser = "html.parser"
 
-    def __init__(self, *, max_workers: int, outfile_name: str):
+    def __init__(self, *, max_workers: int, outfile_name: str, from_date: str):
         self._endpoint = "https://lenta.ru/news"
 
         self._sess = None
         self._connector = None
-        self._read_timeout = 10
-        self._conn_timeout = 10
 
         self._executor = ProcessPoolExecutor(max_workers=max_workers)
 
         self._outfile_name = outfile_name
         self._outfile = None
         self._csv_writer = None
+        self.timeouts = aiohttp.ClientTimeout(total=60, connect=60)
 
         self._n_downloaded = 0
+        self._from_date = datetime.strptime(from_date, "%d.%m.%Y")
 
     @property
     def dates_countdown(self):
-        date_start, date_end = datetime.today(), datetime(1999, 8, 30)
+        date_start, date_end = self._from_date, datetime.today()
 
-        while date_start > date_end:
+        while date_start <= date_end:
             yield date_start.strftime("%Y/%m/%d")
-            date_start -= timedelta(days=1)
+            date_start += timedelta(days=1)
 
     @property
     def writer(self):
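For context, the rewritten `dates_countdown` property now walks forward from the `from_date` passed to the constructor up to today, instead of counting backwards from today to 30.08.1999. A minimal standalone sketch of that iteration, reusing the script's names and date formats:

from datetime import datetime, timedelta

def dates_countdown(from_date: str):
    # Oldest requested day first, today last, each formatted as a lenta.ru archive path (YYYY/MM/DD).
    date_start, date_end = datetime.strptime(from_date, "%d.%m.%Y"), datetime.today()
    while date_start <= date_end:
        yield date_start.strftime("%Y/%m/%d")
        date_start += timedelta(days=1)

# Run on 14.12.2019, list(dates_countdown("12.12.2019")) would give
# ["2019/12/12", "2019/12/13", "2019/12/14"].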
@@ -62,13 +62,10 @@ def session(self):
         if self._sess is None or self._sess.closed:
 
             self._connector = aiohttp.TCPConnector(
-                use_dns_cache=True, ttl_dns_cache=60 * 60, limit=512
+                use_dns_cache=True, ttl_dns_cache=60 * 60, limit=1024
             )
 
             self._sess = aiohttp.ClientSession(
-                connector=self._connector,
-                read_timeout=self._read_timeout,
-                conn_timeout=self._conn_timeout,
+                connector=self._connector, timeout=self.timeouts
             )
 
         return self._sess
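The session change above appears to track aiohttp 3.x, which deprecated the separate `read_timeout`/`conn_timeout` session arguments in favour of a single `aiohttp.ClientTimeout` passed as `timeout=`. A short sketch of that pattern in isolation (values mirror the commit; the `make_session` helper is only for illustration):

import aiohttp

# One ClientTimeout object caps both the whole request (total) and the connect phase.
timeouts = aiohttp.ClientTimeout(total=60, connect=60)

async def make_session() -> aiohttp.ClientSession:
    connector = aiohttp.TCPConnector(use_dns_cache=True, ttl_dns_cache=60 * 60, limit=1024)
    return aiohttp.ClientSession(connector=connector, timeout=timeouts)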
@@ -103,18 +100,17 @@ def parse_article_html(html: str):
     def _extract_urls_from_html(html: str):
         doc_tree = BeautifulSoup(html, LentaParser.default_parser)
         news_list = doc_tree.find_all("div", "item news b-tabloid__topic_news")
-        return [f"https://lenta.ru{news.find('a')['href']}" for news in news_list]
+        return tuple(f"https://lenta.ru{news.find('a')['href']}" for news in news_list)
 
     async def _fetch_all_news_on_page(self, html: str):
-        loop = asyncio.get_running_loop()
-
         # Get news URLs from raw html
+        loop = asyncio.get_running_loop()
         news_urls = await loop.run_in_executor(
             self._executor, self._extract_urls_from_html, html
         )
 
         # Fetching news
-        tasks = [asyncio.create_task(self.fetch(url)) for url in news_urls]
+        tasks = tuple(asyncio.create_task(self.fetch(url)) for url in news_urls)
 
         fetched_raw_news = dict()
 
@@ -123,6 +119,8 @@ async def _fetch_all_news_on_page(self, html: str):
                 fetch_res = await task
             except aiohttp.ClientResponseError as exc:
                 logger.error(f"Cannot fetch {exc.request_info.url}: {exc}")
+            except asyncio.TimeoutError:
+                logger.exception("Cannot fetch. Timout")
             else:
                 fetched_raw_news[news_urls[i]] = fetch_res
 
@@ -136,8 +134,8 @@ async def _fetch_all_news_on_page(self, html: str):
         for url, task in fetched_raw_news.items():
             try:
                 parse_res = await task
-            except Exception as exc:
-                logger.error(f"Cannot parse {url}: {exc}")
+            except Exception:
+                logger.exception(f"Cannot parse {url}")
             else:
                 parse_res["url"] = url
                 parsed_news.append(parse_res)
@@ -148,11 +146,11 @@ async def _fetch_all_news_on_page(self, html: str):
 
         return len(parsed_news)
 
-    async def _shutdown(self):
+    async def shutdown(self):
         if self._sess is not None:
             await self._sess.close()
 
-        await asyncio.sleep(0.250)
+        await asyncio.sleep(0.5)
 
         if self._outfile is not None:
             self._outfile.close()
@@ -167,13 +165,15 @@ async def _producer(self):
 
             try:
                 html = await asyncio.create_task(self.fetch(news_page_url))
-            except aiohttp.ClientResponseError as exc:
-                logger.error(f"Cannot fetch {exc.request_info.url} [{exc.status}]")
+            except aiohttp.ClientResponseError:
+                logger.exception(f"Cannot fetch {news_page_url}")
+            except aiohttp.ClientConnectionError:
+                logger.exception(f"Cannot fetch {news_page_url}")
             else:
                 n_proccessed_news = await self._fetch_all_news_on_page(html)
 
                 if n_proccessed_news == 0:
-                    logger.info(f"News not found on {news_page_url}.")
+                    logger.info(f"News not found at {news_page_url}.")
 
                 logger.info(
                     f"{news_page_url} processed ({n_proccessed_news} news). "
@@ -184,7 +184,7 @@ async def run(self):
         try:
             await self._producer()
         finally:
-            await self._shutdown()
+            await self.shutdown()
 
 
 def main():
@@ -195,16 +195,28 @@ def main():
     )
 
     parser.add_argument(
-        "--cpu-workers", default=cpu_count(), type=int, help="number of cpu workers"
+        "--cpu-workers", default=cpu_count(), type=int, help="number of workers"
     )
 
+    parser.add_argument(
+        "--from-date",
+        default="30.08.1999",
+        type=str,
+        help="download news from this date. Example: 30.08.1999",
+    )
+
     args = parser.parse_args()
 
+    parser = LentaParser(
+        max_workers=args.cpu_workers,
+        outfile_name=args.outfile,
+        from_date=args.from_date,
+    )
+
     try:
-        asyncio.run(
-            LentaParser(max_workers=args.cpu_workers, outfile_name=args.outfile).run()
-        )
+        asyncio.run(parser.run())
     except KeyboardInterrupt:
+        asyncio.run(parser.shutdown())
         logger.info("KeyboardInterrupt, exiting...")


2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
-aiohttp==3.4.4
+aiohttp==3.6.2
 beautifulsoup4==4.6.3
