
Commit

Initial commit
yutkin committed Jul 16, 2018
0 parents commit 50dbed9
Showing 2 changed files with 171 additions and 0 deletions.
24 changes: 24 additions & 0 deletions README.md
@@ -0,0 +1,24 @@
## Corpus of news articles from Lenta.Ru

* Size: 1.7 GB
* News articles: 699,746
* Dates: September 1999 - July 2018

* [Script](../master/download_lenta.py) for downloading the news.


## Download

* [Amazon S3](https://s3-us-west-2.amazonaws.com/lenta-news-dataset/news_lenta.csv)
* [Academic Torrents](http://academictorrents.com/details/cfc4ba252fe56176d9db31b0609f0ece6a389b09)
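
A minimal sketch of loading the corpus, assuming pandas is installed and `news_lenta.csv` has been downloaded from one of the links above; the column names match the `fieldnames` list in `download_lenta.py`:

```python
# Minimal sketch: load the corpus and inspect it.
# Assumes pandas is installed and news_lenta.csv is in the working directory.
import pandas as pd

df = pd.read_csv("news_lenta.csv")
print(df.shape)  # expected: about 699,746 rows and 5 columns
print(df[["url", "title", "topic"]].head())
```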

## Screenshot

![Dataset screenshot](https://s3-us-west-2.amazonaws.com/lenta-news-dataset/screenshot.png "Dataset screenshot")
147 changes: 147 additions & 0 deletions download_lenta.py
@@ -0,0 +1,147 @@
import asyncio
import csv
import logging
import time
from concurrent.futures import ProcessPoolExecutor
from datetime import datetime, timedelta
from multiprocessing import cpu_count

import aiohttp
import uvloop
from bs4 import BeautifulSoup

# uvloop is a drop-in, faster implementation of the asyncio event loop.
asyncio.set_event_loop_policy(uvloop.EventLoopPolicy())

logger = logging.getLogger(name="LentaParser")
logging.basicConfig(
level=logging.DEBUG,
format="%(asctime)s - %(name)s - %(levelname)s @ %(message)s",
datefmt="%d-%m-%Y %H:%M:%S",
)


class LentaParser:

def __init__(self, max_workers=cpu_count(), outfile_name=None):
self.queue = asyncio.Queue()
self._endpoint = "https://lenta.ru/news"
self._sess = None
self.loop = asyncio.get_event_loop()
self.max_workers = max_workers
self.outfile_name = outfile_name
if not outfile_name:
unixtime = int(time.time())
self.outfile_name = f"news_lenta_{unixtime}.csv"

        # newline="" prevents the csv module from writing extra blank lines on Windows.
        self.outfile = open(self.outfile_name, "a", newline="")
        self.csv_writer = csv.DictWriter(
            self.outfile, fieldnames=["url", "title", "text", "topic", "tags"]
        )
self.csv_writer.writeheader()
self.n_downloaded = 0

@property
def dates_countdown(self):
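        """Yield dates from today backwards, formatted as YYYY/MM/DD."""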
date = datetime.today()
while True:
yield date.strftime("%Y/%m/%d")
try:
date -= timedelta(days=1)
except OverflowError:
return

@property
def session(self):
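        """Lazily create the HTTP session, re-creating it if it was closed."""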
if self._sess is None or self._sess.closed:
self._sess = aiohttp.ClientSession()
return self._sess

    async def fetch(self, url):
        # Using the response as a context manager releases the connection
        # back to the pool even when the status is not 200.
        async with self.session.get(url, allow_redirects=False) as response:
            logger.debug(f"{url} ({response.status})")
            if response.status == 200:
                return await response.text()
        return None

@staticmethod
def parse_article_html(html):
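        """Extract the title, body text, topic, and tags from an article page."""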
doc_tree = BeautifulSoup(html, "lxml")
tags = doc_tree.find("a", "item dark active")
tags = tags.get_text() if tags else None

body = doc_tree.find("div", attrs={"itemprop": "articleBody"})
if not body:
raise RuntimeError("Could not find div with itemprop=articleBody")

text = " ".join([p.get_text() for p in body.find_all("p")])

topic = doc_tree.find("a", "b-header-inner__block")
topic = topic.get_text() if topic else None

title = doc_tree.find("h1", attrs={"itemprop": "headline"})
title = title.get_text() if title else None

return {"title": title, "text": text, "topic": topic, "tags": tags}

    async def fetch_all_news_on_page(self, fetch_news_page_coro):
        html = await fetch_news_page_coro
if html:
doc_tree = BeautifulSoup(html, "lxml")
news_list = doc_tree.find_all("div", "item news b-tabloid__topic_news")
news_urls = [
f"https://lenta.ru{news.find('a')['href']}" for news in news_list
]

tasks = (asyncio.ensure_future(self.fetch(url)) for url in news_urls)

fetched_news_html = await asyncio.gather(*tasks)

with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
futures = []
for article_html in fetched_news_html:
futures.append(executor.submit(self.parse_article_html, article_html))

                # Iterate over the futures in submission order so each result
                # is paired with the URL it was parsed from; as_completed()
                # yields futures in completion order and would mismatch them.
                for article_url, future in zip(news_urls, futures):
                    try:
                        processed_article = future.result()
                        processed_article["url"] = article_url
                        self.csv_writer.writerow(processed_article)
                        self.n_downloaded += 1
                    except Exception:
                        logger.exception(f"Error while processing {article_url}")

        logger.info(f"{self.n_downloaded} news articles processed so far.")
        return html is not None

async def producer(self):
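        """Walk the daily news pages backwards in time until one fails to load."""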
for date in self.dates_countdown:
news_page_url = f"{self._endpoint}/{date}"
fut = asyncio.ensure_future(self.fetch(news_page_url))
ok = await self.fetch_all_news_on_page(fut)
if not ok:
break

def run(self):
try:
self.loop.run_until_complete(self.producer())
except KeyboardInterrupt:
logger.info("KeyboardInterrupt, exiting...")
finally:
if self._sess:
self.loop.run_until_complete(self._sess.close())
self.loop.stop()
            if self.n_downloaded:
                logger.info(
                    f"{self.n_downloaded} news articles processed in total. "
                    f"Saved to {self.outfile_name}"
                )

self.outfile.close()


def main():
LentaParser().run()


if __name__ == "__main__":
main()
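
# A sketch of how to run the script, assuming the dependencies imported
# above are installed from PyPI:
#   pip install aiohttp uvloop beautifulsoup4 lxml
#   python download_lenta.py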
