Merge pull request #5 from yutkin/dev
Dev
yutkin authored Dec 14, 2019
2 parents 8d3714e + d62bf8b commit 4ba9584
Showing 4 changed files with 48 additions and 35 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,3 +1,4 @@
 .idea
 env
 lenta-ru-news.csv
+venv/
14 changes: 7 additions & 7 deletions README.md
@@ -1,14 +1,14 @@
-## Corpus of news articles of Lenta.Ru
-* Size: 502 Mb (1.8 Gb uncompressed)
-* News articles: ~ 739K
-* Dates: 1999-08-30 - 2018-12-15
+# Corpus of news articles of Lenta.Ru
+* Size: 337 Mb (2 Gb uncompressed)
+* News articles: 800K+
+* Dates: 30/08/1999 - 14/12/2019
 
 + [Script](../master/download_lenta.py) for news downloading (Python **3.7**+ is required).
 
 
-## Download
+# Download
 * [GitHub](https://github.com/yutkin/Lenta.Ru-News-Dataset/releases/tag/v1.0)
 * [Kaggle](https://www.kaggle.com/yutkin/corpus-of-russian-news-articles-from-lenta/)
 
-## Decompression
-`gzip -d lenta-ru-news.csv.gz`
+# Decompression
+`bzip2 -d lenta-ru-news.csv.bz2`
66 changes: 39 additions & 27 deletions download_lenta.py
@@ -22,29 +22,29 @@ class LentaParser:
     # lxml is much faster but error prone
     default_parser = "html.parser"
 
-    def __init__(self, *, max_workers: int, outfile_name: str):
+    def __init__(self, *, max_workers: int, outfile_name: str, from_date: str):
         self._endpoint = "https://lenta.ru/news"
 
         self._sess = None
         self._connector = None
-        self._read_timeout = 10
-        self._conn_timeout = 10
 
         self._executor = ProcessPoolExecutor(max_workers=max_workers)
 
         self._outfile_name = outfile_name
         self._outfile = None
         self._csv_writer = None
+        self.timeouts = aiohttp.ClientTimeout(total=60, connect=60)
 
         self._n_downloaded = 0
+        self._from_date = datetime.strptime(from_date, "%d.%m.%Y")
 
     @property
     def dates_countdown(self):
-        date_start, date_end = datetime.today(), datetime(1999, 8, 30)
+        date_start, date_end = self._from_date, datetime.today()
 
-        while date_start > date_end:
+        while date_start <= date_end:
             yield date_start.strftime("%Y/%m/%d")
-            date_start -= timedelta(days=1)
+            date_start += timedelta(days=1)
 
     @property
     def writer(self):
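For context, the rewritten `dates_countdown` property now walks forward from the `from_date` passed to the constructor up to today, instead of counting backwards from today to 30.08.1999. A minimal standalone sketch of that iteration, reusing the script's names and date formats:

from datetime import datetime, timedelta

def dates_countdown(from_date: str):
    # Oldest requested day first, today last, each formatted as a lenta.ru archive path (YYYY/MM/DD).
    date_start, date_end = datetime.strptime(from_date, "%d.%m.%Y"), datetime.today()
    while date_start <= date_end:
        yield date_start.strftime("%Y/%m/%d")
        date_start += timedelta(days=1)

# Run on 14.12.2019, list(dates_countdown("12.12.2019")) would give
# ["2019/12/12", "2019/12/13", "2019/12/14"].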
@@ -62,13 +62,10 @@ def session(self):
         if self._sess is None or self._sess.closed:
 
             self._connector = aiohttp.TCPConnector(
-                use_dns_cache=True, ttl_dns_cache=60 * 60, limit=512
+                use_dns_cache=True, ttl_dns_cache=60 * 60, limit=1024
             )
 
             self._sess = aiohttp.ClientSession(
-                connector=self._connector,
-                read_timeout=self._read_timeout,
-                conn_timeout=self._conn_timeout,
+                connector=self._connector, timeout=self.timeouts
             )
 
         return self._sess
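The session change above appears to track aiohttp 3.x, which deprecated the separate `read_timeout`/`conn_timeout` session arguments in favour of a single `aiohttp.ClientTimeout` passed as `timeout=`. A short sketch of that pattern in isolation (values mirror the commit; the `make_session` helper is only for illustration):

import aiohttp

# One ClientTimeout object caps both the whole request (total) and the connect phase.
timeouts = aiohttp.ClientTimeout(total=60, connect=60)

async def make_session() -> aiohttp.ClientSession:
    connector = aiohttp.TCPConnector(use_dns_cache=True, ttl_dns_cache=60 * 60, limit=1024)
    return aiohttp.ClientSession(connector=connector, timeout=timeouts)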
@@ -103,18 +100,17 @@ def parse_article_html(html: str):
     def _extract_urls_from_html(html: str):
         doc_tree = BeautifulSoup(html, LentaParser.default_parser)
         news_list = doc_tree.find_all("div", "item news b-tabloid__topic_news")
-        return [f"https://lenta.ru{news.find('a')['href']}" for news in news_list]
+        return tuple(f"https://lenta.ru{news.find('a')['href']}" for news in news_list)
 
     async def _fetch_all_news_on_page(self, html: str):
-        loop = asyncio.get_running_loop()
-
         # Get news URLs from raw html
+        loop = asyncio.get_running_loop()
         news_urls = await loop.run_in_executor(
             self._executor, self._extract_urls_from_html, html
         )
 
         # Fetching news
-        tasks = [asyncio.create_task(self.fetch(url)) for url in news_urls]
+        tasks = tuple(asyncio.create_task(self.fetch(url)) for url in news_urls)
 
         fetched_raw_news = dict()
 
@@ -123,6 +119,8 @@ async def _fetch_all_news_on_page(self, html: str):
                 fetch_res = await task
             except aiohttp.ClientResponseError as exc:
                 logger.error(f"Cannot fetch {exc.request_info.url}: {exc}")
+            except asyncio.TimeoutError:
+                logger.exception("Cannot fetch. Timout")
             else:
                 fetched_raw_news[news_urls[i]] = fetch_res
 
@@ -136,8 +134,8 @@ async def _fetch_all_news_on_page(self, html: str):
         for url, task in fetched_raw_news.items():
             try:
                 parse_res = await task
-            except Exception as exc:
-                logger.error(f"Cannot parse {url}: {exc}")
+            except Exception:
+                logger.exception(f"Cannot parse {url}")
             else:
                 parse_res["url"] = url
                 parsed_news.append(parse_res)
@@ -148,11 +146,11 @@ async def _fetch_all_news_on_page(self, html: str):
 
         return len(parsed_news)
 
-    async def _shutdown(self):
+    async def shutdown(self):
         if self._sess is not None:
             await self._sess.close()
 
-        await asyncio.sleep(0.250)
+        await asyncio.sleep(0.5)
 
         if self._outfile is not None:
             self._outfile.close()
@@ -167,13 +165,15 @@ async def _producer(self):
 
             try:
                 html = await asyncio.create_task(self.fetch(news_page_url))
-            except aiohttp.ClientResponseError as exc:
-                logger.error(f"Cannot fetch {exc.request_info.url} [{exc.status}]")
+            except aiohttp.ClientResponseError:
+                logger.exception(f"Cannot fetch {news_page_url}")
+            except aiohttp.ClientConnectionError:
+                logger.exception(f"Cannot fetch {news_page_url}")
             else:
                 n_proccessed_news = await self._fetch_all_news_on_page(html)
 
                 if n_proccessed_news == 0:
-                    logger.info(f"News not found on {news_page_url}.")
+                    logger.info(f"News not found at {news_page_url}.")
 
                 logger.info(
                     f"{news_page_url} processed ({n_proccessed_news} news). "
@@ -184,7 +184,7 @@ async def run(self):
         try:
             await self._producer()
         finally:
-            await self._shutdown()
+            await self.shutdown()
 
 
 def main():
@@ -195,16 +195,28 @@ def main():
     )
 
     parser.add_argument(
-        "--cpu-workers", default=cpu_count(), type=int, help="number of cpu workers"
+        "--cpu-workers", default=cpu_count(), type=int, help="number of workers"
     )
 
+    parser.add_argument(
+        "--from-date",
+        default="30.08.1999",
+        type=str,
+        help="download news from this date. Example: 30.08.1999",
+    )
+
     args = parser.parse_args()
 
+    parser = LentaParser(
+        max_workers=args.cpu_workers,
+        outfile_name=args.outfile,
+        from_date=args.from_date,
+    )
+
     try:
-        asyncio.run(
-            LentaParser(max_workers=args.cpu_workers, outfile_name=args.outfile).run()
-        )
+        asyncio.run(parser.run())
     except KeyboardInterrupt:
+        asyncio.run(parser.shutdown())
         logger.info("KeyboardInterrupt, exiting...")


2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,2 +1,2 @@
-aiohttp==3.4.4
+aiohttp==3.6.2
 beautifulsoup4==4.6.3
