Extended Medium scraper script to scrape other content #196

Open · wants to merge 1 commit into base: master
83 changes: 57 additions & 26 deletions Web-Scraping/Medium-article-downloader/Article_downloader.py
@@ -1,40 +1,71 @@
 #!/usr/bin/env python3
 #Imports and dependencies

-import requests
 from bs4 import BeautifulSoup
+import requests
+import re
+import os
+import shutil

-def download_article():
-
-    #The URL of the article is entered here
-    page_url = input("Enter the URL of the Medium Article ")
+ARTICLE_DIR = 'article'

-    #On looking for "my user agent", can be used to retrieve the value
-    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:79.0) Gecko/20100101 Firefox/79.0'}
+def get_article_content(url: str) -> tuple:
+    """
+    Scrapes and saves all content in the article using
+    Beautiful Soup and a handy regex.
+    Returns the bs4 object and the filename.
+    """
+    response = requests.get(url, timeout=10)

-    response = requests.get(page_url)
+    soup = BeautifulSoup(response.content, "html.parser")
+    filename = soup.find('h1').text.replace(' ', '_')

-    soup = BeautifulSoup(response.text, "html.parser")
+    relevant = soup.find_all(re.compile(r'p|li|h[0-4]+|span'), id=re.compile(r'[a-z0-9]{4}'))

-    filename = soup.find('h1').text.replace(' ', '_')
+    if not os.path.exists(ARTICLE_DIR):
+        os.mkdir(ARTICLE_DIR)

-    #The content is written into a text file
-    file = open(filename, "w")
+    content = ''
+    for i in relevant:
+        content += i.text + '\n'

+    with open(f'{ARTICLE_DIR}/{filename}.txt', 'w') as f:
+        f.write(content.strip())
+    return soup, filename

-    #The content of the article is stored in the <article> tag
+def save_images(soup: BeautifulSoup) -> None:
+    """
+    Saves all images (highest quality) from the article body.
+    """
+    i = 1
+    image_dir = f'{ARTICLE_DIR}/images'
+    if not os.path.exists(image_dir):
+        os.mkdir(image_dir)

-    for line in soup.find('article').find('div'):
-
-        #All the content is essentially stored between <p> tags
-
-        for content in line.find_all('p'):
+    for img in soup.find_all('source'):
+        try:
+            link = img['srcset'].split(',')[-1].split(' ')[1]
+            img_data = requests.get(link, timeout=10).content
+            with open(f'{image_dir}/image_{i}.png', 'wb') as f:
+                f.write(img_data)
+            i += 1
+        except KeyError:
+            continue

-            #contents are written into a file
-
-            file.write(content.text + '\n')
+def compress_and_cleanup_files(directory: str, filename: str) -> None:
+    """
+    Zips the article content and deletes the staging directory.
+    """
+    shutil.make_archive(filename, 'zip', directory)
+    shutil.rmtree(directory)

-    file.close()
+def main(url: str) -> None:
+    """
+    Runs all the functions in order.
+    """
+    soup, filename = get_article_content(url)
+    save_images(soup)
+    compress_and_cleanup_files(ARTICLE_DIR, filename)

 if __name__ == "__main__":
-    download_article()
+    article_url = input("Enter the URL of the Medium Article: ")
+    # article_url = "https://medium.com/pytorch/accelerate-pytorch-with-ipex-and-onednn-using-intel-bf16-technology-dca5b8e6b58f"
+    main(article_url)
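
Review note: per the bs4 docs, a compiled regex passed as a tag-name filter is applied with re.search(), so re.compile(r'p|li|h[0-4]+|span') matches any tag whose name merely contains one of those patterns; the id filter is doing most of the selection work here. A minimal sketch (not part of the PR) illustrating this:

import re

# The same name filter used in get_article_content(); a substring match
# anywhere in the tag name is enough under re.search().
pattern = re.compile(r'p|li|h[0-4]+|span')

for name in ['p', 'li', 'h3', 'span', 'script', 'figcaption']:
    print(name, bool(pattern.search(name)))
# All six print True: 'script' and 'figcaption' both contain 'p',
# which is why the id=re.compile(r'[a-z0-9]{4}') filter matters.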
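The link extraction in save_images() relies on Medium listing srcset candidates in ascending width, so the last comma-separated entry is the highest-resolution one. A small sketch with a hypothetical srcset value (the URLs below are made up):

# Hypothetical srcset, in the ascending-width order Medium emits.
srcset = ("https://miro.medium.com/max/552/1*abc.png 276w, "
          "https://miro.medium.com/max/1100/1*abc.png 550w, "
          "https://miro.medium.com/max/1400/1*abc.png 700w")

# split(',')[-1] -> ' https://miro.medium.com/max/1400/1*abc.png 700w'
# split(' ')[1]  -> the URL (index 0 is the empty string before the
#                   leading space, so this works for every entry but the first)
link = srcset.split(',')[-1].split(' ')[1]
print(link)  # https://miro.medium.com/max/1400/1*abc.png

Note that except KeyError only guards against a missing srcset attribute; a malformed srcset value would still raise an uncaught IndexError.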
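Finally, compress_and_cleanup_files() leaves <article title>.zip in the current working directory (shutil.make_archive appends the .zip suffix) and removes the 'article' staging folder. A hedged sketch of inspecting the result, assuming a hypothetical article whose <h1> was "My First Post":

import zipfile

# Assumes main() already ran for a (hypothetical) article titled
# 'My First Post', i.e. filename == 'My_First_Post'.
with zipfile.ZipFile('My_First_Post.zip') as zf:
    print(zf.namelist())
# Expected layout:
#   ['My_First_Post.txt', 'images/image_1.png', 'images/image_2.png', ...]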