
Commit

Merge pull request Clueless-Community#997 from Siddhesh-Agarwal/config-class

Config class
nikhil25803 authored May 17, 2024
2 parents 0cd734e + bc9212d commit 85df308
Showing 81 changed files with 855 additions and 568 deletions.
5 changes: 1 addition & 4 deletions .gitignore
@@ -2,11 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
<<<<<<< HEAD
local_test.py
=======
locaL_test.py
>>>>>>> 8f31f07307b5d503598fb062f51376a7b9eac315

# C extensions
*.so
*.ipynb
29 changes: 17 additions & 12 deletions CONTRIBUTING.md
@@ -76,31 +76,36 @@ Now you are done with the project setup, now you can make the changes you want o
- At first, we have to scrape the profile page of a user. For that, we have defined a function in the user class as

```python
- scrape-up/src/scrape_up/github/users.py
# scrape-up/src/scrape_up/github/users.py

from scrape_up.config.request_config import RequestConfig, get

class Users:

def __init__(self, username):
def __init__(self, username, *, config: RequestConfig = RequestConfig()):
self.username = username
self.config = config

def __scrape_page(self):
username = self.username
data = requests.get(f"https://github.com/{username}")
data = BeautifulSoup(data.text, "html.parser")
return data
data = get(f"https://github.com/{username}", self.config)
soup = BeautifulSoup(data.text, "html.parser")
return soup
```

- `__scrape_page` is a private function defined to scrape any page.
- Now we have to create a function with an appropriate name, in this case, `followers`.
- `scrape_up.config.request_config` contains our custom `get` function. It takes two parameters: `url`, the URL of the page you want to scrape, and `config`, an instance of the `RequestConfig` class, which holds request settings such as headers, timeout, and redirect (a usage sketch follows the example below).

```python
def followers(self):
page = self.__scrape_page()
try:
followers = page.find(class_ = "avatar avatar-user width-full border color-bg-default")
return followers["src"]
except:
message = f"{self.username} not found !"
return message
page = self.__scrape_page()
try:
followers = page.find(class_ = "avatar avatar-user width-full border color-bg-default")
return followers["src"]
except:
message = f"{self.username} not found !"
return message
```

- When you inspect that element of the page, you will find that the class named `avatar avatar-user width-full border color-bg-default` contains the avatar URL.
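- As a quick illustration, here is a minimal usage sketch of the config pattern described above. The import paths and the header value are assumptions based on the file layout shown in this diff; only the `RequestConfig()` constructor, `set_headers()`, and the keyword-only `config` parameter that appear in these changes are relied on.

```python
# Usage sketch only -- import paths assumed from scrape-up/src/scrape_up/.
from scrape_up.config.request_config import RequestConfig
from scrape_up.github.users import Users

# Build a config and give it custom headers (placeholder value).
config = RequestConfig()
config.set_headers({"User-Agent": "my-scraper/0.1"})

# Pass it through the keyword-only `config` parameter added in this change.
user = Users("nikhil25803", config=config)
print(user.followers())  # avatar URL on success, "<username> not found !" otherwise
```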
13 changes: 8 additions & 5 deletions src/scrape_up/academia/academia.py
@@ -1,5 +1,5 @@
from bs4 import BeautifulSoup
import requests
from scrape_up.config.request_config import RequestConfig, get


class Academia:
@@ -17,10 +17,13 @@ class Academia:
"""

def __init__(self):
self.headers = {
def __init__(self, *, config: RequestConfig = RequestConfig()):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def get_research_topics(self):
"""
@@ -46,7 +49,7 @@ def get_research_topics(self):
"""
try:
url = f"https://www.academia.edu/topics/"
html_text = requests.get(url, headers=self.headers).text
html_text = get(url, self.config).text
soup = BeautifulSoup(html_text, "lxml")

topics = []
@@ -92,7 +95,7 @@ def get_research_papers(self, search):
search = search.title()
search = search.replace(" ", "_")
url = f"https://www.academia.edu/Documents/in/{search}"
html_text = requests.get(url, headers=self.headers).text
html_text = get(url, self.config).text
soup = BeautifulSoup(html_text, "lxml")

papers = []
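Note on the pattern above: the constructor only installs its built-in User-Agent when the supplied config has no headers of its own, so caller-provided headers always win. A hedged consumer-side sketch of both paths (module paths are assumptions based on `src/scrape_up/academia/academia.py`, not confirmed by this diff):

```python
# Sketch only -- module paths are assumptions, not confirmed by this diff.
from scrape_up.academia.academia import Academia
from scrape_up.config.request_config import RequestConfig

# Default path: RequestConfig() starts with empty headers, so the
# `if self.config.headers == {}` branch installs the built-in User-Agent.
topics = Academia().get_research_topics()

# Custom path: headers set by the caller are left untouched by that branch.
config = RequestConfig()
config.set_headers({"User-Agent": "my-scraper/0.1"})  # placeholder value
papers = Academia(config=config).get_research_papers("machine learning")
```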
18 changes: 11 additions & 7 deletions src/scrape_up/amazon/products.py
@@ -1,21 +1,25 @@
import requests
from bs4 import BeautifulSoup
import requests
from scrape_up.config.request_config import RequestConfig, get


class Product:
"""Class for fetching and retrieving product details from Amazon."""

def __init__(self, product_name: str):
def __init__(self, product_name: str, *, config: RequestConfig = RequestConfig()):
"""
Initialize the Product object with a product name.
Args:
product_name (str): The name of the product.
"""
self.product_name = product_name
self.headers = {
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def fetch_product_page(self):
"""
@@ -30,7 +34,7 @@ def fetch_product_page(self):
try:
product_name = self.product_name.replace(" ", "+")
url = f"https://www.amazon.in/s?k={product_name}"
r = requests.get(url, headers=self.headers)
r = get(url, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
return BeautifulSoup(r.content, "html.parser")
except requests.RequestException as e:
@@ -76,7 +80,7 @@ def get_product_details(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
product_name = soup.find("span", {"id": "productTitle"}).text.strip()
@@ -107,7 +111,7 @@ def get_product_image(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
product_image = soup.find("div", {"id": "imgTagWrapperId"}).find("img")[
@@ -132,7 +136,7 @@ def customer_review(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
review_elements = soup.find_all("div", {"data-hook": "review"})
17 changes: 10 additions & 7 deletions src/scrape_up/ambitionBox/company.py
@@ -1,26 +1,29 @@
import requests
from bs4 import BeautifulSoup

from scrape_up.config.request_config import RequestConfig, get


class Comapiens:
def __init__(self, num_pages: int = 1):
def __init__(self, num_pages: int = 1, *, config: RequestConfig = RequestConfig()):
self.num_pages = num_pages
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def write_sorted_list(self, file, company_list):
company_list.sort(key=lambda x: x[1], reverse=True)
for company_name, rating in company_list:
file.write(f"{company_name.strip()} {rating}\n")

def scrape_companies(self):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

for page in range(1, self.num_pages + 1):
print(f"Scraping webpage number: {page} of {self.num_pages}")

url = f"https://www.ambitionbox.com/list-of-companies?page={page}"
response = requests.get(url, headers=headers)
response = get(url, self.config)

if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
18 changes: 10 additions & 8 deletions src/scrape_up/askubuntu/questions.py
@@ -1,7 +1,8 @@
from bs4 import BeautifulSoup
import requests
import json

from scrape_up.config.request_config import RequestConfig, get


class Questions:
"""
@@ -21,8 +22,9 @@ class Questions:
| `.getHighScoredQuestions()` | Returns the most voted questions, views, votes, answer counts, and descriptions in JSON format |
"""

def __init__(self, topic):
def __init__(self, topic: str, *, config: RequestConfig = RequestConfig()):
self.topic = topic
self.config = config

def getNewQuestions(self):
"""
@@ -46,7 +48,7 @@ def getNewQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Newest"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -103,7 +105,7 @@ def getActiveQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Active"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -160,7 +162,7 @@ def getUnansweredQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Unanswered"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -217,7 +219,7 @@ def getBountiedQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Bountied"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -274,7 +276,7 @@ def getFrequentQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Frequent"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -331,7 +333,7 @@ def getHighScoredQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Votes"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
8 changes: 5 additions & 3 deletions src/scrape_up/banners/scraper88x31.py
@@ -1,6 +1,7 @@
import requests
import bs4

from scrape_up.config.request_config import RequestConfig, get


class Scraper88x31:
"""
@@ -13,14 +14,15 @@ class Scraper88x31:
| `get_all()` | Returns the list of all available 88x31 banners |
"""

def __init__(self):
def __init__(self, *, config: RequestConfig = RequestConfig()):
self.urls_to_scrape = [
"https://cyber.dabamos.de/88x31/index.html",
"https://cyber.dabamos.de/88x31/index2.html",
"https://cyber.dabamos.de/88x31/index3.html",
"https://cyber.dabamos.de/88x31/index4.html",
"https://cyber.dabamos.de/88x31/index5.html",
]
self.config = config

def get_all(self):
"""
@@ -40,7 +42,7 @@ def get_all(self):
img_alt = []
for url in self.urls_to_scrape:
try:
response = requests.get(url)
response = get(url, self.config)
response.raise_for_status()
source = response.content
soup = bs4.BeautifulSoup(source, "lxml")
20 changes: 12 additions & 8 deletions src/scrape_up/bbcnews/bbcnews.py
@@ -1,6 +1,7 @@
import requests
from bs4 import BeautifulSoup

from scrape_up.config.request_config import RequestConfig, get


class BBCNews:
"""
@@ -14,9 +15,15 @@ class BBCNews:
| `get_article()` | Returns an object with proper details about the articles |
"""

def __init__(self):
def __init__(self, *, config: RequestConfig = RequestConfig()):
self.base_url = "https://www.bbc.co.uk"
self.headlines_url = self.base_url + "/news"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def get_headlines(self):
"""
@@ -26,7 +33,7 @@ def get_headlines(self):
Example: [{'index': 1, 'headline': 'Headline 1'}, {'index': 2, 'headline': 'Headline 2'}, ...]
"""
try:
response = requests.get(self.headlines_url)
response = get(self.headlines_url, self.config)
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except:
return None
@@ -46,7 +53,7 @@ def get_headlines(self):

return news_list

def get_article(self, url):
def get_article(self, url: str):
"""
Create an instance of the class - `BBCNews`\n
```python
@@ -56,10 +63,7 @@ def get_article(self, url):
```
"""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
response = requests.get(url, headers=headers).text
response = get(url, self.config).text
soup = BeautifulSoup(response, "lxml")

main_heading = soup.find("h1", {"id": "main-heading"}).text.strip()