
Commit

Merge pull request Clueless-Community#997 from Siddhesh-Agarwal/config-class

Config class
nikhil25803 authored May 17, 2024
2 parents 0cd734e + bc9212d commit 85df308
Showing 81 changed files with 855 additions and 568 deletions.
5 changes: 1 addition & 4 deletions .gitignore
@@ -2,11 +2,8 @@
__pycache__/
*.py[cod]
*$py.class
<<<<<<< HEAD
local_test.py
=======
locaL_test.py
>>>>>>> 8f31f07307b5d503598fb062f51376a7b9eac315

# C extensions
*.so
*.ipynb
29 changes: 17 additions & 12 deletions CONTRIBUTING.md
@@ -76,31 +76,36 @@ Now you are done with the project setup, now you can make the changes you want o
- At first, we have to scrape the profile page of a user. For that, we have defined a function in the user class as

```python
- scrape-up/src/scrape_up/github/users.py
# scrape-up/src/scrape_up/github/users.py

from scrape_up.config.request_config import RequestConfig, get

class Users:

def __init__(self, username):
def __init__(self, username, *, config: RequestConfig = RequestConfig()):
self.username = username
self.config = config

def __scrape_page(self):
username = self.username
data = requests.get(f"https://github.com/{username}")
data = BeautifulSoup(data.text, "html.parser")
return data
data = get(f"https://github.com/{username}", self.config)
soup = BeautifulSoup(data.text, "html.parser")
return soup
```

- `__scrape_page` is a private function defined to scrape any page.
- Now we have to create a function with an appropriate name, in this case, `followers`.
- `scrape_up.config.request_config` contains our custom `get` function. It takes two parameters: `url`, the URL of the page you want to scrape, and `config`, an instance of the `RequestConfig` class, which holds request settings such as headers, timeout, and redirect (a usage sketch follows the example below).

```python
def followers(self):
page = self.__scrape_page()
try:
followers = page.find(class_ = "avatar avatar-user width-full border color-bg-default")
return followers["src"]
except:
message = f"{self.username} not found !"
return message
page = self.__scrape_page()
try:
followers = page.find(class_ = "avatar avatar-user width-full border color-bg-default")
return followers["src"]
except:
message = f"{self.username} not found !"
return message
```

- When you inspect that element of the page, you will find that the class named `avatar avatar-user width-full border color-bg-default` contains the avatar URL.
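- As a quick illustration, here is a minimal usage sketch of the config pattern described above. The import paths and the header value are assumptions based on the file layout shown in this diff; only the `RequestConfig()` constructor, `set_headers()`, and the keyword-only `config` parameter that appear in these changes are relied on.

```python
# Usage sketch only -- import paths assumed from scrape-up/src/scrape_up/.
from scrape_up.config.request_config import RequestConfig
from scrape_up.github.users import Users

# Build a config and give it custom headers (placeholder value).
config = RequestConfig()
config.set_headers({"User-Agent": "my-scraper/0.1"})

# Pass it through the keyword-only `config` parameter added in this change.
user = Users("nikhil25803", config=config)
print(user.followers())  # avatar URL on success, "<username> not found !" otherwise
```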
13 changes: 8 additions & 5 deletions src/scrape_up/academia/academia.py
@@ -1,5 +1,5 @@
from bs4 import BeautifulSoup
import requests
from scrape_up.config.request_config import RequestConfig, get


class Academia:
@@ -17,10 +17,13 @@ class Academia:
"""

def __init__(self):
self.headers = {
def __init__(self, *, config: RequestConfig = RequestConfig()):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def get_research_topics(self):
"""
@@ -46,7 +49,7 @@ def get_research_topics(self):
"""
try:
url = f"https://www.academia.edu/topics/"
html_text = requests.get(url, headers=self.headers).text
html_text = get(url, self.config).text
soup = BeautifulSoup(html_text, "lxml")

topics = []
@@ -92,7 +95,7 @@ def get_research_papers(self, search):
search = search.title()
search = search.replace(" ", "_")
url = f"https://www.academia.edu/Documents/in/{search}"
html_text = requests.get(url, headers=self.headers).text
html_text = get(url, self.config).text
soup = BeautifulSoup(html_text, "lxml")

papers = []
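Note on the pattern above: the constructor only installs its built-in User-Agent when the supplied config has no headers of its own, so caller-provided headers always win. A hedged consumer-side sketch of both paths (module paths are assumptions based on `src/scrape_up/academia/academia.py`, not confirmed by this diff):

```python
# Sketch only -- module paths are assumptions, not confirmed by this diff.
from scrape_up.academia.academia import Academia
from scrape_up.config.request_config import RequestConfig

# Default path: RequestConfig() starts with empty headers, so the
# `if self.config.headers == {}` branch installs the built-in User-Agent.
topics = Academia().get_research_topics()

# Custom path: headers set by the caller are left untouched by that branch.
config = RequestConfig()
config.set_headers({"User-Agent": "my-scraper/0.1"})  # placeholder value
papers = Academia(config=config).get_research_papers("machine learning")
```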
18 changes: 11 additions & 7 deletions src/scrape_up/amazon/products.py
@@ -1,21 +1,25 @@
import requests
from bs4 import BeautifulSoup
import requests
from scrape_up.config.request_config import RequestConfig, get


class Product:
"""Class for fetching and retrieving product details from Amazon."""

def __init__(self, product_name: str):
def __init__(self, product_name: str, *, config: RequestConfig = RequestConfig()):
"""
Initialize the Product object with a product name.
Args:
product_name (str): The name of the product.
"""
self.product_name = product_name
self.headers = {
headers = {
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def fetch_product_page(self):
"""
@@ -30,7 +34,7 @@ def fetch_product_page(self):
try:
product_name = self.product_name.replace(" ", "+")
url = f"https://www.amazon.in/s?k={product_name}"
r = requests.get(url, headers=self.headers)
r = get(url, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
return BeautifulSoup(r.content, "html.parser")
except requests.RequestException as e:
@@ -76,7 +80,7 @@ def get_product_details(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
product_name = soup.find("span", {"id": "productTitle"}).text.strip()
@@ -107,7 +111,7 @@ def get_product_image(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
product_image = soup.find("div", {"id": "imgTagWrapperId"}).find("img")[
@@ -132,7 +136,7 @@ def customer_review(self):
"""
try:
product_link = self.get_product()["data"]
r = requests.get(product_link, headers=self.headers)
r = get(product_link, self.config)
r.raise_for_status() # Raise HTTPError for bad responses
soup = BeautifulSoup(r.content, "html.parser")
review_elements = soup.find_all("div", {"data-hook": "review"})
17 changes: 10 additions & 7 deletions src/scrape_up/ambitionBox/company.py
@@ -1,26 +1,29 @@
import requests
from bs4 import BeautifulSoup

from scrape_up.config.request_config import RequestConfig, get


class Comapiens:
def __init__(self, num_pages: int = 1):
def __init__(self, num_pages: int = 1, *, config: RequestConfig = RequestConfig()):
self.num_pages = num_pages
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def write_sorted_list(self, file, company_list):
company_list.sort(key=lambda x: x[1], reverse=True)
for company_name, rating in company_list:
file.write(f"{company_name.strip()} {rating}\n")

def scrape_companies(self):
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}

for page in range(1, self.num_pages + 1):
print(f"Scraping webpage number: {page} of {self.num_pages}")

url = f"https://www.ambitionbox.com/list-of-companies?page={page}"
response = requests.get(url, headers=headers)
response = get(url, self.config)

if response.status_code == 200:
soup = BeautifulSoup(response.text, "lxml")
18 changes: 10 additions & 8 deletions src/scrape_up/askubuntu/questions.py
@@ -1,7 +1,8 @@
from bs4 import BeautifulSoup
import requests
import json

from scrape_up.config.request_config import RequestConfig, get


class Questions:
"""
@@ -21,8 +22,9 @@ class Questions:
| `.getHighScoredQuestions()` | Returns the most voted questions, views, votes, answer counts, and descriptions in JSON format |
"""

def __init__(self, topic):
def __init__(self, topic: str, *, config: RequestConfig = RequestConfig()):
self.topic = topic
self.config = config

def getNewQuestions(self):
"""
@@ -46,7 +48,7 @@ def getNewQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Newest"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -103,7 +105,7 @@ def getActiveQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Active"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -160,7 +162,7 @@ def getUnansweredQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Unanswered"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -217,7 +219,7 @@ def getBountiedQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Bountied"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -274,7 +276,7 @@ def getFrequentQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Frequent"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
@@ -331,7 +333,7 @@ def getHighScoredQuestions(self):
"""
url = "https://askubuntu.com/questions/tagged/" + self.topic + "?tab=Votes"
try:
res = requests.get(url)
res = get(url, self.config)
soup = BeautifulSoup(res.text, "html.parser")

questions_data = {"questions": []}
8 changes: 5 additions & 3 deletions src/scrape_up/banners/scraper88x31.py
@@ -1,6 +1,7 @@
import requests
import bs4

from scrape_up.config.request_config import RequestConfig, get


class Scraper88x31:
"""
@@ -13,14 +14,15 @@ class Scraper88x31:
| `get_all()` | Returns the list of all available 88x31 banners |
"""

def __init__(self):
def __init__(self, *, config: RequestConfig = RequestConfig()):
self.urls_to_scrape = [
"https://cyber.dabamos.de/88x31/index.html",
"https://cyber.dabamos.de/88x31/index2.html",
"https://cyber.dabamos.de/88x31/index3.html",
"https://cyber.dabamos.de/88x31/index4.html",
"https://cyber.dabamos.de/88x31/index5.html",
]
self.config = config

def get_all(self):
"""
@@ -40,7 +42,7 @@ def get_all(self):
img_alt = []
for url in self.urls_to_scrape:
try:
response = requests.get(url)
response = get(url, self.config)
response.raise_for_status()
source = response.content
soup = bs4.BeautifulSoup(source, "lxml")
20 changes: 12 additions & 8 deletions src/scrape_up/bbcnews/bbcnews.py
@@ -1,6 +1,7 @@
import requests
from bs4 import BeautifulSoup

from scrape_up.config.request_config import RequestConfig, get


class BBCNews:
"""
@@ -14,9 +15,15 @@ class BBCNews:
| `get_article()` | Returns an object with proper details about the articles |
"""

def __init__(self):
def __init__(self, *, config: RequestConfig = RequestConfig()):
self.base_url = "https://www.bbc.co.uk"
self.headlines_url = self.base_url + "/news"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
self.config = config
if self.config.headers == {}:
self.config.set_headers(headers)

def get_headlines(self):
"""
@@ -26,7 +33,7 @@ def get_headlines(self):
Example: [{'index': 1, 'headline': 'Headline 1'}, {'index': 2, 'headline': 'Headline 2'}, ...]
"""
try:
response = requests.get(self.headlines_url)
response = get(self.headlines_url, self.config)
response.raise_for_status() # Raise an exception for HTTP errors (4xx or 5xx)
except:
return None
@@ -46,7 +53,7 @@ def get_headlines(self):

return news_list

def get_article(self, url):
def get_article(self, url: str):
"""
Create an instance of the class - `BBCNews`\n
```python
@@ -56,10 +63,7 @@ def get_article(self, url):
```
"""
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 6.3; Win 64 ; x64) Apple WeKit /537.36(KHTML , like Gecko) Chrome/80.0.3987.162 Safari/537.36"
}
response = requests.get(url, headers=headers).text
response = get(url, self.config).text
soup = BeautifulSoup(response, "lxml")

main_heading = soup.find("h1", {"id": "main-heading"}).text.strip()