Commit

before fetching others
AxeemHaider committed Nov 17, 2024
1 parent 8ab5fbd commit 8123ef5
Showing 3 changed files with 881 additions and 79 deletions.
121 changes: 121 additions & 0 deletions extractor/fetch_others copy.py
@@ -0,0 +1,121 @@
import os
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

def extract_services():
    # Paths
    service_name = "others"
    service_dir = "./services"
    output_dir = "./others"
    images_dir = f"../static/images/{service_name}"

    # Ensure directories exist
    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(images_dir, exist_ok=True)

    # Load services data
    services = []
    for file_name in os.listdir(service_dir):
        if file_name.endswith(".json"):
            with open(os.path.join(service_dir, file_name), "r") as f:
                data = json.load(f)
                for entry in data:
                    services.append(entry)

    category_id = "others"

    service_data = {}

    external_link = "https://elest.io/fully-managed-services"

    # Fetch the external page with explicit encoding
    response = requests.get(external_link)
    response.encoding = 'utf-8'  # Set encoding if necessary
    if response.status_code != 200:
        print(f"Failed to fetch {external_link}")
        return

    soup = BeautifulSoup(response.text, "html.parser")

    # Attempt to locate the #cards container; check it exists before using it
    cards_container = soup.select_one("#cards") or soup.find("div", id="cards")

    if not cards_container:
        print(f"No #cards container found for {external_link}")
        return

    cards = cards_container.find_all("a", href=True, class_="card-container")
    print(len(cards), "cards found")

    for a_tag in cards:
        href = a_tag["href"].strip("/")
        service_id = href.split('/')[-1]  # Create ID for the service item

        id_exist = any(obj['id'] == service_id for obj in services)

        if id_exist:
            continue
        else:
            print("New service found", service_id)

        link = f"/{service_name}/{category_id}/{service_id}"
        ext_link = urljoin("https://elest.io/", href)
        title = a_tag.select_one(".template__header > .template__label").text.strip()
        description = a_tag.select_one(".template__description").text.strip()

        # Process logo
        logo_tag = a_tag.select_one(".template__image--img img")
        if not logo_tag or not logo_tag.get("src"):
            print(f"No logo image found for {ext_link}")
            continue

        logo_url = urljoin(external_link, logo_tag["src"])
        logo_ext = os.path.splitext(urlparse(logo_url).path)[-1]
        logo_path = f"{images_dir}/{category_id}/{service_id}"
        os.makedirs(logo_path, exist_ok=True)
        logo_filename = f"{logo_path}/logo{logo_ext}"

        # Download the logo image
        # logo_response = requests.get(logo_url, stream=True)
        # if logo_response.status_code == 200:
        #     with open(logo_filename, "wb") as img_file:
        #         img_file.write(logo_response.content)
        #     logo_path_relative = logo_filename.replace("../static", "")
        # else:
        #     print(f"Failed to download logo from {logo_url}")
        #     continue

        # Create service item if it does not exist
        if service_id not in service_data:
            service_data[service_id] = {
                "id": service_id,
                "link": link,
                "external_link": ext_link,
                "title": title,
                "description": description,
                # "logo": logo_path_relative,
                "category": {
                    "id": category_id,
                    "name": "Others",
                    "ids": [category_id]  # Initialize ids with category_id
                }
            }
        else:
            # If the service item exists, append category_id to its ids
            if category_id not in service_data[service_id]["category"]["ids"]:
                service_data[service_id]["category"]["ids"].append(category_id)

    # Save all services in a single file
    output_path = os.path.join(output_dir, f"{service_name}.json")
    with open(output_path, "w") as outfile:
        json.dump(list(service_data.values()), outfile, indent=4)
    print(f"Data saved to {output_path}")


print("===============================================")
print("              Extracting  ")
print("===============================================")

extract_services()
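
Both versions of the script assume the same card markup on the elest.io listing pages. As a minimal, self-contained sketch of the DOM shape the selectors above expect — the HTML here is invented for illustration, not copied from elest.io:

from bs4 import BeautifulSoup

# Invented markup mirroring the selectors used by extract_services()
html = """
<div id="cards">
  <a class="card-container" href="/fully-managed-services/example-db/">
    <div class="template__header"><span class="template__label">Example DB</span></div>
    <p class="template__description">A managed example database.</p>
    <div class="template__image--img"><img src="/images/example-db.svg"></div>
  </a>
</div>
"""

soup = BeautifulSoup(html, "html.parser")
cards = soup.select_one("#cards").find_all("a", href=True, class_="card-container")
for a_tag in cards:
    service_id = a_tag["href"].strip("/").split("/")[-1]
    title = a_tag.select_one(".template__header > .template__label").text.strip()
    description = a_tag.select_one(".template__description").text.strip()
    print(service_id, "|", title, "|", description)
    # -> example-db | Example DB | A managed example database.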
136 changes: 58 additions & 78 deletions extractor/fetch_others.py
@@ -4,62 +4,55 @@
 from bs4 import BeautifulSoup
 from urllib.parse import urljoin, urlparse
 
-def extract_services():
-    # Paths
-    service_name = "others"
-    service_dir = "./services"
-    output_dir = f"./others"
-    images_dir = f"../static/images/{service_name}"
+SERVICE_NAME = "others"
+SERVICE_DIR = "./services"
+OUTPUT_DIR = "./others"
+IMAGES_DIR = "../static/images"
+CATEGORY_ID = "others"
+CATEGORY_NAME = "Others"
+
+# Ensure directories exist
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+os.makedirs(IMAGES_DIR, exist_ok=True)
+
+# Load all previous services
+services = {}
+for file_name in os.listdir(SERVICE_DIR):
+    if file_name.endswith(".json"):
+        with open(os.path.join(SERVICE_DIR, file_name), "r") as f:
+            data = json.load(f)
+            for entry in data:
+                services[entry["id"]] = entry
+
+services_list = ["databases", "applications", "development", "hosting-and-infrastructure"]
+new_services = {}
+
+for single_service in services_list:
+    external_link = f"https://elest.io/fully-managed-services/{single_service}"
 
-    # Ensure directories exist
-    os.makedirs(output_dir, exist_ok=True)
-    os.makedirs(images_dir, exist_ok=True)
-
-    # Load services data
-    services = []
-    for file_name in os.listdir(service_dir):
-        if file_name.endswith(".json"):
-            with open(os.path.join(service_dir, file_name), "r") as f:
-                data = json.load(f)
-                for entry in data:
-                    services.append(entry)
-
-    category_id = "others"
-
-    service_data = {}
-
-    external_link = "https://elest.io/fully-managed-services"
-
     # Fetch the external page with explicit encoding
     response = requests.get(external_link)
-    response.encoding = 'utf-8'  # Set encoding if necessary
+    response.encoding = 'utf-8'
     if response.status_code != 200:
         print(f"Failed to fetch {external_link}")
-        return
+        continue
 
     soup = BeautifulSoup(response.text, "html.parser")
 
     # Attempt to locate #cards container
     cards_container = soup.select_one("#cards") or soup.find("div", id="cards")
 
     print(len(cards_container), "cards found")
 
     if not cards_container:
         print(f"No #cards container found for {external_link}")
-        return
+        continue
 
     for a_tag in cards_container.find_all("a", href=True, class_="card-container"):
         href = a_tag["href"].strip("/")
-        service_id = href.split('/')[-1]  # Create ID for the service item
+        service_id = href.split('/')[-1]
 
-        id_exist = any(obj['id'] == service_id for obj in services)
-
-        if id_exist:
+        if service_id in services:
             continue
         else:
-            print("New services found", service_id)
+            print("New service found", service_id)
 
-        link = f"/{service_name}/{category_id}/{service_id}"
+        link = f"/{single_service}/{CATEGORY_ID}/{service_id}"
         ext_link = urljoin("https://elest.io/", href)
         title = a_tag.select_one(".template__header > .template__label").text.strip()
         description = a_tag.select_one(".template__description").text.strip()
@@ -72,50 +65,37 @@ def extract_services():
 
         logo_url = urljoin(external_link, logo_tag["src"])
         logo_ext = os.path.splitext(urlparse(logo_url).path)[-1]
-        logo_path = f"{images_dir}/{category_id}/{service_id}"
+        logo_path = f"{IMAGES_DIR}/{single_service}/{CATEGORY_ID}/{service_id}"
        os.makedirs(logo_path, exist_ok=True)
         logo_filename = f"{logo_path}/logo{logo_ext}"
 
         # Download the logo image
-        # logo_response = requests.get(logo_url, stream=True)
-        # if logo_response.status_code == 200:
-        #     with open(logo_filename, "wb") as img_file:
-        #         img_file.write(logo_response.content)
-        #     logo_path_relative = logo_filename.replace("../static", "")
-        # else:
-        #     print(f"Failed to download logo from {logo_url}")
-        #     continue
-
-        # Create service item if it does not exist
-        if service_id not in service_data:
-            service_data[service_id] = {
-                "id": service_id,
-                "link": link,
-                "external_link": ext_link,
-                "title": title,
-                "description": description,
-                # "logo": logo_path_relative,
-                "category": {
-                    "id": category_id,
-                    "name": "Others",
-                    "ids": [category_id]  # Initialize ids with category_id
-                }
-            }
+        logo_response = requests.get(logo_url, stream=True)
+        if logo_response.status_code == 200:
+            with open(logo_filename, "wb") as img_file:
+                img_file.write(logo_response.content)
+            logo_path_relative = logo_filename.replace("../static", "")
         else:
-            # If service item exists, append category_id to ids
-            if category_id not in service_data[service_id]["category"]["ids"]:
-                service_data[service_id]["category"]["ids"].append(category_id)
-
-
+            print(f"Failed to download logo from {logo_url}")
+            continue
 
-    # Save all services in a single file
-    output_path = os.path.join(output_dir, f"{service_name}.json")
-    with open(output_path, "w") as outfile:
-        json.dump(list(service_data.values()), outfile, indent=4)
-    print(f"Data saved to {output_path}")
+        new_services[service_id] = {
+            "id": service_id,
+            "link": link,
+            "external_link": ext_link,
+            "title": title,
+            "description": description,
+            "logo": logo_path_relative,
+            "category": {
+                "id": CATEGORY_ID,
+                "name": CATEGORY_NAME,
+                "ids": [CATEGORY_ID]
+            }
+        }
 
-print("===============================================")
-print(f"              Extracting  ")
-print("===============================================")
+output_path = os.path.join(OUTPUT_DIR, f"{SERVICE_NAME}.json")
+with open(output_path, "w") as outfile:
+    json.dump(list(new_services.values()), outfile, indent=4)
 
-extract_services()
+print(f"Data saved to {output_path}")