From 65bdc4a054bac294d17292482a492e892e622e9a Mon Sep 17 00:00:00 2001
From: Hamza Tahir
Date: Mon, 11 Nov 2024 20:38:38 +0100
Subject: [PATCH] Rework URL scraping to fetch the docs.zenml.io sitemap

- Add parent_image and skip_build settings in the staging rag.yaml.
- Fix the labels check in populate_index.py.
- Add a use_dev_set setting in url_scraper.py.
- Remove unused code in url_scraping_utils.py.
- Update the get_all_pages function, logging, and get_nested_readme_urls
  handling in url_scraping_utils.py: get_all_pages now fetches the
  sitemap from docs.zenml.io.
---
 llm-complete-guide/configs/staging/rag.yaml |   3 +-
 llm-complete-guide/steps/populate_index.py  |   4 +-
 llm-complete-guide/steps/url_scraper.py     |   1 +
 .../steps/url_scraping_utils.py             | 202 ++----------------
 4 files changed, 24 insertions(+), 186 deletions(-)

diff --git a/llm-complete-guide/configs/staging/rag.yaml b/llm-complete-guide/configs/staging/rag.yaml
index 2b9637ae..0a59f84e 100644
--- a/llm-complete-guide/configs/staging/rag.yaml
+++ b/llm-complete-guide/configs/staging/rag.yaml
@@ -20,7 +20,8 @@ settings:
       ZENML_ENABLE_RICH_TRACEBACK: FALSE
       ZENML_LOGGING_VERBOSITY: INFO
     python_package_installer: "uv"
-
+    parent_image: "339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml/llm_index_and_evaluate-orchestrator:bceb36ef0ab6"
+    skip_build: true
 steps:
   url_scraper:
     parameters:
diff --git a/llm-complete-guide/steps/populate_index.py b/llm-complete-guide/steps/populate_index.py
index a6378d76..844fc2a2 100644
--- a/llm-complete-guide/steps/populate_index.py
+++ b/llm-complete-guide/steps/populate_index.py
@@ -461,8 +461,8 @@ def draw_bar_chart(
     Returns:
         None
     """
-    if label is None:
-        label = ""
+    if labels is None:
+        labels = []
 
     max_value = max(data)
 
diff --git a/llm-complete-guide/steps/url_scraper.py b/llm-complete-guide/steps/url_scraper.py
index e2d85df5..9c54563b 100644
--- a/llm-complete-guide/steps/url_scraper.py
+++ b/llm-complete-guide/steps/url_scraper.py
@@ -40,6 +40,7 @@ def url_scraper(
     """
     # We comment this out to make this pipeline faster
    # examples_readme_urls = get_nested_readme_urls(repo_url)
+    use_dev_set = False
 
     if use_dev_set:
         docs_urls = [
diff --git a/llm-complete-guide/steps/url_scraping_utils.py b/llm-complete-guide/steps/url_scraping_utils.py
index 5adc42a5..d6367cbf 100644
--- a/llm-complete-guide/steps/url_scraping_utils.py
+++ b/llm-complete-guide/steps/url_scraping_utils.py
@@ -13,200 +13,36 @@
 # permissions and limitations under the License.
 
 import re
-from functools import lru_cache
-from logging import getLogger
-from time import sleep
-from typing import List, Set, Tuple
-from urllib.parse import urljoin, urlparse
-
 import requests
 from bs4 import BeautifulSoup
-from constants import RATE_LIMIT
-from ratelimit import limits, sleep_and_retry
-
-logger = getLogger(__name__)
-
-
-def is_valid_url(url: str, base: str) -> bool:
-    """
-    Check if the given URL is valid, has the same base as the provided base,
-    and does not contain any version-specific paths.
-
-    Args:
-        url (str): The URL to check.
-        base (str): The base URL to compare against.
-
-    Returns:
-        bool: True if the URL is valid, has the same base, and does not contain version-specific paths, False otherwise.
- """ - parsed = urlparse(url) - if not bool(parsed.netloc) or parsed.netloc != base: - return False - - # Check if the URL contains a version pattern (e.g., /v/0.x.x/) - version_pattern = r"/v/0\.\d+\.\d+/" - return not re.search(version_pattern, url) - - -def strip_query_params(url: str) -> str: - """Strip query parameters from a URL. - - Args: - url (str): The URL to strip query parameters from. - - Returns: - str: The URL without query parameters. - """ - return url.split("?")[0] - - -def get_all_pages(url: str) -> List[str]: - """ - Retrieve all pages with the same base as the given URL. - - Args: - url (str): The URL to retrieve pages from. - - Returns: - List[str]: A list of all pages with the same base. - """ - logger.info(f"Scraping all pages from {url}...") - base_url = urlparse(url).netloc - - # Use a queue-based approach instead of recursion - pages = set() - queue = [url] - while queue: - current_url = queue.pop(0) - if current_url not in pages: - pages.add(current_url) - links = get_all_links(current_url, base_url) - queue.extend(links) - sleep(1 / RATE_LIMIT) # Rate limit the requests - - stripped_pages = [strip_query_params(page) for page in pages] - - logger.info(f"Found {len(stripped_pages)} pages.") - logger.info("Done scraping pages.") - return list(stripped_pages) - - -def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]: - """ - Recursively crawl a URL and its links, retrieving all valid links with the same base. - - Args: - url (str): The URL to crawl. - base (str): The base URL to compare against. - visited (Set[str]): A set of URLs that have been visited. Defaults to None. - - Returns: - Set[str]: A set of all valid links with the same base. - """ - if visited is None: - visited = set() - - visited.add(url) - logger.debug(f"Crawling URL: {url}") - links = get_all_links(url, base) - - for link in links: - if link not in visited: - visited.update(crawl(link, base, visited)) - sleep(1 / RATE_LIMIT) # Rate limit the recursive calls - - return visited - - -@sleep_and_retry -@limits(calls=RATE_LIMIT, period=1) -@lru_cache(maxsize=128) -def get_all_links(url: str, base: str) -> List[str]: - """ - Retrieve all valid links from a given URL with the same base. - - Args: - url (str): The URL to retrieve links from. - base (str): The base URL to compare against. - - Returns: - List[str]: A list of valid links with the same base. - """ - logger.debug(f"Retrieving links from {url}") - response = requests.get(url) - soup = BeautifulSoup(response.text, "html.parser") - links = [] - - for link in soup.find_all("a", href=True): - href = link["href"] - full_url = urljoin(url, href) - parsed_url = urlparse(full_url) - cleaned_url = parsed_url._replace(fragment="").geturl() - if is_valid_url(cleaned_url, base): - print(cleaned_url) - links.append(cleaned_url) - - logger.debug(f"Found {len(links)} valid links from {url}") - return links - - -@sleep_and_retry -@limits(calls=RATE_LIMIT, period=1) -@lru_cache(maxsize=128) -def get_readme_urls(repo_url: str) -> Tuple[List[str], List[str]]: - """ - Retrieve folder and README links from a GitHub repository. - - Args: - repo_url (str): The URL of the GitHub repository. - - Returns: - Tuple[List[str], List[str]]: A tuple containing two lists: folder links and README links. 
- """ - logger.debug(f"Retrieving README links from {repo_url}") - headers = {"Accept": "application/vnd.github+json"} - r = requests.get(repo_url, headers=headers) - soup = BeautifulSoup(r.text, "html.parser") - - folder_links = [] - readme_links = [] - - for link in soup.find_all("a", class_="js-navigation-open Link--primary"): - href = link["href"] - full_url = f"https://github.com{href}" - if "tree" in href: - folder_links.append(full_url) - elif "README.md" in href: - readme_links.append(full_url) +from typing import List +from logging import getLogger - logger.debug( - f"Found {len(folder_links)} folder links and {len(readme_links)} README links from {repo_url}" - ) - return folder_links, readme_links +logger = getLogger(__name__) -def get_nested_readme_urls(repo_url: str) -> List[str]: +def get_all_pages(base_url: str = "https://docs.zenml.io") -> List[str]: """ - Retrieve all nested README links from a GitHub repository. + Retrieve all pages from the ZenML documentation sitemap. Args: - repo_url (str): The URL of the GitHub repository. + base_url (str): The base URL of the documentation. Defaults to "https://docs.zenml.io" Returns: - List[str]: A list of all nested README links. + List[str]: A list of all documentation page URLs. """ - logger.info(f"Retrieving nested README links from {repo_url}...") - folder_links, readme_links = get_readme_urls(repo_url) - - for folder_link in folder_links: - _, nested_readme_links = get_readme_urls(folder_link) - readme_links.extend(nested_readme_links) - - logger.info( - f"Found {len(readme_links)} nested README links from {repo_url}" - ) - return readme_links - + logger.info("Fetching sitemap from docs.zenml.io...") + + # Fetch the sitemap + sitemap_url = f"{base_url}/sitemap.xml" + response = requests.get(sitemap_url) + soup = BeautifulSoup(response.text, "xml") + + # Extract all URLs from the sitemap + urls = [loc.text for loc in soup.find_all("loc")] + + logger.info(f"Found {len(urls)} pages in the sitemap.") + return urls def extract_parent_section(url: str) -> str: """