Add parent image and skip build setting in staging rag.yaml. Fix labels in populate_index.py. Add a use_dev_set setting in url_scraper.py and update url_scraper.py to use it. Remove unused code from url_scraping_utils.py. Update the get_all_pages and get_nested_readme_urls functions and the logging in url_scraping_utils.py. Fetch the zenml.io sitemap in get_all_pages.
htahir1 committed Nov 11, 2024
1 parent ca3aed7 commit 65bdc4a
Showing 4 changed files with 24 additions and 186 deletions.
3 changes: 2 additions & 1 deletion llm-complete-guide/configs/staging/rag.yaml
@@ -20,7 +20,8 @@ settings:
ZENML_ENABLE_RICH_TRACEBACK: FALSE
ZENML_LOGGING_VERBOSITY: INFO
python_package_installer: "uv"

parent_image: "339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml/llm_index_and_evaluate-orchestrator:bceb36ef0ab6"
skip_build: true
steps:
url_scraper:
parameters:
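For context, parent_image and skip_build map onto ZenML's Docker build settings: the staging pipeline reuses the pinned orchestrator image instead of building a fresh one on every run. The sketch below shows a roughly equivalent in-code configuration, assuming ZenML's DockerSettings API; the pipeline name is illustrative, and the exact nesting of these keys inside rag.yaml is not visible in the truncated diff.

# Sketch only: in-code equivalent of the new staging Docker settings,
# assuming ZenML's DockerSettings API. The pipeline name is illustrative.
from zenml import pipeline
from zenml.config import DockerSettings

docker_settings = DockerSettings(
    python_package_installer="uv",
    parent_image=(
        "339712793861.dkr.ecr.eu-central-1.amazonaws.com/zenml/"
        "llm_index_and_evaluate-orchestrator:bceb36ef0ab6"
    ),
    skip_build=True,  # reuse the parent image instead of rebuilding per run
    environment={
        "ZENML_ENABLE_RICH_TRACEBACK": "FALSE",
        "ZENML_LOGGING_VERBOSITY": "INFO",
    },
)


@pipeline(settings={"docker": docker_settings})
def llm_index_and_evaluate() -> None:
    # Steps such as url_scraper would be wired up here.
    ...

Skipping the build is only safe when the pinned parent image already contains the code and dependencies the pipeline needs, which is presumably what the llm_index_and_evaluate-orchestrator image provides.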
4 changes: 2 additions & 2 deletions llm-complete-guide/steps/populate_index.py
@@ -461,8 +461,8 @@ def draw_bar_chart(
Returns:
None
"""
if label is None:
label = ""
if labels is None:
labels = []

max_value = max(data)

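The change above fixes a name mismatch in draw_bar_chart: the function's parameter is labels, but the old guard checked an undefined name label, so the None default was never normalized. A minimal, hypothetical version of the function is sketched below to show the effect of the fix; only the two changed lines come from the commit, and the rest of the signature and body are assumptions.

from typing import List, Optional


def draw_bar_chart(
    data: List[int],
    labels: Optional[List[str]] = None,
    max_width: int = 40,
) -> None:
    # Hypothetical minimal version: print one horizontal ASCII bar per value.
    if labels is None:  # the old check used `label`, which is undefined here
        labels = []

    max_value = max(data)
    for i, value in enumerate(data):
        label = labels[i] if i < len(labels) else ""
        bar = "#" * int(value / max_value * max_width)
        print(f"{label:>10} | {bar} {value}")


draw_bar_chart([3, 7, 5], labels=["docs", "blog", "api"])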
1 change: 1 addition & 0 deletions llm-complete-guide/steps/url_scraper.py
@@ -40,6 +40,7 @@ def url_scraper(
"""
# We comment this out to make this pipeline faster
# examples_readme_urls = get_nested_readme_urls(repo_url)
use_dev_set = False
if use_dev_set:

docs_urls = [
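The new use_dev_set flag is hard-coded to False, so the step scrapes the full documentation set by default; setting it to True switches to a small fixed list of docs URLs (truncated out of this view) for faster development runs. A rough sketch of that gating logic follows; the helper name, the callable parameter, and the example URLs are illustrative rather than taken from the repository.

from typing import Callable, List

# Illustrative dev URLs; the real dev set lives in url_scraper.py and is
# truncated out of this diff view.
DEV_DOCS_URLS: List[str] = [
    "https://docs.zenml.io/getting-started/installation",
    "https://docs.zenml.io/user-guide/starter-guide",
]


def select_docs_urls(
    use_dev_set: bool,
    scrape_all: Callable[[], List[str]],
) -> List[str]:
    # Dev runs use the tiny fixed list; full runs call the real scraper,
    # e.g. get_all_pages from url_scraping_utils in this repository.
    return list(DEV_DOCS_URLS) if use_dev_set else scrape_all()


# Mirrors the hard-coded flag in the step: a full scrape by default.
urls = select_docs_urls(use_dev_set=False, scrape_all=lambda: ["https://docs.zenml.io"])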
202 changes: 19 additions & 183 deletions llm-complete-guide/steps/url_scraping_utils.py
@@ -13,200 +13,36 @@
# permissions and limitations under the License.

import re
from functools import lru_cache
from logging import getLogger
from time import sleep
from typing import List, Set, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from constants import RATE_LIMIT
from ratelimit import limits, sleep_and_retry

logger = getLogger(__name__)


def is_valid_url(url: str, base: str) -> bool:
"""
Check if the given URL is valid, has the same base as the provided base,
and does not contain any version-specific paths.
Args:
url (str): The URL to check.
base (str): The base URL to compare against.
Returns:
bool: True if the URL is valid, has the same base, and does not contain version-specific paths, False otherwise.
"""
parsed = urlparse(url)
if not bool(parsed.netloc) or parsed.netloc != base:
return False

# Check if the URL contains a version pattern (e.g., /v/0.x.x/)
version_pattern = r"/v/0\.\d+\.\d+/"
return not re.search(version_pattern, url)


def strip_query_params(url: str) -> str:
"""Strip query parameters from a URL.
Args:
url (str): The URL to strip query parameters from.
Returns:
str: The URL without query parameters.
"""
return url.split("?")[0]


def get_all_pages(url: str) -> List[str]:
"""
Retrieve all pages with the same base as the given URL.
Args:
url (str): The URL to retrieve pages from.
Returns:
List[str]: A list of all pages with the same base.
"""
logger.info(f"Scraping all pages from {url}...")
base_url = urlparse(url).netloc

# Use a queue-based approach instead of recursion
pages = set()
queue = [url]
while queue:
current_url = queue.pop(0)
if current_url not in pages:
pages.add(current_url)
links = get_all_links(current_url, base_url)
queue.extend(links)
sleep(1 / RATE_LIMIT) # Rate limit the requests

stripped_pages = [strip_query_params(page) for page in pages]

logger.info(f"Found {len(stripped_pages)} pages.")
logger.info("Done scraping pages.")
return list(stripped_pages)


def crawl(url: str, base: str, visited: Set[str] = None) -> Set[str]:
"""
Recursively crawl a URL and its links, retrieving all valid links with the same base.
Args:
url (str): The URL to crawl.
base (str): The base URL to compare against.
visited (Set[str]): A set of URLs that have been visited. Defaults to None.
Returns:
Set[str]: A set of all valid links with the same base.
"""
if visited is None:
visited = set()

visited.add(url)
logger.debug(f"Crawling URL: {url}")
links = get_all_links(url, base)

for link in links:
if link not in visited:
visited.update(crawl(link, base, visited))
sleep(1 / RATE_LIMIT) # Rate limit the recursive calls

return visited


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
@lru_cache(maxsize=128)
def get_all_links(url: str, base: str) -> List[str]:
"""
Retrieve all valid links from a given URL with the same base.
Args:
url (str): The URL to retrieve links from.
base (str): The base URL to compare against.
Returns:
List[str]: A list of valid links with the same base.
"""
logger.debug(f"Retrieving links from {url}")
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
links = []

for link in soup.find_all("a", href=True):
href = link["href"]
full_url = urljoin(url, href)
parsed_url = urlparse(full_url)
cleaned_url = parsed_url._replace(fragment="").geturl()
if is_valid_url(cleaned_url, base):
print(cleaned_url)
links.append(cleaned_url)

logger.debug(f"Found {len(links)} valid links from {url}")
return links


@sleep_and_retry
@limits(calls=RATE_LIMIT, period=1)
@lru_cache(maxsize=128)
def get_readme_urls(repo_url: str) -> Tuple[List[str], List[str]]:
"""
Retrieve folder and README links from a GitHub repository.
Args:
repo_url (str): The URL of the GitHub repository.
Returns:
Tuple[List[str], List[str]]: A tuple containing two lists: folder links and README links.
"""
logger.debug(f"Retrieving README links from {repo_url}")
headers = {"Accept": "application/vnd.github+json"}
r = requests.get(repo_url, headers=headers)
soup = BeautifulSoup(r.text, "html.parser")

folder_links = []
readme_links = []

for link in soup.find_all("a", class_="js-navigation-open Link--primary"):
href = link["href"]
full_url = f"https://github.com{href}"
if "tree" in href:
folder_links.append(full_url)
elif "README.md" in href:
readme_links.append(full_url)
    logger.debug(
        f"Found {len(folder_links)} folder links and {len(readme_links)} README links from {repo_url}"
    )
    return folder_links, readme_links


def get_nested_readme_urls(repo_url: str) -> List[str]:
    """
    Retrieve all nested README links from a GitHub repository.
    Args:
        repo_url (str): The URL of the GitHub repository.
    Returns:
        List[str]: A list of all nested README links.
    """
    logger.info(f"Retrieving nested README links from {repo_url}...")
    folder_links, readme_links = get_readme_urls(repo_url)

    for folder_link in folder_links:
        _, nested_readme_links = get_readme_urls(folder_link)
        readme_links.extend(nested_readme_links)

    logger.info(
        f"Found {len(readme_links)} nested README links from {repo_url}"
    )
    return readme_links


from typing import List
from logging import getLogger

logger = getLogger(__name__)


def get_all_pages(base_url: str = "https://docs.zenml.io") -> List[str]:
    """
    Retrieve all pages from the ZenML documentation sitemap.
    Args:
        base_url (str): The base URL of the documentation. Defaults to "https://docs.zenml.io"
    Returns:
        List[str]: A list of all documentation page URLs.
    """
    logger.info("Fetching sitemap from docs.zenml.io...")

    # Fetch the sitemap
    sitemap_url = f"{base_url}/sitemap.xml"
    response = requests.get(sitemap_url)
    soup = BeautifulSoup(response.text, "xml")

    # Extract all URLs from the sitemap
    urls = [loc.text for loc in soup.find_all("loc")]

    logger.info(f"Found {len(urls)} pages in the sitemap.")
    return urls

def extract_parent_section(url: str) -> str:
"""
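Taken together, the crawling helpers shown above (is_valid_url, strip_query_params, the queue-based get_all_pages, crawl, get_all_links, get_readme_urls, and get_nested_readme_urls) account for essentially all of the 183 deleted lines, while the sitemap-based get_all_pages, its imports, and the module logger make up the 19 additions. A single request to sitemap.xml replaces the page-by-page crawl, which is why the RATE_LIMIT constant, the ratelimit decorators, and lru_cache could be dropped. As a standalone illustration of the same idea, the sketch below collects every loc entry from the sitemap using requests and the standard library XML parser instead of BeautifulSoup with the lxml-backed "xml" parser used in the committed code; the function name here is not from the repository.

import xml.etree.ElementTree as ET
from typing import List

import requests

# Namespace used by standard sitemap.xml files.
SITEMAP_NS = "{http://www.sitemaps.org/schemas/sitemap/0.9}"


def pages_from_sitemap(base_url: str = "https://docs.zenml.io") -> List[str]:
    # One HTTP request instead of a page-by-page crawl, so no rate limiting
    # is needed.
    response = requests.get(f"{base_url}/sitemap.xml", timeout=30)
    response.raise_for_status()
    root = ET.fromstring(response.text)
    return [loc.text for loc in root.iter(f"{SITEMAP_NS}loc") if loc.text]


if __name__ == "__main__":
    urls = pages_from_sitemap()
    print(f"Found {len(urls)} documentation pages")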
