From 12f773580f470cb9a38a4ba6aa304557ab925d45 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Sun, 11 Aug 2024 21:23:28 +0300 Subject: [PATCH 1/3] Speed up purges by re-using a requests session --- build_docs.py | 123 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 91 insertions(+), 32 deletions(-) diff --git a/build_docs.py b/build_docs.py index 212989e..719ee01 100755 --- a/build_docs.py +++ b/build_docs.py @@ -434,7 +434,8 @@ def build_robots_txt( www_root: Path, group, skip_cache_invalidation, -): + session: requests.Session, +) -> None: """Disallow crawl of EOL versions in robots.txt.""" if not www_root.exists(): logging.info("Skipping robots.txt generation (www root does not even exists).") @@ -449,7 +450,7 @@ def build_robots_txt( robots_file.chmod(0o775) run(["chgrp", group, robots_file]) if not skip_cache_invalidation: - purge("robots.txt") + purge(session, "robots.txt") def build_sitemap( @@ -642,7 +643,7 @@ def full_build(self): """ return not self.quick and not self.language.html_only - def run(self) -> bool: + def run(self, session: requests.Session) -> bool: """Build and publish a Python doc, for a language, and a version.""" start_time = perf_counter() logging.info("Running.") @@ -653,7 +654,7 @@ def run(self) -> bool: if self.should_rebuild(): self.build_venv() self.build() - self.copy_build_to_webroot() + self.copy_build_to_webroot(session) self.save_state(build_duration=perf_counter() - start_time) except Exception as err: logging.exception("Badly handled exception, human, please help.") @@ -797,7 +798,7 @@ def build_venv(self): run([venv_path / "bin" / "python", "-m", "pip", "freeze", "--all"]) self.venv = venv_path - def copy_build_to_webroot(self): + def copy_build_to_webroot(self, session: requests.Session) -> None: """Copy a given build to the appropriate webroot with appropriate rights.""" logging.info("Publishing start.") self.www_root.mkdir(parents=True, exist_ok=True) @@ -909,9 +910,9 @@ def copy_build_to_webroot(self): prefixes = run(["find", "-L", targets_dir, "-samefile", target]).stdout prefixes = prefixes.replace(targets_dir + "/", "") prefixes = [prefix + "/" for prefix in prefixes.split("\n") if prefix] - purge(*prefixes) + purge(session, *prefixes) for prefix in prefixes: - purge(*[prefix + p for p in changed]) + purge(session, *[prefix + p for p in changed]) logging.info("Publishing done") def should_rebuild(self): @@ -977,7 +978,15 @@ def save_state(self, build_duration: float): state_file.write_text(tomlkit.dumps(states), encoding="UTF-8") -def symlink(www_root: Path, language: Language, directory: str, name: str, group: str, skip_cache_invalidation: bool): +def symlink( + www_root: Path, + language: Language, + directory: str, + name: str, + group: str, + skip_cache_invalidation: bool, + session: requests.Session, +) -> None: """Used by major_symlinks and dev_symlink to maintain symlinks.""" if language.tag == "en": # english is rooted on /, no /en/ path = www_root @@ -994,12 +1003,17 @@ def symlink(www_root: Path, language: Language, directory: str, name: str, group link.symlink_to(directory) run(["chown", "-h", ":" + group, str(link)]) if not skip_cache_invalidation: - purge_path(www_root, link) + purge_path(session, www_root, link) def major_symlinks( - www_root: Path, group, versions: Iterable[Version], languages: Iterable[Language], skip_cache_invalidation: bool -): + www_root: Path, + group: str, + versions: Iterable[Version], + languages: Iterable[Language], + skip_cache_invalidation: bool, + session: requests.Session, +) -> None: """Maintains the /2/ and /3/ symlinks for each languages. Like: @@ -1009,11 +1023,26 @@ def major_symlinks( """ current_stable = Version.current_stable(versions).name for language in languages: - symlink(www_root, language, current_stable, "3", group, skip_cache_invalidation) - symlink(www_root, language, "2.7", "2", group, skip_cache_invalidation) + symlink( + www_root, + language, + current_stable, + "3", + group, + skip_cache_invalidation, + session, + ) + symlink(www_root, language, "2.7", "2", group, skip_cache_invalidation, session) -def dev_symlink(www_root: Path, group, versions, languages, skip_cache_invalidation: bool): +def dev_symlink( + www_root: Path, + group, + versions, + languages, + skip_cache_invalidation: bool, + session: requests.Session, +) -> None: """Maintains the /dev/ symlinks for each languages. Like: @@ -1023,10 +1052,18 @@ def dev_symlink(www_root: Path, group, versions, languages, skip_cache_invalidat """ current_dev = Version.current_dev(versions).name for language in languages: - symlink(www_root, language, current_dev, "dev", group, skip_cache_invalidation) + symlink( + www_root, + language, + current_dev, + "dev", + group, + skip_cache_invalidation, + session, + ) -def purge(*paths): +def purge(session: requests.Session, *paths: Path | str) -> None: """Remove one or many paths from docs.python.org's CDN. To be used when a file change, so the CDN fetch the new one. @@ -1035,20 +1072,22 @@ def purge(*paths): for path in paths: url = urljoin(base, str(path)) logging.debug("Purging %s from CDN", url) - requests.request("PURGE", url, timeout=30) + session.request("PURGE", url, timeout=30) -def purge_path(www_root: Path, path: Path): +def purge_path(session: requests.Session, www_root: Path, path: Path) -> None: """Recursively remove a path from docs.python.org's CDN. To be used when a directory change, so the CDN fetch the new one. """ - purge(*[file.relative_to(www_root) for file in path.glob("**/*")]) - purge(path.relative_to(www_root)) - purge(str(path.relative_to(www_root)) + "/") + purge(session, *[file.relative_to(www_root) for file in path.glob("**/*")]) + purge(session, path.relative_to(www_root)) + purge(session, str(path.relative_to(www_root)) + "/") -def proofread_canonicals(www_root: Path, skip_cache_invalidation: bool) -> None: +def proofread_canonicals( + www_root: Path, skip_cache_invalidation: bool, session: requests.Session +) -> None: """In www_root we check that all canonical links point to existing contents. It can happen that a canonical is "broken": @@ -1070,11 +1109,11 @@ def proofread_canonicals(www_root: Path, skip_cache_invalidation: bool) -> None: html = html.replace(canonical.group(0), "") file.write_text(html, encoding="UTF-8", errors="surrogateescape") if not skip_cache_invalidation: - purge(str(file).replace("/srv/docs.python.org/", "")) + purge(session, str(file).replace("/srv/docs.python.org/", "")) -def parse_versions_from_devguide(): - releases = requests.get( +def parse_versions_from_devguide(session: requests.Session) -> list[Version]: + releases = session.get( "https://raw.githubusercontent.com/" "python/devguide/main/include/release-cycle.json", timeout=30, @@ -1101,8 +1140,9 @@ def parse_languages_from_config(): def build_docs(args) -> bool: - """Build all docs (each languages and each versions).""" - versions = parse_versions_from_devguide() + """Build all docs (each language and each version).""" + session = requests.Session() + versions = parse_versions_from_devguide(session) languages = parse_languages_from_config() todo = [ (version, language) @@ -1130,7 +1170,7 @@ def build_docs(args) -> bool: builder = DocBuilder( version, versions, language, languages, cpython_repo, **vars(args) ) - all_built_successfully &= builder.run() + all_built_successfully &= builder.run(session) logging.root.handlers[0].setFormatter( logging.Formatter("%(asctime)s %(levelname)s: %(message)s") ) @@ -1138,11 +1178,30 @@ def build_docs(args) -> bool: build_sitemap(versions, languages, args.www_root, args.group) build_404(args.www_root, args.group) build_robots_txt( - versions, languages, args.www_root, args.group, args.skip_cache_invalidation + versions, + languages, + args.www_root, + args.group, + args.skip_cache_invalidation, + session, + ) + major_symlinks( + args.www_root, + args.group, + versions, + languages, + args.skip_cache_invalidation, + session, + ) + dev_symlink( + args.www_root, + args.group, + versions, + languages, + args.skip_cache_invalidation, + session, ) - major_symlinks(args.www_root, args.group, versions, languages, args.skip_cache_invalidation) - dev_symlink(args.www_root, args.group, versions, languages, args.skip_cache_invalidation) - proofread_canonicals(args.www_root, args.skip_cache_invalidation) + proofread_canonicals(args.www_root, args.skip_cache_invalidation, session) return all_built_successfully From 4d05bb0bfe729078bc2f547352bb46934a916740 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Sun, 11 Aug 2024 22:12:20 +0300 Subject: [PATCH 2/3] Replace requests with urllib3 --- build_docs.py | 68 ++++++++++++++++++++++++------------------------ requirements.txt | 2 +- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/build_docs.py b/build_docs.py index 719ee01..9b3d592 100755 --- a/build_docs.py +++ b/build_docs.py @@ -46,11 +46,10 @@ from typing import Iterable from urllib.parse import urljoin -import zc.lockfile import jinja2 -import requests import tomlkit - +import urllib3 +import zc.lockfile try: from os import EX_OK, EX_SOFTWARE as EX_FAILURE @@ -434,7 +433,7 @@ def build_robots_txt( www_root: Path, group, skip_cache_invalidation, - session: requests.Session, + http: urllib3.PoolManager, ) -> None: """Disallow crawl of EOL versions in robots.txt.""" if not www_root.exists(): @@ -450,7 +449,7 @@ def build_robots_txt( robots_file.chmod(0o775) run(["chgrp", group, robots_file]) if not skip_cache_invalidation: - purge(session, "robots.txt") + purge(http, "robots.txt") def build_sitemap( @@ -643,7 +642,7 @@ def full_build(self): """ return not self.quick and not self.language.html_only - def run(self, session: requests.Session) -> bool: + def run(self, http: urllib3.PoolManager) -> bool: """Build and publish a Python doc, for a language, and a version.""" start_time = perf_counter() logging.info("Running.") @@ -654,7 +653,7 @@ def run(self, session: requests.Session) -> bool: if self.should_rebuild(): self.build_venv() self.build() - self.copy_build_to_webroot(session) + self.copy_build_to_webroot(http) self.save_state(build_duration=perf_counter() - start_time) except Exception as err: logging.exception("Badly handled exception, human, please help.") @@ -798,7 +797,7 @@ def build_venv(self): run([venv_path / "bin" / "python", "-m", "pip", "freeze", "--all"]) self.venv = venv_path - def copy_build_to_webroot(self, session: requests.Session) -> None: + def copy_build_to_webroot(self, http: urllib3.PoolManager) -> None: """Copy a given build to the appropriate webroot with appropriate rights.""" logging.info("Publishing start.") self.www_root.mkdir(parents=True, exist_ok=True) @@ -910,9 +909,9 @@ def copy_build_to_webroot(self, session: requests.Session) -> None: prefixes = run(["find", "-L", targets_dir, "-samefile", target]).stdout prefixes = prefixes.replace(targets_dir + "/", "") prefixes = [prefix + "/" for prefix in prefixes.split("\n") if prefix] - purge(session, *prefixes) + purge(http, *prefixes) for prefix in prefixes: - purge(session, *[prefix + p for p in changed]) + purge(http, *[prefix + p for p in changed]) logging.info("Publishing done") def should_rebuild(self): @@ -985,7 +984,7 @@ def symlink( name: str, group: str, skip_cache_invalidation: bool, - session: requests.Session, + http: urllib3.PoolManager, ) -> None: """Used by major_symlinks and dev_symlink to maintain symlinks.""" if language.tag == "en": # english is rooted on /, no /en/ @@ -1003,7 +1002,7 @@ def symlink( link.symlink_to(directory) run(["chown", "-h", ":" + group, str(link)]) if not skip_cache_invalidation: - purge_path(session, www_root, link) + purge_path(http, www_root, link) def major_symlinks( @@ -1012,7 +1011,7 @@ def major_symlinks( versions: Iterable[Version], languages: Iterable[Language], skip_cache_invalidation: bool, - session: requests.Session, + http: urllib3.PoolManager, ) -> None: """Maintains the /2/ and /3/ symlinks for each languages. @@ -1030,9 +1029,9 @@ def major_symlinks( "3", group, skip_cache_invalidation, - session, + http, ) - symlink(www_root, language, "2.7", "2", group, skip_cache_invalidation, session) + symlink(www_root, language, "2.7", "2", group, skip_cache_invalidation, http) def dev_symlink( @@ -1041,7 +1040,7 @@ def dev_symlink( versions, languages, skip_cache_invalidation: bool, - session: requests.Session, + http: urllib3.PoolManager, ) -> None: """Maintains the /dev/ symlinks for each languages. @@ -1059,11 +1058,11 @@ def dev_symlink( "dev", group, skip_cache_invalidation, - session, + http, ) -def purge(session: requests.Session, *paths: Path | str) -> None: +def purge(http: urllib3.PoolManager, *paths: Path | str) -> None: """Remove one or many paths from docs.python.org's CDN. To be used when a file change, so the CDN fetch the new one. @@ -1072,21 +1071,21 @@ def purge(session: requests.Session, *paths: Path | str) -> None: for path in paths: url = urljoin(base, str(path)) logging.debug("Purging %s from CDN", url) - session.request("PURGE", url, timeout=30) + http.request("PURGE", url, timeout=30) -def purge_path(session: requests.Session, www_root: Path, path: Path) -> None: +def purge_path(http: urllib3.PoolManager, www_root: Path, path: Path) -> None: """Recursively remove a path from docs.python.org's CDN. To be used when a directory change, so the CDN fetch the new one. """ - purge(session, *[file.relative_to(www_root) for file in path.glob("**/*")]) - purge(session, path.relative_to(www_root)) - purge(session, str(path.relative_to(www_root)) + "/") + purge(http, *[file.relative_to(www_root) for file in path.glob("**/*")]) + purge(http, path.relative_to(www_root)) + purge(http, str(path.relative_to(www_root)) + "/") def proofread_canonicals( - www_root: Path, skip_cache_invalidation: bool, session: requests.Session + www_root: Path, skip_cache_invalidation: bool, http: urllib3.PoolManager ) -> None: """In www_root we check that all canonical links point to existing contents. @@ -1109,11 +1108,12 @@ def proofread_canonicals( html = html.replace(canonical.group(0), "") file.write_text(html, encoding="UTF-8", errors="surrogateescape") if not skip_cache_invalidation: - purge(session, str(file).replace("/srv/docs.python.org/", "")) + purge(http, str(file).replace("/srv/docs.python.org/", "")) -def parse_versions_from_devguide(session: requests.Session) -> list[Version]: - releases = session.get( +def parse_versions_from_devguide(http: urllib3.PoolManager) -> list[Version]: + releases = http.request( + "GET", "https://raw.githubusercontent.com/" "python/devguide/main/include/release-cycle.json", timeout=30, @@ -1141,8 +1141,8 @@ def parse_languages_from_config(): def build_docs(args) -> bool: """Build all docs (each language and each version).""" - session = requests.Session() - versions = parse_versions_from_devguide(session) + http = urllib3.PoolManager() + versions = parse_versions_from_devguide(http) languages = parse_languages_from_config() todo = [ (version, language) @@ -1170,7 +1170,7 @@ def build_docs(args) -> bool: builder = DocBuilder( version, versions, language, languages, cpython_repo, **vars(args) ) - all_built_successfully &= builder.run(session) + all_built_successfully &= builder.run(http) logging.root.handlers[0].setFormatter( logging.Formatter("%(asctime)s %(levelname)s: %(message)s") ) @@ -1183,7 +1183,7 @@ def build_docs(args) -> bool: args.www_root, args.group, args.skip_cache_invalidation, - session, + http, ) major_symlinks( args.www_root, @@ -1191,7 +1191,7 @@ def build_docs(args) -> bool: versions, languages, args.skip_cache_invalidation, - session, + http, ) dev_symlink( args.www_root, @@ -1199,9 +1199,9 @@ def build_docs(args) -> bool: versions, languages, args.skip_cache_invalidation, - session, + http, ) - proofread_canonicals(args.www_root, args.skip_cache_invalidation, session) + proofread_canonicals(args.www_root, args.skip_cache_invalidation, http) return all_built_successfully diff --git a/requirements.txt b/requirements.txt index f51c7d0..cf12434 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ jinja2 -requests sentry-sdk tomlkit +urllib3 zc.lockfile From 81cce71c9e46add68452f77a5a0fe558e13ffb49 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade <1324225+hugovk@users.noreply.github.com> Date: Mon, 12 Aug 2024 00:01:51 +0300 Subject: [PATCH 3/3] Install urllib3 >= v2 Co-authored-by: Adam Turner <9087854+AA-Turner@users.noreply.github.com> --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index cf12434..4602e89 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ jinja2 sentry-sdk tomlkit -urllib3 +urllib3>=2 zc.lockfile