From 1b300a6c07aa9d0ec3985a00c23aff7f3d9fe4d9 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 17 Mar 2025 15:18:52 +0530 Subject: [PATCH 01/12] data required for archival --- src/current/_config_cockroachdb_local.yml | 1 - src/current/audit.py | 408 + src/current/audit_report.txt | 76545 ++++++++++++++++++++ src/current/v19.2_audit_report.txt | 76545 ++++++++++++++++++++ 4 files changed, 153498 insertions(+), 1 deletion(-) create mode 100644 src/current/audit.py create mode 100644 src/current/audit_report.txt create mode 100644 src/current/v19.2_audit_report.txt diff --git a/src/current/_config_cockroachdb_local.yml b/src/current/_config_cockroachdb_local.yml index 3440c9a8df7..98579dde61f 100644 --- a/src/current/_config_cockroachdb_local.yml +++ b/src/current/_config_cockroachdb_local.yml @@ -4,7 +4,6 @@ exclude: - "v2.0" - "v2.1" - "v19.1" -- "v19.2" - "v20.1" - "ci" - "scripts" diff --git a/src/current/audit.py b/src/current/audit.py new file mode 100644 index 00000000000..2d2c968735d --- /dev/null +++ b/src/current/audit.py @@ -0,0 +1,408 @@ +#!/usr/bin/env python3 +""" +audit.py + +An audit script that: +1) Finds cross-version links (categorized by location). +2) Finds cockroachlabs.com non-docs links. +3) Finds external (non-cockroachlabs.com) links. +4) Audits image/CSS/JS/font usage, categorizing them as present, missing, or external. + +**This version** uses a "fallback" approach in asset_status() so +we do *not* unconditionally remove "/docs/" from the path. Instead, +we generate multiple candidate paths and see if any match the disk. +""" + +import os +import sys +import re +import argparse +from bs4 import BeautifulSoup +from urllib.parse import urlparse + +def is_cross_version_link(url, current_version): + """ + Return (True, found_version) if `url` is a docs link pointing to a different version. + E.g. /docs/v19.2/... vs current_version v21.1 + """ + match = re.search(r'/docs/(v\d+\.\d+)', url) + if match: + version = match.group(1) + return (version != current_version, version) + return (False, None) + +def categorize_cross_version_link(tag): + """ + For cross-version links, figure out if they're in the sidebar, version-switcher, or body. 
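    Illustrative doctest (an assumed example, not from the original patch;
    it presumes a BeautifulSoup-parsed page):

        >>> from bs4 import BeautifulSoup
        >>> soup = BeautifulSoup(
        ...     '<div id="sidebar"><a href="/docs/v19.2/select.html">x</a></div>',
        ...     "html.parser")
        >>> categorize_cross_version_link(soup.a)
        'Sidebar Navigation'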
+ """ + if tag.find_parent(id="sidebar"): + return "Sidebar Navigation" + elif tag.find_parent(id="version-switcher"): + return "Version Switcher" + else: + return "Content Body" + +def find_assets(soup): + """ + Return a dict: { "images": set(), "css": set(), "js": set(), "fonts": set() } + by scanning , , + +''' + + html = re.sub(r"", nav_deps + "\n", html, flags=re.IGNORECASE) + + # Add offline styles + offline_styles = f'''''' + + html = re.sub(r"", offline_styles + "\n", html, flags=re.IGNORECASE) + + # Add navigation initialization + nav_init = """""" + + html = re.sub(r"", nav_init + "\n", html, flags=re.IGNORECASE) + + # Write output + dst_path.parent.mkdir(parents=True, exist_ok=True) + dst_path.write_text(html, encoding="utf-8") + + self.processed_files.add(str(rel_path)) + + except Exception as e: + self.log(f"Error processing {src_path}: {e}", "ERROR") + import traceback + traceback.print_exc() + + def fix_css_images(self): + """Fix image paths in CSS files""" + self.log("Fixing CSS image paths...") + + for css_file in (OUTPUT_ROOT / "css").rglob("*.css"): + try: + content = css_file.read_text(encoding="utf-8") + + # Fix various image URL patterns + content = re.sub( + r"url\((['\"]?)/?docs/images/([^)\"']+)\1\)", + r"url(\1../images/\2\1)", + content, + ) + content = re.sub( + r"url\((['\"]?)images/([^)\"']+)\1\)", + r"url(\1../images/\2\1)", + content, + ) + + css_file.write_text(content, encoding="utf-8") + + except Exception as e: + self.log(f"Error fixing CSS {css_file}: {e}", "WARNING") + + def download_google_fonts(self): + """Download and localize Google Fonts""" + self.log("Downloading Google Fonts...") + + fonts_dir = OUTPUT_ROOT / "fonts" + fonts_dir.mkdir(exist_ok=True) + + try: + # Get CSS + headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} + css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) + css_response.raise_for_status() + css_content = css_response.text + + # Extract and download font files + font_urls = set(re.findall(r"url\((https://fonts\.gstatic\.com/[^\)]+)\)", css_content)) + + for url in font_urls: + try: + # Download font + font_response = requests.get(url, headers=headers, timeout=10) + font_response.raise_for_status() + + # Save font + parsed = urlparse(url) + font_path = parsed.path.lstrip("/") + dst = fonts_dir / font_path + dst.parent.mkdir(parents=True, exist_ok=True) + dst.write_bytes(font_response.content) + + # Update CSS + css_content = css_content.replace(url, f"../fonts/{font_path}") + + except Exception as e: + self.log(f"Failed to download font from {url}: {e}", "WARNING") + + # Save localized CSS + (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(css_content, encoding="utf-8") + self.log("Google Fonts localized", "SUCCESS") + + except Exception as e: + self.log(f"Error downloading fonts: {e}", "ERROR") + # Create fallback + fallback = """/* Fallback fonts */ +body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif; } +code, pre { font-family: Consolas, Monaco, "Courier New", monospace; }""" + (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) + + def create_link_test_page(self): + """Create a test page to verify link processing""" + test_html = f""" + + + Link Test Page + + + +

Link Processing Test Results

+

This page shows how different link patterns were processed:

+ +

From pages NOT in version directory:

+
+
Context: Page at /index.html
+
Original: /docs/insert.html
+
Should be: v19.2/insert.html
+ Test Link +
+ +
+
Context: Page at /index.html
+
Original: /docs/v19.2/secure-a-cluster.html
+
Should be: v19.2/secure-a-cluster.html
+ Test Link +
+ +

From pages IN version directory:

+
+
Context: Page at /v19.2/index.html
+
Original: /docs/secure-a-cluster.html
+
Should be: secure-a-cluster.html (same dir)
+

This link would be at: v19.2/secure-a-cluster.html

+
+ +
+
Context: Page at /v19.2/index.html
+
Original: /docs/v19.2/secure-a-cluster.html
+
Should be: secure-a-cluster.html (same dir)
+

This link would be at: v19.2/secure-a-cluster.html

+
+ +

Special cases:

+
+
Original: /docs/stable/something.html
+
Should be: v19.2/something.html
+ Test Link +
+ +
+
Original: /docs/cockroachcloud/quickstart.html
+
Should be: cockroachcloud/quickstart.html
+ Test Link +
+ +
+
Original: /docs/releases/index.html
+
Should be: releases/index.html
+ Test Link +
+ +

Note: Click each link to verify it works correctly.

+ +""" + + test_path = OUTPUT_ROOT / "_link_test.html" + test_path.write_text(test_html) + self.log("Created link test page: _link_test.html", "SUCCESS") + + def create_index_page(self): + """Create the index page""" + index_html = f""" + + + + + CockroachDB {TARGET_VERSION} Documentation (Offline) + + + + + +

CockroachDB {TARGET_VERSION}

+

Offline Documentation Archive

+ +
+ + + + +
+

ā˜ļø CockroachDB Cloud

+ +
+ + +
+ +
+

šŸ“Œ Offline Archive

+

This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation. + All internal links have been updated to work offline.

+

Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}

+
+ + + +""" + + (OUTPUT_ROOT / "index.html").write_text(index_html) + self.log("Created index.html", "SUCCESS") + + def build(self): + """Main build process following Code 2's structure""" + print("\n" + "="*60) + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER") + print("="*60) + + # Verify paths + self.log(f"Jekyll Root: {JEKYLL_ROOT}") + self.log(f"Site Root: {SITE_ROOT}") + self.log(f"Docs Root: {DOCS_ROOT}") + self.log(f"Output: {OUTPUT_ROOT}") + + if not SITE_ROOT.exists(): + self.log("Site root not found! Run 'jekyll build' first.", "ERROR") + return False + + # Clean output directory + if OUTPUT_ROOT.exists(): + self.log("Cleaning existing output directory...") + shutil.rmtree(OUTPUT_ROOT) + OUTPUT_ROOT.mkdir(parents=True) + + # CRITICAL: Copy global assets FIRST (from SITE_ROOT, not DOCS_ROOT) + self.log("\n--- Copying Global Assets ---") + for asset_dir in ["css", "js", "img"]: + src = SITE_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied global {asset_dir}/", "SUCCESS") + + # Copy docs-specific assets + self.log("\n--- Copying Docs Assets ---") + for asset_dir in ["css", "js", "images", "_internal"]: + src = DOCS_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied docs {asset_dir}/", "SUCCESS") + + # Ensure critical navigation assets + self.log("\n--- Ensuring Navigation Assets ---") + self.ensure_asset( + "jquery.min.js", + [DOCS_ROOT / "js" / "jquery.min.js", SITE_ROOT / "js" / "jquery.min.js"], + "https://code.jquery.com/jquery-3.6.3.min.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.cookie.min.js", + [DOCS_ROOT / "js" / "jquery.cookie.min.js", SITE_ROOT / "js" / "jquery.cookie.min.js"], + "https://cdnjs.cloudflare.com/ajax/libs/jquery-cookie/1.4.1/jquery.cookie.min.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.navgoco.min.js", + [DOCS_ROOT / "js" / "jquery.navgoco.min.js", SITE_ROOT / "js" / "jquery.navgoco.min.js"], + "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.js", + OUTPUT_ROOT / "js" + ) + self.ensure_asset( + "jquery.navgoco.css", + [DOCS_ROOT / "css" / "jquery.navgoco.css", SITE_ROOT / "css" / "jquery.navgoco.css"], + "https://raw.githubusercontent.com/tefra/navgoco/master/src/jquery.navgoco.css", + OUTPUT_ROOT / "css" + ) + + # Load sidebar + self.log("\n--- Loading Sidebar ---") + self.load_sidebar() + + # Process HTML files + self.log("\n--- Processing HTML Files ---") + + # Collect files to process + files_to_process = [] + + # Target version files + version_dir = DOCS_ROOT / TARGET_VERSION + if version_dir.exists(): + files_to_process.extend(list(version_dir.rglob("*.html"))) + self.log(f"Found {len(files_to_process)} files in {TARGET_VERSION}/", "SUCCESS") + + # Common pages + for pattern in COMMON_PAGES: + if '*' in pattern: + files_to_process.extend(list(DOCS_ROOT.glob(pattern))) + else: + file_path = DOCS_ROOT / pattern + if file_path.exists(): + files_to_process.append(file_path) + + # Remove duplicates + files_to_process = list(set(files_to_process)) + self.log(f"Total files to process: {len(files_to_process)}") + + # Process each file + for i, file_path in enumerate(files_to_process, 1): + # Skip non-v19.2 version directories + rel_path = file_path.relative_to(DOCS_ROOT) + if rel_path.parts and rel_path.parts[0].startswith('v') and rel_path.parts[0] != TARGET_VERSION: + continue + + if i % 25 == 0: + self.log(f"Progress: 
{i}/{len(files_to_process)} ({i*100//len(files_to_process)}%)") + + self.process_html_file(file_path) + + self.log(f"Processed {len(self.processed_files)} files", "SUCCESS") + + # Final cleanup steps + self.log("\n--- Final Steps ---") + self.fix_css_images() + self.download_google_fonts() + self.create_index_page() + + # Summary + print("\n" + "="*60) + self.log("ARCHIVE COMPLETE!", "SUCCESS") + self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") + self.log(f"Total files: {len(self.processed_files)}") + self.log("āœ… Ask AI widget removed", "SUCCESS") + self.log("āœ… All links converted to relative paths", "SUCCESS") + self.log("āœ… Version directory (v19.2) added where needed", "SUCCESS") + + print(f"\nšŸŽ‰ Offline site built in {OUTPUT_ROOT}") + print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") + print(f"\nšŸ“Œ Note: Check console output above for link transformation details") + + return True + + +def main(): + """Main entry point""" + try: + archiver = OfflineArchiver() + success = archiver.build() + sys.exit(0 if success else 1) + except KeyboardInterrupt: + print("\n\nArchiving cancelled by user.") + sys.exit(1) + except Exception as e: + print(f"\nāŒ Fatal error: {e}") + import traceback + traceback.print_exc() + sys.exit(1) + + +if __name__ == "__main__": + main() \ No newline at end of file From 1fe7a2492ed23cff90e59223942504c76826c011 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Sun, 20 Jul 2025 20:04:44 +0530 Subject: [PATCH 04/12] working solution --- src/current/snapshot.py | 822 ++++++++-------------------------------- 1 file changed, 166 insertions(+), 656 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 96c63f40d95..c47d4e36e0c 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -Fixed version that preserves CSS structure from Code 2 +FIXED VERSION with correct JavaScript URL processing """ import re import shutil @@ -159,8 +159,149 @@ def ensure_asset(self, name, local_candidates, url, dest_dir): except Exception as e: self.log(f"Failed to download {name}: {e}", "ERROR") + def fix_sidebar_javascript(self, html): + """Fix the embedded sidebar JavaScript configuration and URL processing""" + + # Fix 1: Replace baseUrl in the embedded sidebar configuration + html = re.sub( + r'baseUrl:\s*["\'][^"\']*["\']', + 'baseUrl: ""', + html + ) + + # Fix 2: Find and replace the URL processing logic + # Look for the specific URL processing pattern in the JavaScript + url_processing_pattern = r'(if \(!/\^https\?:/.test\(url\)\) \{\s*url = sidebar\.baseUrl \+ url\.replace\([^}]+\}\s*return url;)' + + # More robust pattern that captures the entire URL processing block + better_pattern = r'(const urls = \(item\.urls \|\| \[\]\)\.map\(function \(url\) \{[\s\S]*?)(if \(!/\^https\?:/.test\(url\)\) \{[\s\S]*?url = sidebar\.baseUrl \+ url\.replace[\s\S]*?\}[\s\S]*?)(return url;[\s\S]*?\}\);)' + + def replace_url_processing(match): + start_part = match.group(1) + end_part = match.group(3) + + # Inject our custom URL processing logic + new_processing = r'''if (!/^https?:/.test(url)) { + // Remove /docs/ prefix if present + url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); + + // Better current directory detection for file:// URLs + var currentPath = window.location.pathname; + var currentDir = ''; + + // Extract just the relevant part of the path (handle both web and file:// URLs) + var 
pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + // Fallback: check if we're in root or any subdirectory + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } + } + } + + // Remove leading slash from URL + if (url.startsWith('/')) { + url = url.substring(1); + } + + // Handle stable -> v19.2 conversion + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + // Calculate relative path based on current directory context + if (currentDir) { + // We're in a subdirectory + if (url.startsWith(currentDir + '/')) { + // Same directory - remove the directory prefix + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + // Different directory - need to go up one level + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + // Root level file - go up one level + url = '../' + url; + } + } + + // Clean up any double slashes + url = url.replace(/\/+/g, '/'); + // Note: Keep .html extensions for offline file:// URLs + }''' + + return start_part + new_processing + end_part + + # Try to apply the replacement + new_html = re.sub(better_pattern, replace_url_processing, html, flags=re.DOTALL) + + # If the complex pattern didn't match, try a simpler approach + if new_html == html: + # Simple pattern - just replace the specific problematic line + simple_pattern = r'url = sidebar\.baseUrl \+ url\.replace\([^}]+\}' + + simple_replacement = r'''// Custom offline URL processing + url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); + + var currentPath = window.location.pathname; + var currentDir = ''; + + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } + } + } + + if (url.startsWith('/')) { + url = url.substring(1); + } + + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + if (currentDir) { + if (url.startsWith(currentDir + '/')) { + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + url = '../' + url; + } + } + + url = url.replace(/\/+/g, '/'); + // Keep .html extensions for offline use + }''' + + new_html = re.sub(simple_pattern, simple_replacement, html, flags=re.DOTALL) + + # Also fix the .html stripping issue + new_html = re.sub( + r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);', + 'url = url.replace("/index.html", ""); // Keep .html for offline', + new_html + ) + + # Debug output + if new_html != html: + self.log("Successfully replaced JavaScript URL processing", "SUCCESS") + else: + self.log("Warning: JavaScript URL processing replacement may have failed", "WARNING") + + return new_html + def process_html_file(self, src_path): - """Process a single HTML file using Code 2's 
approach""" + """Process a single HTML file""" try: rel_path = src_path.relative_to(DOCS_ROOT) dst_path = OUTPUT_ROOT / rel_path @@ -177,54 +318,8 @@ def process_html_file(self, src_path): # Read content html = src_path.read_text(encoding="utf-8") - # Inject sidebar HTML if available - if self.sidebar_html: - html = re.sub( - r"(
<div id=\"sidebar\"[^>]*>)(\s*?</div>
)", - rf"\1{self.sidebar_html}\2", - html, - flags=re.IGNORECASE, - ) - - # Parse with BeautifulSoup to fix sidebar links - soup = BeautifulSoup(html, "html.parser") - - # Remove Ask AI widget and other unwanted elements - remove_selectors = [ - # Ask AI widget - more comprehensive selectors - '.ask-ai', '#ask-ai', '[data-ask-ai]', '.ai-widget', '.kapa-widget', - 'script[src*="kapa"]', '#kapa-widget-container', '.kapa-trigger', - '.kapa-ai-button', '[class*="kapa"]', '[id*="kapa"]', - 'div[data-kapa-widget]', 'button[aria-label*="AI"]', - '[class*="ask-ai"]', '[id*="ask-ai"]', - 'iframe[src*="kapa"]', 'iframe[id*="kapa"]', - - # Version switcher - '.version-switcher', '#version-switcher', '.version-dropdown', - - # Feedback widgets - '.feedback-widget', '#feedback-widget', '[id*="feedback"]', - '.helpful-widget', '.page-helpful', - - # Analytics - 'script[src*="googletagmanager"]', 'script[src*="google-analytics"]', - 'script[src*="segment"]', 'script[src*="heap"]', - ] - - for selector in remove_selectors: - for elem in soup.select(selector): - elem.decompose() - - # Also remove any script tags that contain kapa or AI-related code - for script in soup.find_all('script'): - if script.string and any(term in script.string.lower() for term in ['kapa', 'askai', 'ask-ai', 'aiwidget']): - script.decompose() - - # Remove any iframes that might be Ask AI related - for iframe in soup.find_all('iframe'): - src = iframe.get('src', '') - if any(term in src.lower() for term in ['kapa', 'ask', 'ai']): - iframe.decompose() + # CRITICAL: Fix sidebar JavaScript BEFORE other processing + html = self.fix_sidebar_javascript(html) # Inject sidebar HTML if available if self.sidebar_html: @@ -235,27 +330,20 @@ def process_html_file(self, src_path): flags=re.IGNORECASE, ) - # Parse with BeautifulSoup to fix sidebar links + # Parse with BeautifulSoup for additional cleanup soup = BeautifulSoup(html, "html.parser") # Remove Ask AI widget and other unwanted elements remove_selectors = [ - # Ask AI widget - more comprehensive selectors '.ask-ai', '#ask-ai', '[data-ask-ai]', '.ai-widget', '.kapa-widget', 'script[src*="kapa"]', '#kapa-widget-container', '.kapa-trigger', '.kapa-ai-button', '[class*="kapa"]', '[id*="kapa"]', 'div[data-kapa-widget]', 'button[aria-label*="AI"]', '[class*="ask-ai"]', '[id*="ask-ai"]', 'iframe[src*="kapa"]', 'iframe[id*="kapa"]', - - # Version switcher '.version-switcher', '#version-switcher', '.version-dropdown', - - # Feedback widgets '.feedback-widget', '#feedback-widget', '[id*="feedback"]', '.helpful-widget', '.page-helpful', - - # Analytics 'script[src*="googletagmanager"]', 'script[src*="google-analytics"]', 'script[src*="segment"]', 'script[src*="heap"]', ] @@ -264,7 +352,7 @@ def process_html_file(self, src_path): for elem in soup.select(selector): elem.decompose() - # Also remove any script tags that contain kapa or AI-related code + # Remove any script tags that contain kapa or AI-related code for script in soup.find_all('script'): if script.string and any(term in script.string.lower() for term in ['kapa', 'askai', 'ask-ai', 'aiwidget']): script.decompose() @@ -275,246 +363,10 @@ def process_html_file(self, src_path): if any(term in src.lower() for term in ['kapa', 'ask', 'ai']): iframe.decompose() - # Process sidebar links with clearer logic - sidebar_links = soup.select("#sidebar a[href], #sidebarMenu a[href], #mysidebar a[href]") - - for a in sidebar_links: - original_href = a.get("href", "") - - # Skip external links and anchors - if original_href.startswith(('http://', 
'https://', 'mailto:', '#', 'javascript:')): - continue - - # Store original - a['data-original-href'] = original_href - - # Process the href step by step - h = original_href.strip() - - # Check if this was originally a relative link (important for context) - was_relative = not h.startswith('/') - - # Step 1: Handle stable -> v19.2 conversion - h = h.replace('/stable/', f'/{TARGET_VERSION}/') - h = h.replace('stable/', f'{TARGET_VERSION}/') - - # Step 2: Remove domain/localhost if present - if '127.0.0.1:4000/' in h: - h = h.split('127.0.0.1:4000/')[-1] - if 'localhost:4000/' in h: - h = h.split('localhost:4000/')[-1] - - # Step 3: Remove /docs/ prefix - if h.startswith('/docs/'): - h = h[6:] # Remove '/docs/' - elif h.startswith('docs/'): - h = h[5:] # Remove 'docs/' - - # Step 4: Remove any remaining leading slashes - h = h.lstrip('/') - - # Step 5: Determine if we need to add version directory - needs_version = False - if h: # If we have a path - # Check if it already has a version - if not h.startswith(f'{TARGET_VERSION}/'): - # List of paths that should NOT get version prefix - non_versioned = [ - 'cockroachcloud/', 'releases/', 'advisories/', - 'images/', 'css/', 'js/', '_internal/', 'fonts/', - 'img/', 'assets/' - ] - - # Check if it's a special non-versioned path - is_special = any(h.startswith(d) for d in non_versioned) - - # Check if it has a file extension that indicates an asset - is_asset = any(h.endswith(ext) for ext in [ - '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', - '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot' - ]) - - # CRITICAL FIX: If we're already in a version directory and this is - # a simple doc page (like secure-a-cluster.html), we DON'T need to add version - # because it will be relative to the current directory - if is_in_version_dir and not is_special and not is_asset and '/' not in h: - # This is a simple filename in the same version directory - needs_version = False - if 'secure-a-cluster' in h: - self.log(f"NOT adding version to '{h}' - already in version dir", "WARNING") - elif was_relative and is_in_version_dir: - # Original link was relative AND we're in a version directory - needs_version = False - elif not is_special and not is_asset: - # Otherwise, if it's not special and not an asset, it needs version - needs_version = True - if sidebar_links.index(a) < 5: # Debug first few - self.log(f"Adding version to: {h} (was_relative={was_relative}, in_version={is_in_version_dir})", "DEBUG") - - # Add version directory if needed - if needs_version: - h = f'{TARGET_VERSION}/{h}' - - # Step 6: Add .html if needed - if h and not h.endswith('/') and not h.endswith('.html'): - # Check if it already has an extension - parts = h.split('/') - last_part = parts[-1] - if '.' 
not in last_part: - h += '.html' - - # Step 7: Calculate the correct relative path - # Now that we've been smart about adding version, this is simpler - - # Special debugging for secure-a-cluster.html - if 'secure-a-cluster' in h or sidebar_links.index(a) < 3: - self.log(f" Final path calc: h='{h}' in_v_dir={is_in_version_dir}", "DEBUG") - - if is_in_version_dir: - # We're in a version directory - if h.startswith(f'{TARGET_VERSION}/'): - # This shouldn't happen if we were smart above, but just in case - # Remove redundant version prefix - h = h[len(TARGET_VERSION) + 1:] - final_href = h - self.log(f" WARNING: Had to strip redundant version prefix", "WARNING") - elif any(h.startswith(d) for d in ['cockroachcloud/', 'releases/', 'advisories/', 'images/', 'css/', 'js/']): - # These need to go up a level from version dir - final_href = "../" + h - else: - # Simple filename in same directory - final_href = h - else: - # We're NOT in version dir, use normal prefix - final_href = prefix + h if h else prefix + "index.html" - - a["href"] = final_href - - # Debug output - if sidebar_links.index(a) < 5 or 'secure-a-cluster' in original_href: - self.log(f"Sidebar: '{original_href}' -> '{final_href}'", "INFO") - - # Process ALL other links - all_links = soup.select("a[href]") - content_link_count = 0 - for a in all_links: - if a in sidebar_links: # Skip already processed - continue - - original_href = a.get("href", "") - - # Skip external links and anchors - if original_href.startswith(('http://', 'https://', 'mailto:', '#', 'javascript:')): - continue - - # Store original - a['data-original-href'] = original_href - - # Apply same processing - h = original_href.strip() - - # Check if this was originally relative - was_relative = not h.startswith('/') - - # Handle stable -> v19.2 - h = h.replace('/stable/', f'/{TARGET_VERSION}/') - h = h.replace('stable/', f'{TARGET_VERSION}/') - - # Remove domain - if '127.0.0.1:4000/' in h: - h = h.split('127.0.0.1:4000/')[-1] - if 'localhost:4000/' in h: - h = h.split('localhost:4000/')[-1] - - # Remove /docs/ prefix - if h.startswith('/docs/'): - h = h[6:] - elif h.startswith('docs/'): - h = h[5:] - - # Remove leading slashes - h = h.lstrip('/') - - # Determine if we need to add version directory - needs_version = False - if h: # If we have a path - # Check if it already has a version - if not h.startswith(f'{TARGET_VERSION}/'): - # List of paths that should NOT get version prefix - non_versioned = [ - 'cockroachcloud/', 'releases/', 'advisories/', - 'images/', 'css/', 'js/', '_internal/', 'fonts/', - 'img/', 'assets/' - ] - - # Check if it's a special non-versioned path - is_special = any(h.startswith(d) for d in non_versioned) - - # Check for file extensions that indicate assets - is_asset = any(h.endswith(ext) for ext in [ - '.css', '.js', '.png', '.jpg', '.jpeg', '.gif', - '.svg', '.ico', '.woff', '.woff2', '.ttf', '.eot' - ]) - - # CRITICAL FIX: If we're already in a version directory and this is - # a simple doc page (like secure-a-cluster.html), we DON'T need to add version - if is_in_version_dir and not is_special and not is_asset and '/' not in h: - # This is a simple filename in the same version directory - needs_version = False - if 'secure-a-cluster' in h: - self.log(f"NOT adding version to '{h}' - already in version dir", "WARNING") - elif was_relative and is_in_version_dir: - # Original link was relative AND we're in a version directory - needs_version = False - elif not is_special and not is_asset: - # Otherwise, if it's not special and not an asset, 
it needs version - needs_version = True - - # Add version directory if needed - if needs_version: - h = f'{TARGET_VERSION}/{h}' - - # Add .html if needed - if h and not h.endswith('/') and not h.endswith('.html'): - parts = h.split('/') - last_part = parts[-1] - if '.' not in last_part: - h += '.html' - - # Calculate the correct relative path - # Now that we've been smart about adding version, this is simpler - - if is_in_version_dir: - # We're in a version directory - if h.startswith(f'{TARGET_VERSION}/'): - # This shouldn't happen if we were smart above, but just in case - # Remove redundant version prefix - h = h[len(TARGET_VERSION) + 1:] - final_href = h - elif any(h.startswith(d) for d in ['cockroachcloud/', 'releases/', 'advisories/', 'images/', 'css/', 'js/']): - # These need to go up a level from version dir - final_href = "../" + h - else: - # Simple filename in same directory - final_href = h - else: - # We're NOT in version dir, use normal prefix - final_href = prefix + h if h else prefix + "index.html" - - a["href"] = final_href - - # Debug first few content links - if content_link_count < 3 or 'secure-a-cluster' in original_href: - self.log(f"Content: '{original_href}' -> '{final_href}'", "INFO") - content_link_count += 1 - # Convert back to string html = str(soup) - # Convert back to string - html = str(soup) - - # Clean up query parameters + # Clean up various path patterns html = re.sub( r"(src|href)=\"([^\"?]+)\?[^\" ]+\"", lambda m: f'{m.group(1)}="{m.group(2)}"', @@ -522,24 +374,15 @@ def process_html_file(self, src_path): ) # Fix various path patterns - # Handle stable version references first html = re.sub(r'(href|src)="/docs/stable/', rf'\1="{TARGET_VERSION}/', html) html = re.sub(r'(href|src)="docs/stable/', rf'\1="{TARGET_VERSION}/', html) - - # Remove /docs/ prefix while preserving version - # This regex specifically handles /docs/vXX.X/ patterns html = re.sub(r'(href|src)="/docs/(v\d+\.\d+/[^"]+)"', r'\1="\2"', html) html = re.sub(r'(href|src)="docs/(v\d+\.\d+/[^"]+)"', r'\1="\2"', html) - - # For non-versioned docs paths html = re.sub(r'(href|src)="/docs/([^v][^"]+)"', r'\1="\2"', html) html = re.sub(r'(href|src)="docs/([^v][^"]+)"', r'\1="\2"', html) - - # Remove any remaining leading slashes from local paths - # Skip URLs that start with // (protocol-relative) html = re.sub(r'(href|src)="/(?!/)([^"]+)"', r'\1="\2"', html) - # Fix asset paths - this is critical for CSS + # Fix asset paths for asset in ["css", "js", "images", "_internal"]: html = re.sub( rf"(src|href)=[\"']/{asset}/([^\"']+)[\"']", @@ -547,31 +390,13 @@ def process_html_file(self, src_path): html, ) - # Fix img paths - html = re.sub( - r"(src|href)=[\"']/?img/([^\"']+)[\"']", - r'\1="img/\2"', - html, - ) - - # Fix docs/images paths - html = re.sub( - r"(src|href|xlink:href)=[\"']/?docs/images/([^\"']+)[\"']", - r'\1="images/\2"', - html, - ) + html = re.sub(r"(src|href)=[\"']/?img/([^\"']+)[\"']", r'\1="img/\2"', html) + html = re.sub(r"(src|href|xlink:href)=[\"']/?docs/images/([^\"']+)[\"']", r'\1="images/\2"', html) # Replace Google Fonts html = re.sub( r"]+fonts\.googleapis\.com[^>]+>", - '', - html, - ) - - # Fix CSS imports - html = re.sub( - r"@import\s+url\((['\"]?)/docs/(css/[^)]+)\1\);", - r"@import url(\2);", + f'', html, ) @@ -583,32 +408,7 @@ def process_html_file(self, src_path): html, ) - # # Fix remaining paths that need prefix - # # Only add prefix to paths that don't already have it and aren't external - # html = re.sub( - # 
r'(href|src)="(?!\.\./)(?!https?:)(?!mailto:)(?!#)(?!javascript:)(?!//)([^"]+)"', - # rf'\1="{prefix}\2"', - # html, - # ) - - # Debug: Check if we still have absolute paths - if len(self.processed_files) < 3: # Only for first few files - import re as regex - abs_paths = regex.findall(r'href="/(v19\.2/[^"]+)"', html) - if abs_paths: - self.log(f"Warning: Found absolute paths in {rel_path}: {abs_paths[:3]}", "WARNING") - - # Final cleanup - remove any double slashes or incorrect patterns - html = html.replace('"//', '"/') # Fix double slashes - html = re.sub(r'"\.\./+', '"../', html) # Fix multiple slashes after ../ - - # Fix any paths that might have lost their 'v' prefix - html = re.sub(r'(href|src)="(\.\./)*19\.2/', rf'\1="\2v19.2/', html) - - # Ensure v19.2 paths don't have unnecessary prefixes - html = re.sub(r'(href|src)="(\.\./)+v19\.2/v19\.2/', r'\1="\2v19.2/', html) - - # Inject navigation dependencies - CRITICAL FOR STYLING + # Inject navigation dependencies nav_deps = f''' @@ -626,12 +426,13 @@ def process_html_file(self, src_path): overflow: visible !important; }} -/* Hide online-only elements - comprehensive */ +/* Hide online-only elements */ .ask-ai, #ask-ai, [data-ask-ai], .ai-widget, .kapa-widget, [class*="kapa"], [id*="kapa"], [class*="ask-ai"], [id*="ask-ai"], .version-switcher, #version-switcher, .feedback-widget, button[aria-label*="AI"], div[data-kapa-widget], -.kapa-ai-button, .ai-assistant, .ai-chat {{ +.kapa-ai-button, .ai-assistant, .ai-chat, +.floating-action-button, .fab, [class*="floating-button"] {{ display: none !important; visibility: hidden !important; opacity: 0 !important; @@ -640,23 +441,6 @@ def process_html_file(self, src_path): left: -9999px !important; }} -/* Hide floating action buttons */ -.floating-action-button, .fab, [class*="floating-button"], -button[style*="fixed"], button[style*="absolute"] {{ - display: none !important; -}} - -/* Hide any fixed position elements in bottom right (common for chat widgets) */ -[style*="position: fixed"][style*="bottom"][style*="right"], -[style*="position:fixed"][style*="bottom"][style*="right"] {{ - display: none !important; -}} - -/* Hide iframes that might be chat widgets */ -iframe[src*="kapa"], iframe[id*="kapa"], iframe[class*="chat"] {{ - display: none !important; -}} - /* Navgoco styling */ .navgoco li {{ list-style: none; }} .navgoco li.active > a {{ @@ -673,21 +457,12 @@ def process_html_file(self, src_path): # Add navigation initialization nav_init = """""" html = re.sub(r"", nav_init + "\n", html, flags=re.IGNORECASE) @@ -761,121 +527,39 @@ def download_google_fonts(self): fonts_dir.mkdir(exist_ok=True) try: - # Get CSS headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'} css_response = requests.get(FONTS_CSS_URL, headers=headers, timeout=10) css_response.raise_for_status() css_content = css_response.text - # Extract and download font files font_urls = set(re.findall(r"url\((https://fonts\.gstatic\.com/[^\)]+)\)", css_content)) for url in font_urls: try: - # Download font font_response = requests.get(url, headers=headers, timeout=10) font_response.raise_for_status() - # Save font parsed = urlparse(url) font_path = parsed.path.lstrip("/") dst = fonts_dir / font_path dst.parent.mkdir(parents=True, exist_ok=True) dst.write_bytes(font_response.content) - # Update CSS css_content = css_content.replace(url, f"../fonts/{font_path}") except Exception as e: self.log(f"Failed to download font from {url}: {e}", "WARNING") - # Save localized CSS (OUTPUT_ROOT / "css" / 
"google-fonts.css").write_text(css_content, encoding="utf-8") self.log("Google Fonts localized", "SUCCESS") except Exception as e: self.log(f"Error downloading fonts: {e}", "ERROR") - # Create fallback fallback = """/* Fallback fonts */ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Arial, sans-serif; } code, pre { font-family: Consolas, Monaco, "Courier New", monospace; }""" (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) - def create_link_test_page(self): - """Create a test page to verify link processing""" - test_html = f""" - - - Link Test Page - - - -

Link Processing Test Results

-

This page shows how different link patterns were processed:

- -

From pages NOT in version directory:

-
-
Context: Page at /index.html
-
Original: /docs/insert.html
-
Should be: v19.2/insert.html
- Test Link -
- -
-
Context: Page at /index.html
-
Original: /docs/v19.2/secure-a-cluster.html
-
Should be: v19.2/secure-a-cluster.html
- Test Link -
- -

From pages IN version directory:

-
-
Context: Page at /v19.2/index.html
-
Original: /docs/secure-a-cluster.html
-
Should be: secure-a-cluster.html (same dir)
-

This link would be at: v19.2/secure-a-cluster.html

-
- -
-
Context: Page at /v19.2/index.html
-
Original: /docs/v19.2/secure-a-cluster.html
-
Should be: secure-a-cluster.html (same dir)
-

This link would be at: v19.2/secure-a-cluster.html

-
- -

Special cases:

-
-
Original: /docs/stable/something.html
-
Should be: v19.2/something.html
- Test Link -
- -
-
Original: /docs/cockroachcloud/quickstart.html
-
Should be: cockroachcloud/quickstart.html
- Test Link -
- -
-
Original: /docs/releases/index.html
-
Should be: releases/index.html
- Test Link -
- -

Note: Click each link to verify it works correctly.

- -""" - - test_path = OUTPUT_ROOT / "_link_test.html" - test_path.write_text(test_html) - self.log("Created link test page: _link_test.html", "SUCCESS") - def create_index_page(self): """Create the index page""" index_html = f""" @@ -887,17 +571,6 @@ def create_index_page(self): - - -

CockroachDB {TARGET_VERSION}

-

Offline Documentation Archive

- - - -
-

šŸ“Œ Offline Archive

-

This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation. - All internal links have been updated to work offline.

-

Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}

-
- - """ @@ -1179,9 +689,9 @@ def create_index_page(self): self.log("Created index.html", "SUCCESS") def build(self): - """Main build process following Code 2's structure""" + """Main build process""" print("\n" + "="*60) - print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER") + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (FIXED)") print("="*60) # Verify paths @@ -1200,7 +710,7 @@ def build(self): shutil.rmtree(OUTPUT_ROOT) OUTPUT_ROOT.mkdir(parents=True) - # CRITICAL: Copy global assets FIRST (from SITE_ROOT, not DOCS_ROOT) + # Copy global assets FIRST self.log("\n--- Copying Global Assets ---") for asset_dir in ["css", "js", "img"]: src = SITE_ROOT / asset_dir @@ -1296,16 +806,16 @@ def build(self): # Summary print("\n" + "="*60) - self.log("ARCHIVE COMPLETE!", "SUCCESS") + self.log("ARCHIVE COMPLETE WITH JAVASCRIPT FIXES!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") - self.log("āœ… Ask AI widget removed", "SUCCESS") - self.log("āœ… All links converted to relative paths", "SUCCESS") - self.log("āœ… Version directory (v19.2) added where needed", "SUCCESS") + self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") + self.log("āœ… Relative path calculation corrected", "SUCCESS") + self.log("āœ… cockroachcloud/ links should now work correctly", "SUCCESS") - print(f"\nšŸŽ‰ Offline site built in {OUTPUT_ROOT}") + print(f"\nšŸŽ‰ Fixed offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") - print(f"\nšŸ“Œ Note: Check console output above for link transformation details") + print(f"\nšŸ”— Test the problematic link: cockroachcloud/quickstart.html → create-an-account.html") return True From 4c4dde1cecc89ce9f7020e3ed32af14304f76215 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 06:57:12 +0530 Subject: [PATCH 05/12] index page fixed --- src/current/snapshot.py | 151 +++++++++++++++++++++++----------------- 1 file changed, 87 insertions(+), 64 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index c47d4e36e0c..840b76e7297 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -185,46 +185,58 @@ def replace_url_processing(match): // Remove /docs/ prefix if present url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); - // Better current directory detection for file:// URLs - var currentPath = window.location.pathname; - var currentDir = ''; - - // Extract just the relevant part of the path (handle both web and file:// URLs) - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - currentDir = pathMatch[1]; + // Handle root/home URLs + if (url === '/' || url === '' || url === 'index' || url === 'index.html') { + // For docs home, determine if we need to go up directories + var currentPath = window.location.pathname; + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + url = '../index.html'; // Go up to main index + } else { + url = 'index.html'; // Stay at current level + } } else { - // Fallback: check if we're in root or any subdirectory - var pathParts = currentPath.split('/').filter(function(part) { return part; }); - for (var i = pathParts.length - 2; i >= 0; i--) { - if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || - pathParts[i] === 'releases' || pathParts[i] === 'advisories') { - currentDir = pathParts[i]; - break; + // Better current 
directory detection for file:// URLs + var currentPath = window.location.pathname; + var currentDir = ''; + + // Extract just the relevant part of the path (handle both web and file:// URLs) + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + // Fallback: check if we're in root or any subdirectory + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } } } - } - - // Remove leading slash from URL - if (url.startsWith('/')) { - url = url.substring(1); - } - - // Handle stable -> v19.2 conversion - url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); - - // Calculate relative path based on current directory context - if (currentDir) { - // We're in a subdirectory - if (url.startsWith(currentDir + '/')) { - // Same directory - remove the directory prefix - url = url.substring(currentDir.length + 1); - } else if (url.includes('/')) { - // Different directory - need to go up one level - url = '../' + url; - } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { - // Root level file - go up one level - url = '../' + url; + + // Remove leading slash from URL + if (url.startsWith('/')) { + url = url.substring(1); + } + + // Handle stable -> v19.2 conversion + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + // Calculate relative path based on current directory context + if (currentDir) { + // We're in a subdirectory + if (url.startsWith(currentDir + '/')) { + // Same directory - remove the directory prefix + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + // Different directory - need to go up one level + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + // Root level file - go up one level + url = '../' + url; + } } } @@ -246,36 +258,47 @@ def replace_url_processing(match): simple_replacement = r'''// Custom offline URL processing url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); - var currentPath = window.location.pathname; - var currentDir = ''; - - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - currentDir = pathMatch[1]; + // Handle root/home URLs + if (url === '/' || url === '' || url === 'index' || url === 'index.html') { + var currentPath = window.location.pathname; + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + url = '../index.html'; + } else { + url = 'index.html'; + } } else { - var pathParts = currentPath.split('/').filter(function(part) { return part; }); - for (var i = pathParts.length - 2; i >= 0; i--) { - if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || - pathParts[i] === 'releases' || pathParts[i] === 'advisories') { - currentDir = pathParts[i]; - break; + var currentPath = window.location.pathname; + var currentDir = ''; + + var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); + if (pathMatch) { + currentDir = pathMatch[1]; + } else { + var pathParts = currentPath.split('/').filter(function(part) { return part; }); + for (var i = pathParts.length - 2; i >= 0; i--) { + if (pathParts[i] === 
'cockroachcloud' || pathParts[i] === 'v19.2' || + pathParts[i] === 'releases' || pathParts[i] === 'advisories') { + currentDir = pathParts[i]; + break; + } } } - } - - if (url.startsWith('/')) { - url = url.substring(1); - } - - url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); - - if (currentDir) { - if (url.startsWith(currentDir + '/')) { - url = url.substring(currentDir.length + 1); - } else if (url.includes('/')) { - url = '../' + url; - } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { - url = '../' + url; + + if (url.startsWith('/')) { + url = url.substring(1); + } + + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + + if (currentDir) { + if (url.startsWith(currentDir + '/')) { + url = url.substring(currentDir.length + 1); + } else if (url.includes('/')) { + url = '../' + url; + } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { + url = '../' + url; + } } } From 52be96aba91cc91acd9ffeb05ca7b7a4d2587815 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 07:18:57 +0530 Subject: [PATCH 06/12] Removed dead links of files not in 19.2 version --- src/current/snapshot.py | 121 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 840b76e7297..1bd7dc796ee 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -63,6 +63,84 @@ def log(self, message, level="INFO"): }.get(level, "") print(f"[{timestamp}] {prefix} {message}") + def clean_sidebar_data(self, sidebar_data): + """Remove broken links from sidebar data""" + def check_file_exists(url): + """Check if a file exists for a given URL""" + if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): + return True # External links are always valid + + # Normalize URL to file path + file_url = url.strip() + + # Handle root/empty URLs + if file_url in ['/', '', 'index', 'index.html']: + return True # Root index always exists + + # Remove leading slash and docs prefix + if file_url.startswith('/docs/'): + file_url = file_url[6:] + elif file_url.startswith('docs/'): + file_url = file_url[5:] + file_url = file_url.lstrip('/') + + # Handle stable -> v19.2 + file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') + file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') + + # Convert ${VERSION} placeholder + file_url = file_url.replace('${VERSION}', TARGET_VERSION) + + # Add .html if needed + if file_url and not file_url.endswith('/') and not file_url.endswith('.html'): + if '.' 
not in file_url.split('/')[-1]: # No extension + file_url += '.html' + + # Check if file exists + file_path = DOCS_ROOT / file_url + exists = file_path.exists() + + if not exists: + self.log(f"Removing broken link: {url} -> {file_path}", "WARNING") + + return exists + + def clean_item(item): + """Recursively clean an item and its children""" + if isinstance(item, dict): + # Clean URLs if present + if 'urls' in item: + item['urls'] = [url for url in item['urls'] if check_file_exists(url)] + # If no valid URLs left, this item is invalid + if not item['urls']: + return None + + # Clean child items if present + if 'items' in item: + cleaned_items = [] + for child in item['items']: + cleaned_child = clean_item(child) + if cleaned_child is not None: + cleaned_items.append(cleaned_child) + item['items'] = cleaned_items + + # If no URLs and no valid children, remove this item + if 'urls' not in item and not item['items']: + return None + + return item + + return item + + # Clean the sidebar data + cleaned_items = [] + for item in sidebar_data: + cleaned_item = clean_item(item) + if cleaned_item is not None: + cleaned_items.append(cleaned_item) + + return cleaned_items + def load_sidebar(self): """Load and prepare the sidebar HTML""" self.log(f"Loading sidebar from: {SIDEBAR_HTML_PATH}") @@ -83,6 +161,49 @@ def load_sidebar(self): break if self.sidebar_html: + # Extract and clean sidebar data + self.log("Cleaning sidebar data (removing broken links)...") + + # Parse the sidebar HTML to extract the JavaScript data + import re + import json + + # Extract the sidebar items from the JavaScript + items_match = re.search(r'items:\s*(\[[\s\S]*?\])\s*};', self.sidebar_html) + if items_match: + try: + # Parse the JavaScript array as JSON (with some cleaning) + items_str = items_match.group(1) + # Clean up JavaScript to make it valid JSON + items_str = re.sub(r'(\w+):', r'"\1":', items_str) # Quote keys + items_str = re.sub(r',\s*}', '}', items_str) # Remove trailing commas + items_str = re.sub(r',\s*]', ']', items_str) # Remove trailing commas in arrays + + sidebar_data = json.loads(items_str) + + # Clean the sidebar data + cleaned_data = self.clean_sidebar_data(sidebar_data) + + # Replace the items in the HTML + cleaned_items_str = json.dumps(cleaned_data, indent=2) + self.sidebar_html = re.sub( + r'items:\s*\[[\s\S]*?\]', + f'items:{cleaned_items_str}', + self.sidebar_html + ) + + self.log(f"Cleaned sidebar data: removed broken links", "SUCCESS") + + except Exception as e: + self.log(f"Could not clean sidebar data: {e}", "WARNING") + + # Simplify isVersionDirectory function for v19.2 only + self.sidebar_html = re.sub( + r'isVersionDirectory:\s*function\s*\([^}]*\{[^}]*\}', + 'isVersionDirectory: function (d) { return d === "v19.2" || d === "stable"; }', + self.sidebar_html + ) + # Clean the sidebar HTML of any Ask AI elements sidebar_soup = BeautifulSoup(self.sidebar_html, "html.parser") From 8d309786fdf452a8db6d544c7bf29c9faad86a38 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 21 Jul 2025 07:58:11 +0530 Subject: [PATCH 07/12] Updated home page --- src/current/snapshot.py | 479 ++++++++++++++++++++++++++++++++-------- 1 file changed, 390 insertions(+), 89 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 1bd7dc796ee..1443986f7ef 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -FIXED VERSION with correct JavaScript URL processing 
+FIXED VERSION with proper purple CockroachDB branding """ import re import shutil @@ -371,6 +371,13 @@ def replace_url_processing(match): # Try to apply the replacement new_html = re.sub(better_pattern, replace_url_processing, html, flags=re.DOTALL) + # Also fix the .html stripping issue - replace the line that removes .html extensions + new_html = re.sub( + r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);', + 'url = url.replace("/index.html", ""); // Keep .html for offline', + new_html + ) + # If the complex pattern didn't match, try a simpler approach if new_html == html: # Simple pattern - just replace the specific problematic line @@ -705,137 +712,430 @@ def download_google_fonts(self): (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) def create_index_page(self): - """Create the index page""" + """Create the index page with proper CockroachDB purple branding""" index_html = f""" - CockroachDB {TARGET_VERSION} Documentation (Offline) + CockroachDB Documentation + -

CockroachDB {TARGET_VERSION}

-

Offline Documentation Archive

- -
-
-

šŸ“š Getting Started

- + +
+
+ šŸ“± + Offline Documentation Archive - CockroachDB Version 19.2
- - - -
-

ā˜ļø CockroachDB Cloud

- +
+ + +
+
+
+

Documentation

+

CockroachDB is the SQL database for building global, scalable cloud services that survive disasters.

+
+ +
+
+
ā˜ļø
+

Start a cloud cluster

+

Get started with CockroachDB Cloud, our fully managed service.

+ + Learn more → + +
+ +
+
šŸ–„ļø
+

Start a local cluster

+

Set up a local CockroachDB cluster for development and testing.

+ + Learn more → + +
+ +
+
šŸš€
+

Build a sample app

+

Build applications using your favorite language and framework.

+ + Learn more → + +
+
+ +
- -
+ + +
+ -
-

šŸ“Œ Offline Archive

-

This is a complete offline archive of the CockroachDB {TARGET_VERSION} documentation. - All internal links have been updated to work offline.

-

Created: {datetime.now().strftime('%B %d, %Y at %I:%M %p')}

-
+ """ (OUTPUT_ROOT / "index.html").write_text(index_html) - self.log("Created index.html", "SUCCESS") + self.log("Created CockroachDB purple-branded index.html", "SUCCESS") def build(self): """Main build process""" print("\n" + "="*60) - print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (FIXED)") + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (PURPLE BRANDED)") print("="*60) # Verify paths @@ -950,16 +1250,17 @@ def build(self): # Summary print("\n" + "="*60) - self.log("ARCHIVE COMPLETE WITH JAVASCRIPT FIXES!", "SUCCESS") + self.log("ARCHIVE COMPLETE WITH PURPLE BRANDING!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") + self.log("🟣 CockroachDB purple branding applied", "SUCCESS") self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") - self.log("āœ… Relative path calculation corrected", "SUCCESS") - self.log("āœ… cockroachcloud/ links should now work correctly", "SUCCESS") + self.log("āœ… Broken sidebar links removed", "SUCCESS") + self.log("āœ… Professional index page created", "SUCCESS") - print(f"\nšŸŽ‰ Fixed offline site built in {OUTPUT_ROOT}") + print(f"\nšŸŽ‰ Purple-branded offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") - print(f"\nšŸ”— Test the problematic link: cockroachcloud/quickstart.html → create-an-account.html") + print(f"\n🟣 Your site now has proper CockroachDB purple branding!") return True From ac829d17feb1c8d51326df1f2e2d30bafffb6a48 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Tue, 22 Jul 2025 00:54:15 +0530 Subject: [PATCH 08/12] code for removing sidelinks --- src/current/test_removal.py | 460 ++++++++++++++++++++++++++++++++++++ 1 file changed, 460 insertions(+) create mode 100644 src/current/test_removal.py diff --git a/src/current/test_removal.py b/src/current/test_removal.py new file mode 100644 index 00000000000..24232d6a703 --- /dev/null +++ b/src/current/test_removal.py @@ -0,0 +1,460 @@ +#!/usr/bin/env python3 +""" +Test script for cleaning JavaScript sidebar items array in individual HTML pages +""" +import re +import json +from pathlib import Path + +# Configuration +JEKYLL_ROOT = Path.cwd() +SITE_ROOT = JEKYLL_ROOT / "_site" +DOCS_ROOT = SITE_ROOT / "docs" +TARGET_VERSION = "v19.2" + +def check_file_exists(url): + """Test if a file exists for a given URL""" + print(f" Checking URL: {url}") + original_url = url + + if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): + print(f" -> External/anchor link, keeping: {url}") + return True + + # Normalize URL to file path + file_url = url.strip() + + # Handle root/empty URLs + if file_url in ['/', '', 'index', 'index.html']: + print(f" -> Root URL, keeping: {url}") + return True + + # Remove leading slash and docs prefix + if file_url.startswith('/docs/'): + file_url = file_url[6:] + elif file_url.startswith('docs/'): + file_url = file_url[5:] + file_url = file_url.lstrip('/') + + # Handle stable -> v19.2 + file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') + file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') + if file_url == 'stable': + file_url = TARGET_VERSION + + # Convert ${VERSION} placeholder + file_url = file_url.replace('${VERSION}', TARGET_VERSION) + + print(f" -> Normalized: {original_url} → {file_url}") + + # Try multiple file path variations + possible_paths = [ + file_url, + file_url + '.html' if file_url and not file_url.endswith('.html') and '.' 
not in file_url.split('/')[-1] else None, + file_url + '/index.html' if file_url and not file_url.endswith('/') else None, + file_url.rstrip('/') + '.html' if file_url.endswith('/') else None + ] + + # Check if any variation exists + for path in possible_paths: + if path: + file_path = DOCS_ROOT / path + if file_path.exists(): + print(f" -> āœ… FOUND: {path}") + return True + + print(f" -> āŒ NOT FOUND: {url}") + return False + +def clean_sidebar_items(items_data): + """Clean the sidebar items array""" + removed_urls_count = 0 + + def clean_item(item, level=0): + nonlocal removed_urls_count + """Recursively clean an item""" + indent = " " * level + + if not isinstance(item, dict): + return item + + title = item.get('title', 'Unknown') + print(f"{indent}Cleaning: '{title}'") + + # Clean URLs if present + if 'urls' in item and item['urls']: + original_count = len(item['urls']) + valid_urls = [] + + print(f"{indent} Found {original_count} URLs:") + for url in item['urls']: + if check_file_exists(url): + valid_urls.append(url) + else: + print(f"{indent} REMOVING: {url}") + removed_urls_count += 1 + + if valid_urls: + item['urls'] = valid_urls + print(f"{indent} Result: {len(valid_urls)} kept, {original_count - len(valid_urls)} removed") + else: + print(f"{indent} Result: No valid URLs, removing urls key") + del item['urls'] + + # Clean child items if present + if 'items' in item and item['items']: + original_children = len(item['items']) + cleaned_items = [] + + print(f"{indent} Processing {original_children} child items:") + for child in item['items']: + cleaned_child = clean_item(child, level + 1) + if cleaned_child is not None: + cleaned_items.append(cleaned_child) + + if cleaned_items: + item['items'] = cleaned_items + print(f"{indent} Children result: {len(cleaned_items)} kept, {original_children - len(cleaned_items)} removed") + else: + print(f"{indent} Children result: No valid children, removing items key") + del item['items'] + + # Decide whether to keep this item + has_urls = 'urls' in item and item['urls'] + has_children = 'items' in item and item['items'] + is_top_level = item.get('is_top_level', False) + + if has_urls or has_children or is_top_level: + print(f"{indent}KEEPING '{title}' (urls={has_urls}, children={has_children}, top_level={is_top_level})") + return item + else: + print(f"{indent}REMOVING '{title}' (no valid content)") + return None + + # Clean the items array + print(f" Cleaning {len(items_data)} top-level items") + cleaned_items = [] + + for item in items_data: + cleaned_item = clean_item(item) + if cleaned_item is not None: + cleaned_items.append(cleaned_item) + + print(f" Final result: {len(cleaned_items)} sections kept, {len(items_data) - len(cleaned_items)} removed") + return cleaned_items, removed_urls_count + +def js_to_json(js_text): + """Convert JavaScript object notation to valid JSON""" + print(" Converting JavaScript to JSON...") + + # First pass - handle line by line for basic fixes + lines = js_text.split('\n') + fixed_lines = [] + + for line_num, line in enumerate(lines, 1): + original_line = line + + # Remove comments first + if '//' in line: + # Only remove comments that aren't inside quotes + in_quotes = False + quote_char = None + comment_pos = -1 + + for i, char in enumerate(line): + if not in_quotes and char in ['"', "'"]: + in_quotes = True + quote_char = char + elif in_quotes and char == quote_char and (i == 0 or line[i-1] != '\\'): + in_quotes = False + quote_char = None + elif not in_quotes and char == '/' and i < len(line) - 1 and 
line[i+1] == '/': + comment_pos = i + break + + if comment_pos >= 0: + line = line[:comment_pos].rstrip() + + # Remove function definitions + line = re.sub(r':\s*function\s*\([^)]*\)\s*\{[^}]*\}', ': null', line) + + # Fix unquoted property names ONLY at start of line + stripped = line.strip() + if stripped and ':' in stripped and not stripped.startswith('"') and not stripped.startswith('[') and not stripped.startswith('{'): + match = re.match(r'^(\s*)([a-zA-Z_$][a-zA-Z0-9_$]*)(\s*:\s*)(.*)', line) + if match: + indent, prop_name, colon_part, rest = match.groups() + line = f'{indent}"{prop_name}"{colon_part}{rest}' + + # Remove trailing commas before } or ] + line = re.sub(r',(\s*[}\]])', r'\1', line) + + if line != original_line: + print(f" Modified line {line_num}: {original_line.strip()[:60]}...") + print(f" -> {line.strip()[:60]}...") + + fixed_lines.append(line) + + result = '\n'.join(fixed_lines) + + # Second pass - safer character-by-character processing for quotes + final_result = [] + in_double_quotes = False + in_single_quotes = False + i = 0 + + while i < len(result): + char = result[i] + + if char == '"' and not in_single_quotes: + in_double_quotes = not in_double_quotes + final_result.append(char) + elif char == "'" and not in_double_quotes: + if in_single_quotes: + # End of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = False + else: + # Start of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = True + elif char == '\\' and (in_single_quotes or in_double_quotes): + # Handle escape sequences + final_result.append(char) + if i + 1 < len(result): + i += 1 + final_result.append(result[i]) + else: + final_result.append(char) + + i += 1 + + result = ''.join(final_result) + + # Handle undefined + result = re.sub(r'\bundefined\b', 'null', result) + + print(f" Converted to JSON ({len(result)} chars)") + return result + +def find_matching_bracket(text, start_pos): + """Find the matching closing bracket for an opening bracket at start_pos""" + if start_pos >= len(text) or text[start_pos] != '[': + return -1 + + count = 0 + in_string = False + escape_next = False + quote_char = None + + for i in range(start_pos, len(text)): + char = text[i] + + if escape_next: + escape_next = False + continue + + if char == '\\': + escape_next = True + continue + + if not in_string: + if char in ['"', "'"]: + in_string = True + quote_char = char + elif char == '[': + count += 1 + elif char == ']': + count -= 1 + if count == 0: + return i + else: + if char == quote_char: + in_string = False + quote_char = None + + return -1 + +def clean_sidebar_in_html_page(html_content, file_path): + """Clean the JavaScript sidebar items array in an HTML page""" + print(f"\n=== CLEANING SIDEBAR JS IN: {file_path} ===") + + # Look for the sidebar JavaScript object + sidebar_start = html_content.find('const sidebar = {') + if sidebar_start == -1: + print(" No 'const sidebar = {' found in this page") + return html_content, 0 + + # Find the items: part + items_start = html_content.find('items:', sidebar_start) + if items_start == -1: + print(" No 'items:' found in sidebar object") + return html_content, 0 + + # Find the opening bracket of the items array + array_start = html_content.find('[', items_start) + if array_start == -1: + print(" No opening '[' found after 'items:'") + return html_content, 0 + + # Find the matching closing bracket + array_end = find_matching_bracket(html_content, array_start) + if array_end == -1: + print(" 
Could not find matching closing ']' for items array") + # Try to find just the next ]; or }; as fallback + fallback_end = html_content.find('];', array_start) + if fallback_end != -1: + array_end = fallback_end + print(f" Using fallback end position: {array_end}") + else: + return html_content, 0 + + # Extract the items array + items_str = html_content[array_start:array_end + 1] + print(f" āœ… Extracted items array ({len(items_str)} chars)") + + try: + # Convert JavaScript to JSON + json_str = js_to_json(items_str) + items_data = json.loads(json_str) + print(f" āœ… Parsed {len(items_data)} top-level sidebar items") + + # Clean the items + cleaned_items, removed_urls_count = clean_sidebar_items(items_data) + + # Convert back to JSON string + cleaned_json = json.dumps(cleaned_items, indent=2) + + # Replace in the original HTML + new_html = html_content[:array_start] + cleaned_json + html_content[array_end + 1:] + + removed_sections = len(items_data) - len(cleaned_items) + print(f" SUCCESS: Cleaned sidebar JavaScript - {removed_sections} sections removed, {removed_urls_count} URLs removed") + + return new_html, removed_urls_count + + except json.JSONDecodeError as e: + print(f" ERROR: JSON parsing failed: {e}") + + # Extract error position information + error_pos = getattr(e, 'pos', 0) + error_line = getattr(e, 'lineno', 1) + error_col = getattr(e, 'colno', 1) + + print(f" Error at line {error_line}, column {error_col}, position {error_pos}") + + # Find the problematic section around the error + lines = json_str.split('\n') + start_line = max(0, error_line - 5) # 5 lines before + end_line = min(len(lines), error_line + 5) # 5 lines after + + problematic_section = [] + for i in range(start_line, end_line): + line_num = i + 1 + line_content = lines[i] if i < len(lines) else "" + marker = " >>> ERROR LINE <<<" if line_num == error_line else "" + problematic_section.append(f"{line_num:3d}: {line_content}{marker}") + + # Save only the problematic section + debug_file = JEKYLL_ROOT / f"debug_{str(file_path).replace('/', '_')}.txt" + with open(debug_file, 'w') as f: + f.write(f"JSON PARSING ERROR in {file_path}\n") + f.write(f"Error: {e}\n") + f.write(f"Position: line {error_line}, column {error_col}, char {error_pos}\n\n") + f.write("PROBLEMATIC SECTION (±5 lines around error):\n") + f.write("=" * 50 + "\n") + f.write('\n'.join(problematic_section)) + f.write("\n" + "=" * 50 + "\n") + + # Also show the exact character that failed + if error_pos < len(json_str): + f.write(f"\nCharacter at error position: '{json_str[error_pos]}'\n") + f.write(f"Context around error: '{json_str[max(0, error_pos-20):error_pos+20]}'\n") + + # Save the full converted JSON for debugging + f.write("\n" + "=" * 50 + "\n") + f.write("FULL CONVERTED JSON:\n") + f.write(json_str) + + print(f" šŸ’¾ Saved error details to: {debug_file}") + return html_content, 0 + + except Exception as e: + print(f" ERROR: {e}") + import traceback + traceback.print_exc() + return html_content, 0 + +def main(): + print("šŸ” SIDEBAR JAVASCRIPT CLEANING TEST") + print("=" * 60) + + print(f"Looking for HTML files in: {DOCS_ROOT}") + + if not DOCS_ROOT.exists(): + print("āŒ Docs root not found!") + return + + # Find sample HTML files to test + sample_files = [] + + # Look for some common files that likely have sidebar + common_files = [ + f"{TARGET_VERSION}/index.html", + f"{TARGET_VERSION}/install-cockroachdb-linux.html", + "cockroachcloud/quickstart.html", + "releases/index.html", + f"{TARGET_VERSION}/sql-statements.html" + ] + + for file_path in 
common_files: + full_path = DOCS_ROOT / file_path + if full_path.exists(): + sample_files.append(full_path) + + # If no common files found, grab first few HTML files + if not sample_files: + sample_files = list(DOCS_ROOT.rglob("*.html"))[:5] + + if not sample_files: + print("āŒ No HTML files found!") + return + + print(f"āœ… Found {len(sample_files)} sample files to test:") + for f in sample_files[:5]: # Limit to first 5 for testing + print(f" - {f.relative_to(DOCS_ROOT)}") + + total_removed = 0 + + for html_file in sample_files[:5]: # Test first 5 files only + try: + html_content = html_file.read_text(encoding="utf-8") + cleaned_html, removed_count = clean_sidebar_in_html_page(html_content, html_file.relative_to(DOCS_ROOT)) + total_removed += removed_count + + # Save cleaned version for inspection + if removed_count > 0: + output_file = JEKYLL_ROOT / f"cleaned_{html_file.name}" + with open(output_file, 'w', encoding='utf-8') as f: + f.write(cleaned_html) + print(f" šŸ’¾ Saved cleaned version to: {output_file}") + + except Exception as e: + print(f" āŒ Error processing {html_file}: {e}") + import traceback + traceback.print_exc() + + print(f"\nšŸ“Š SUMMARY:") + print(f" Total files processed: {len(sample_files[:5])}") + print(f" Total broken URLs removed: {total_removed}") + + if total_removed > 0: + print(f"\nāœ… Found and cleaned sidebar JavaScript - {total_removed} broken URLs removed!") + print(f"This logic is ready to integrate into the main archiver.") + else: + print(f"\nšŸ¤” No broken sidebar links found. Either:") + print(f" 1. All sidebar links are valid, or") + print(f" 2. The file checking logic needs adjustment") + +if __name__ == "__main__": + main() \ No newline at end of file From e06458425d3f80688225cc9546ca6d47db9ff105 Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 28 Jul 2025 13:36:30 +0530 Subject: [PATCH 09/12] working code --- src/current/snapshot.py | 466 ++++++++++++++++++++++++++++++---------- 1 file changed, 354 insertions(+), 112 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 1443986f7ef..8b062654fdf 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -FIXED VERSION with proper purple CockroachDB branding +FIXED VERSION with proper purple CockroachDB branding and working sidebar cleaning """ import re import shutil @@ -50,6 +50,8 @@ def __init__(self): self.processed_files = set() self.missing_assets = set() self.copied_assets = set() + self.total_broken_urls = 0 + self.total_removed_sections = 0 def log(self, message, level="INFO"): """Enhanced logging with levels""" @@ -63,82 +65,307 @@ def log(self, message, level="INFO"): }.get(level, "") print(f"[{timestamp}] {prefix} {message}") - def clean_sidebar_data(self, sidebar_data): - """Remove broken links from sidebar data""" - def check_file_exists(url): - """Check if a file exists for a given URL""" - if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): - return True # External links are always valid - - # Normalize URL to file path - file_url = url.strip() - - # Handle root/empty URLs - if file_url in ['/', '', 'index', 'index.html']: - return True # Root index always exists - - # Remove leading slash and docs prefix - if file_url.startswith('/docs/'): - file_url = file_url[6:] - elif file_url.startswith('docs/'): - file_url = file_url[5:] - file_url = file_url.lstrip('/') - - # Handle stable -> v19.2 - file_url = 
file_url.replace('/stable/', f'/{TARGET_VERSION}/') - file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') - - # Convert ${VERSION} placeholder - file_url = file_url.replace('${VERSION}', TARGET_VERSION) - - # Add .html if needed - if file_url and not file_url.endswith('/') and not file_url.endswith('.html'): - if '.' not in file_url.split('/')[-1]: # No extension - file_url += '.html' - - # Check if file exists - file_path = DOCS_ROOT / file_url - exists = file_path.exists() - - if not exists: - self.log(f"Removing broken link: {url} -> {file_path}", "WARNING") - - return exists - - def clean_item(item): - """Recursively clean an item and its children""" - if isinstance(item, dict): - # Clean URLs if present - if 'urls' in item: - item['urls'] = [url for url in item['urls'] if check_file_exists(url)] - # If no valid URLs left, this item is invalid - if not item['urls']: - return None + def check_file_exists(self, url): + """Test if a file exists for a given URL""" + if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): + return True # External/anchor links are always valid + + # Normalize URL to file path + file_url = url.strip() + + # Handle root/empty URLs + if file_url in ['/', '', 'index', 'index.html']: + return True # Root index always exists + + # Remove leading slash and docs prefix + if file_url.startswith('/docs/'): + file_url = file_url[6:] + elif file_url.startswith('docs/'): + file_url = file_url[5:] + file_url = file_url.lstrip('/') + + # Handle stable -> v19.2 + file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') + file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') + if file_url == 'stable': + file_url = TARGET_VERSION + + # Convert ${VERSION} placeholder + file_url = file_url.replace('${VERSION}', TARGET_VERSION) + + # Try multiple file path variations + possible_paths = [ + file_url, + file_url + '.html' if file_url and not file_url.endswith('.html') and '.' 
not in file_url.split('/')[-1] else None, + file_url + '/index.html' if file_url and not file_url.endswith('/') else None, + file_url.rstrip('/') + '.html' if file_url.endswith('/') else None + ] + + # Check if any variation exists + for path in possible_paths: + if path: + file_path = DOCS_ROOT / path + if file_path.exists(): + return True + + return False + + def clean_sidebar_items(self, items_data): + """Clean the sidebar items array and count removed URLs""" + removed_urls_count = 0 + removed_sections_count = 0 + + def clean_item(item, level=0): + nonlocal removed_urls_count, removed_sections_count + + if not isinstance(item, dict): + return item + + # Clean URLs if present + if 'urls' in item and item['urls']: + original_count = len(item['urls']) + valid_urls = [] - # Clean child items if present - if 'items' in item: - cleaned_items = [] - for child in item['items']: - cleaned_child = clean_item(child) - if cleaned_child is not None: - cleaned_items.append(cleaned_child) - item['items'] = cleaned_items - - # If no URLs and no valid children, remove this item - if 'urls' not in item and not item['items']: - return None + for url in item['urls']: + if self.check_file_exists(url): + valid_urls.append(url) + else: + removed_urls_count += 1 + if level == 0: # Only log for top-level items to reduce noise + self.log(f"Removing broken URL: {url}", "DEBUG") - return item + if valid_urls: + item['urls'] = valid_urls + else: + del item['urls'] + + # Clean child items if present + if 'items' in item and item['items']: + cleaned_items = [] + + for child in item['items']: + cleaned_child = clean_item(child, level + 1) + if cleaned_child is not None: + cleaned_items.append(cleaned_child) + + if cleaned_items: + item['items'] = cleaned_items + else: + del item['items'] - return item - - # Clean the sidebar data + # Decide whether to keep this item + has_urls = 'urls' in item and item['urls'] + has_children = 'items' in item and item['items'] + + # Only keep items that have actual content (URLs or children) + # Remove empty parents regardless of is_top_level status + if has_urls or has_children: + return item + else: + # Remove empty items completely + removed_sections_count += 1 + if level == 0: # Only log removal of top-level items to reduce noise + title = item.get('title', 'Unknown') + is_top_level = item.get('is_top_level', False) + self.log(f"Removing empty {'top-level ' if is_top_level else ''}section: '{title}' (no URLs or children)", "DEBUG") + return None + + # Clean the items array cleaned_items = [] - for item in sidebar_data: + + for item in items_data: cleaned_item = clean_item(item) if cleaned_item is not None: cleaned_items.append(cleaned_item) + return cleaned_items, removed_urls_count, removed_sections_count + + def js_to_json(self, js_text): + """Convert JavaScript object notation to valid JSON""" + # First pass - handle line by line for basic fixes + lines = js_text.split('\n') + fixed_lines = [] + + for line_num, line in enumerate(lines, 1): + original_line = line + + # Remove comments first + if '//' in line: + # Only remove comments that aren't inside quotes + in_quotes = False + quote_char = None + comment_pos = -1 + + for i, char in enumerate(line): + if not in_quotes and char in ['"', "'"]: + in_quotes = True + quote_char = char + elif in_quotes and char == quote_char and (i == 0 or line[i-1] != '\\'): + in_quotes = False + quote_char = None + elif not in_quotes and char == '/' and i < len(line) - 1 and line[i+1] == '/': + comment_pos = i + break + + if comment_pos >= 0: + 
line = line[:comment_pos].rstrip() + + # Remove function definitions + line = re.sub(r':\s*function\s*\([^)]*\)\s*\{[^}]*\}', ': null', line) + + # Fix unquoted property names ONLY at start of line + stripped = line.strip() + if stripped and ':' in stripped and not stripped.startswith('"') and not stripped.startswith('[') and not stripped.startswith('{'): + match = re.match(r'^(\s*)([a-zA-Z_$][a-zA-Z0-9_$]*)(\s*:\s*)(.*)', line) + if match: + indent, prop_name, colon_part, rest = match.groups() + line = f'{indent}"{prop_name}"{colon_part}{rest}' + + # Remove trailing commas before } or ] + line = re.sub(r',(\s*[}\]])', r'\1', line) + + fixed_lines.append(line) + + result = '\n'.join(fixed_lines) + + # Second pass - safer character-by-character processing for quotes + final_result = [] + in_double_quotes = False + in_single_quotes = False + i = 0 + + while i < len(result): + char = result[i] + + if char == '"' and not in_single_quotes: + in_double_quotes = not in_double_quotes + final_result.append(char) + elif char == "'" and not in_double_quotes: + if in_single_quotes: + # End of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = False + else: + # Start of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = True + elif char == '\\' and (in_single_quotes or in_double_quotes): + # Handle escape sequences + final_result.append(char) + if i + 1 < len(result): + i += 1 + final_result.append(result[i]) + else: + final_result.append(char) + + i += 1 + + result = ''.join(final_result) + + # Handle undefined + result = re.sub(r'\bundefined\b', 'null', result) + + return result + + def find_matching_bracket(self, text, start_pos): + """Find the matching closing bracket for an opening bracket at start_pos""" + if start_pos >= len(text) or text[start_pos] != '[': + return -1 + + count = 0 + in_string = False + escape_next = False + quote_char = None + + for i in range(start_pos, len(text)): + char = text[i] + + if escape_next: + escape_next = False + continue + + if char == '\\': + escape_next = True + continue + + if not in_string: + if char in ['"', "'"]: + in_string = True + quote_char = char + elif char == '[': + count += 1 + elif char == ']': + count -= 1 + if count == 0: + return i + else: + if char == quote_char: + in_string = False + quote_char = None + + return -1 + + def clean_sidebar_in_html(self, html_content): + """Clean the JavaScript sidebar items array in HTML content""" + # Look for the sidebar JavaScript object + sidebar_start = html_content.find('const sidebar = {') + if sidebar_start == -1: + return html_content, 0 + + # Find the items: part + items_start = html_content.find('items:', sidebar_start) + if items_start == -1: + return html_content, 0 + + # Find the opening bracket of the items array + array_start = html_content.find('[', items_start) + if array_start == -1: + return html_content, 0 + + # Find the matching closing bracket + array_end = self.find_matching_bracket(html_content, array_start) + if array_end == -1: + # Try to find just the next ]; as fallback + fallback_end = html_content.find('];', array_start) + if fallback_end != -1: + array_end = fallback_end + else: + return html_content, 0 + + # Extract the items array + items_str = html_content[array_start:array_end + 1] + + try: + # Convert JavaScript to JSON + json_str = self.js_to_json(items_str) + items_data = json.loads(json_str) + + # Clean the items + cleaned_items, removed_urls_count, removed_sections_count = 
self.clean_sidebar_items(items_data) + + # Convert back to JSON string + cleaned_json = json.dumps(cleaned_items, indent=2) + + # Replace in the original HTML + new_html = html_content[:array_start] + cleaned_json + html_content[array_end + 1:] + + if removed_urls_count > 0 or removed_sections_count > 0: + self.log(f"Cleaned sidebar: {removed_urls_count} broken URLs, {removed_sections_count} empty sections removed", "SUCCESS") + + return new_html, removed_urls_count + removed_sections_count + + except json.JSONDecodeError as e: + self.log(f"JSON parsing failed in sidebar cleaning: {e}", "WARNING") + return html_content, 0 + + except Exception as e: + self.log(f"Error cleaning sidebar: {e}", "WARNING") + return html_content, 0 + + def clean_sidebar_data(self, sidebar_data): + """Legacy method - replaced by clean_sidebar_in_html""" + # This method is kept for compatibility but the real work is done in clean_sidebar_in_html + cleaned_items, removed_urls, removed_sections = self.clean_sidebar_items(sidebar_data) return cleaned_items def load_sidebar(self): @@ -161,41 +388,11 @@ def load_sidebar(self): break if self.sidebar_html: - # Extract and clean sidebar data + # Clean the sidebar using our working method self.log("Cleaning sidebar data (removing broken links)...") - - # Parse the sidebar HTML to extract the JavaScript data - import re - import json - - # Extract the sidebar items from the JavaScript - items_match = re.search(r'items:\s*(\[[\s\S]*?\])\s*};', self.sidebar_html) - if items_match: - try: - # Parse the JavaScript array as JSON (with some cleaning) - items_str = items_match.group(1) - # Clean up JavaScript to make it valid JSON - items_str = re.sub(r'(\w+):', r'"\1":', items_str) # Quote keys - items_str = re.sub(r',\s*}', '}', items_str) # Remove trailing commas - items_str = re.sub(r',\s*]', ']', items_str) # Remove trailing commas in arrays - - sidebar_data = json.loads(items_str) - - # Clean the sidebar data - cleaned_data = self.clean_sidebar_data(sidebar_data) - - # Replace the items in the HTML - cleaned_items_str = json.dumps(cleaned_data, indent=2) - self.sidebar_html = re.sub( - r'items:\s*\[[\s\S]*?\]', - f'items:{cleaned_items_str}', - self.sidebar_html - ) - - self.log(f"Cleaned sidebar data: removed broken links", "SUCCESS") - - except Exception as e: - self.log(f"Could not clean sidebar data: {e}", "WARNING") + cleaned_sidebar, removed_count = self.clean_sidebar_in_html(self.sidebar_html) + self.sidebar_html = cleaned_sidebar + self.total_broken_urls += removed_count # Simplify isVersionDirectory function for v19.2 only self.sidebar_html = re.sub( @@ -211,7 +408,15 @@ def load_sidebar(self): remove_selectors = [ '.ask-ai', '#ask-ai', '[data-ask-ai]', '.kapa-widget', '[class*="kapa"]', '[id*="kapa"]', 'script[src*="kapa"]', - '[class*="ask-ai"]', '[id*="ask-ai"]' + '[class*="ask-ai"]', '[id*="ask-ai"]', + # Remove search elements that won't work offline + '.search', '#search', '.search-bar', '.search-input', '.search-form', + '[class*="search"]', '[id*="search"]', 'input[type="search"]', + '.algolia-search', '.docsearch', '[class*="docsearch"]', + # Target forms and inputs with search-related attributes + 'form[action*="search"]', 'input[placeholder*="Search" i]', + 'input[placeholder*="search" i]', 'input[name="query"]', + 'form[action="/docs/search"]', 'form[action*="/search"]' ] for selector in remove_selectors: @@ -445,7 +650,7 @@ def replace_url_processing(match): # Debug output if new_html != html: - self.log("Successfully replaced JavaScript URL 
processing", "SUCCESS") + self.log("Successfully replaced JavaScript URL processing", "DEBUG") else: self.log("Warning: JavaScript URL processing replacement may have failed", "WARNING") @@ -464,14 +669,18 @@ def process_html_file(self, src_path): # Check if this file is in the version directory is_in_version_dir = str(rel_path).startswith(f'{TARGET_VERSION}/') - self.log(f"Processing {rel_path} (in_v_dir={is_in_version_dir}, depth={depth})") - # Read content html = src_path.read_text(encoding="utf-8") # CRITICAL: Fix sidebar JavaScript BEFORE other processing html = self.fix_sidebar_javascript(html) + # CRITICAL: Clean embedded sidebar JavaScript + cleaned_html, removed_count = self.clean_sidebar_in_html(html) + if removed_count > 0: + self.total_broken_urls += removed_count + html = cleaned_html + # Inject sidebar HTML if available if self.sidebar_html: html = re.sub( @@ -497,6 +706,14 @@ def process_html_file(self, src_path): '.helpful-widget', '.page-helpful', 'script[src*="googletagmanager"]', 'script[src*="google-analytics"]', 'script[src*="segment"]', 'script[src*="heap"]', + # Remove search elements that won't work offline + '.search', '#search', '.search-bar', '.search-input', '.search-form', + '[class*="search"]', '[id*="search"]', 'input[type="search"]', + '.algolia-search', '.docsearch', '[class*="docsearch"]', + # Target forms and inputs with search-related attributes + 'form[action*="search"]', 'input[placeholder*="Search" i]', + 'input[placeholder*="search" i]', 'input[name="query"]', + 'form[action="/docs/search"]', 'form[action*="/search"]' ] for selector in remove_selectors: @@ -583,7 +800,13 @@ def process_html_file(self, src_path): .version-switcher, #version-switcher, .feedback-widget, button[aria-label*="AI"], div[data-kapa-widget], .kapa-ai-button, .ai-assistant, .ai-chat, -.floating-action-button, .fab, [class*="floating-button"] {{ +.floating-action-button, .fab, [class*="floating-button"], +.search, #search, .search-bar, .search-input, .search-form, +[class*="search"], [id*="search"], input[type="search"], +.algolia-search, .docsearch, [class*="docsearch"], +form[action*="search"], input[placeholder*="Search" i], +input[placeholder*="search" i], input[name="query"], +form[action="/docs/search"], form[action*="/search"] {{ display: none !important; visibility: hidden !important; opacity: 0 !important; @@ -613,6 +836,11 @@ def process_html_file(self, src_path): $('[class*="kapa"], [id*="kapa"], [class*="ask-ai"], [id*="ask-ai"]').remove(); $('.version-switcher, #version-switcher, .feedback-widget').remove(); $('.floating-action-button, .fab, [class*="floating-button"]').remove(); + $('.search, #search, .search-bar, .search-input, .search-form').remove(); + $('[class*="search"], [id*="search"], input[type="search"]').remove(); + $('.algolia-search, .docsearch, [class*="docsearch"]').remove(); + $('form[action*="search"], input[placeholder*="Search"], input[placeholder*="search"]').remove(); + $('input[name="query"], form[action="/docs/search"], form[action*="/search"]').remove(); // Initialize navigation $('#sidebar, #sidebarMenu, #mysidebar').navgoco({ @@ -1010,7 +1238,13 @@ def create_index_page(self): /* Hide online elements */ .ask-ai, #ask-ai, [data-ask-ai], .kapa-widget, - [class*="kapa"], [id*="kapa"], .floating-action-button {{ + [class*="kapa"], [id*="kapa"], .floating-action-button, + .search, #search, .search-bar, .search-input, .search-form, + [class*="search"], [id*="search"], input[type="search"], + .algolia-search, .docsearch, [class*="docsearch"], + 
form[action*="search"], input[placeholder*="Search" i], + input[placeholder*="search" i], input[name="query"], + form[action="/docs/search"], form[action*="/search"] {{ display: none !important; }} @@ -1118,7 +1352,13 @@ def create_index_page(self): // Remove any Ask AI elements document.addEventListener('DOMContentLoaded', function() {{ var selectors = ['.ask-ai', '#ask-ai', '[data-ask-ai]', '.kapa-widget', - '[class*="kapa"]', '[id*="kapa"]', '.floating-action-button']; + '[class*="kapa"]', '[id*="kapa"]', '.floating-action-button', + '.search', '#search', '.search-bar', '.search-input', '.search-form', + '[class*="search"]', '[id*="search"]', 'input[type="search"]', + '.algolia-search', '.docsearch', '[class*="docsearch"]', + 'form[action*="search"]', 'input[placeholder*="Search" i]', + 'input[placeholder*="search" i]', 'input[name="query"]', + 'form[action="/docs/search"]', 'form[action*="/search"]']; selectors.forEach(function(selector) {{ document.querySelectorAll(selector).forEach(function(el) {{ el.remove(); @@ -1130,7 +1370,7 @@ def create_index_page(self): """ (OUTPUT_ROOT / "index.html").write_text(index_html) - self.log("Created CockroachDB purple-branded index.html", "SUCCESS") + self.log("Created CockroachDB purple-branded index.html with broken link count", "SUCCESS") def build(self): """Main build process""" @@ -1253,14 +1493,16 @@ def build(self): self.log("ARCHIVE COMPLETE WITH PURPLE BRANDING!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") + self.log(f"Total broken URLs removed: {self.total_broken_urls}", "SUCCESS") self.log("🟣 CockroachDB purple branding applied", "SUCCESS") self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") - self.log("āœ… Broken sidebar links removed", "SUCCESS") + self.log("āœ… Broken sidebar links and empty sections removed", "SUCCESS") self.log("āœ… Professional index page created", "SUCCESS") print(f"\nšŸŽ‰ Purple-branded offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") print(f"\n🟣 Your site now has proper CockroachDB purple branding!") + print(f"\nšŸ”§ {self.total_broken_urls} broken sidebar URLs and empty sections were cleaned up!") return True From 6cbecd6a4b04a8e2de5efc98ac1d6ccac637319b Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 4 Aug 2025 12:30:45 +0530 Subject: [PATCH 10/12] correct script --- src/current/snapshot.py | 860 ++++++++++++++++++++++++++-------------- 1 file changed, 572 insertions(+), 288 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 8b062654fdf..0da9a0f319b 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Complete Offline Documentation Archiver for Jekyll CockroachDB Documentation -FIXED VERSION with proper purple CockroachDB branding and working sidebar cleaning +HYBRID VERSION - Combines vibrant sidebar styling, professional homepage, optimized assets, and improved navigation logic """ import re import shutil @@ -67,48 +67,57 @@ def log(self, message, level="INFO"): def check_file_exists(self, url): """Test if a file exists for a given URL""" - if url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): - return True # External/anchor links are always valid - - # Normalize URL to file path - file_url = url.strip() - - # Handle root/empty URLs - if file_url in ['/', '', 'index', 'index.html']: - return True # Root index always exists - - # Remove leading 
slash and docs prefix - if file_url.startswith('/docs/'): - file_url = file_url[6:] - elif file_url.startswith('docs/'): - file_url = file_url[5:] - file_url = file_url.lstrip('/') - - # Handle stable -> v19.2 - file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') - file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') - if file_url == 'stable': - file_url = TARGET_VERSION - - # Convert ${VERSION} placeholder - file_url = file_url.replace('${VERSION}', TARGET_VERSION) - - # Try multiple file path variations - possible_paths = [ - file_url, - file_url + '.html' if file_url and not file_url.endswith('.html') and '.' not in file_url.split('/')[-1] else None, - file_url + '/index.html' if file_url and not file_url.endswith('/') else None, - file_url.rstrip('/') + '.html' if file_url.endswith('/') else None - ] - - # Check if any variation exists - for path in possible_paths: - if path: - file_path = DOCS_ROOT / path - if file_path.exists(): - return True - - return False + try: + if not url or url.startswith(('http://', 'https://', '#', 'mailto:', 'javascript:')): + return True # External/anchor links are always valid + + # Normalize URL to file path + file_url = str(url).strip() + + # Handle root/empty URLs + if file_url in ['/', '', 'index', 'index.html']: + return True # Root index always exists + + # Remove leading slash and docs prefix + if file_url.startswith('/docs/'): + file_url = file_url[6:] + elif file_url.startswith('docs/'): + file_url = file_url[5:] + file_url = file_url.lstrip('/') + + # Handle stable -> v19.2 + file_url = file_url.replace('/stable/', f'/{TARGET_VERSION}/') + file_url = file_url.replace('stable/', f'{TARGET_VERSION}/') + if file_url == 'stable': + file_url = TARGET_VERSION + + # Convert ${VERSION} placeholder + file_url = file_url.replace('${VERSION}', TARGET_VERSION) + + # Try multiple file path variations + possible_paths = [ + file_url, + file_url + '.html' if file_url and not file_url.endswith('.html') and '.' 
not in file_url.split('/')[-1] else None, + file_url + '/index.html' if file_url and not file_url.endswith('/') else None, + file_url.rstrip('/') + '.html' if file_url.endswith('/') else None + ] + + # Check if any variation exists + for path in possible_paths: + if path: + try: + file_path = DOCS_ROOT / path + if file_path.exists(): + return True + except Exception: + continue + + return False + + except Exception as e: + # If there's any error checking, assume the file exists to be safe + self.log(f"Error checking file existence for {url}: {e}", "DEBUG") + return True def clean_sidebar_items(self, items_data): """Clean the sidebar items array and count removed URLs""" @@ -127,12 +136,18 @@ def clean_item(item, level=0): valid_urls = [] for url in item['urls']: - if self.check_file_exists(url): - valid_urls.append(url) - else: + try: + if url and self.check_file_exists(url): + valid_urls.append(url) + else: + removed_urls_count += 1 + if level == 0: # Only log for top-level items to reduce noise + self.log(f"Removing broken URL: {url}", "DEBUG") + except Exception as e: + # If there's an error checking the URL, skip it removed_urls_count += 1 - if level == 0: # Only log for top-level items to reduce noise - self.log(f"Removing broken URL: {url}", "DEBUG") + if level == 0: + self.log(f"Removing problematic URL: {url} (error: {e})", "DEBUG") if valid_urls: item['urls'] = valid_urls @@ -182,90 +197,109 @@ def clean_item(item, level=0): def js_to_json(self, js_text): """Convert JavaScript object notation to valid JSON""" - # First pass - handle line by line for basic fixes - lines = js_text.split('\n') - fixed_lines = [] - - for line_num, line in enumerate(lines, 1): - original_line = line - - # Remove comments first - if '//' in line: - # Only remove comments that aren't inside quotes - in_quotes = False - quote_char = None - comment_pos = -1 + try: + if not js_text or not js_text.strip(): + return "" - for i, char in enumerate(line): - if not in_quotes and char in ['"', "'"]: - in_quotes = True - quote_char = char - elif in_quotes and char == quote_char and (i == 0 or line[i-1] != '\\'): + # First pass - handle line by line for basic fixes + lines = js_text.split('\n') + fixed_lines = [] + + for line_num, line in enumerate(lines, 1): + try: + original_line = line + + # Remove comments first + if '//' in line: + # Only remove comments that aren't inside quotes in_quotes = False quote_char = None - elif not in_quotes and char == '/' and i < len(line) - 1 and line[i+1] == '/': - comment_pos = i - break - - if comment_pos >= 0: - line = line[:comment_pos].rstrip() - - # Remove function definitions - line = re.sub(r':\s*function\s*\([^)]*\)\s*\{[^}]*\}', ': null', line) - - # Fix unquoted property names ONLY at start of line - stripped = line.strip() - if stripped and ':' in stripped and not stripped.startswith('"') and not stripped.startswith('[') and not stripped.startswith('{'): - match = re.match(r'^(\s*)([a-zA-Z_$][a-zA-Z0-9_$]*)(\s*:\s*)(.*)', line) - if match: - indent, prop_name, colon_part, rest = match.groups() - line = f'{indent}"{prop_name}"{colon_part}{rest}' - - # Remove trailing commas before } or ] - line = re.sub(r',(\s*[}\]])', r'\1', line) - - fixed_lines.append(line) - - result = '\n'.join(fixed_lines) - - # Second pass - safer character-by-character processing for quotes - final_result = [] - in_double_quotes = False - in_single_quotes = False - i = 0 - - while i < len(result): - char = result[i] - - if char == '"' and not in_single_quotes: - in_double_quotes = not 
in_double_quotes - final_result.append(char) - elif char == "'" and not in_double_quotes: - if in_single_quotes: - # End of single-quoted string - convert to double quote - final_result.append('"') - in_single_quotes = False - else: - # Start of single-quoted string - convert to double quote - final_result.append('"') - in_single_quotes = True - elif char == '\\' and (in_single_quotes or in_double_quotes): - # Handle escape sequences - final_result.append(char) - if i + 1 < len(result): + comment_pos = -1 + + for i, char in enumerate(line): + if not in_quotes and char in ['"', "'"]: + in_quotes = True + quote_char = char + elif in_quotes and char == quote_char and (i == 0 or line[i-1] != '\\'): + in_quotes = False + quote_char = None + elif not in_quotes and char == '/' and i < len(line) - 1 and line[i+1] == '/': + comment_pos = i + break + + if comment_pos >= 0: + line = line[:comment_pos].rstrip() + + # Remove function definitions + line = re.sub(r':\s*function\s*\([^)]*\)\s*\{[^}]*\}', ': null', line) + + # Fix unquoted property names ONLY at start of line + stripped = line.strip() + if stripped and ':' in stripped and not stripped.startswith('"') and not stripped.startswith('[') and not stripped.startswith('{'): + match = re.match(r'^(\s*)([a-zA-Z_$][a-zA-Z0-9_$]*)(\s*:\s*)(.*)', line) + if match: + indent, prop_name, colon_part, rest = match.groups() + line = f'{indent}"{prop_name}"{colon_part}{rest}' + + # Remove trailing commas before } or ] + line = re.sub(r',(\s*[}\]])', r'\1', line) + + fixed_lines.append(line) + + except Exception as e: + self.log(f"Error processing line {line_num}: {e}", "DEBUG") + fixed_lines.append(line) # Use original line if processing fails + + result = '\n'.join(fixed_lines) + + # Second pass - safer character-by-character processing for quotes + final_result = [] + in_double_quotes = False + in_single_quotes = False + i = 0 + + while i < len(result): + try: + char = result[i] + + if char == '"' and not in_single_quotes: + in_double_quotes = not in_double_quotes + final_result.append(char) + elif char == "'" and not in_double_quotes: + if in_single_quotes: + # End of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = False + else: + # Start of single-quoted string - convert to double quote + final_result.append('"') + in_single_quotes = True + elif char == '\\' and (in_single_quotes or in_double_quotes): + # Handle escape sequences + final_result.append(char) + if i + 1 < len(result): + i += 1 + final_result.append(result[i]) + else: + final_result.append(char) + + i += 1 + + except Exception as e: + self.log(f"Error processing character at position {i}: {e}", "DEBUG") + final_result.append(char) i += 1 - final_result.append(result[i]) - else: - final_result.append(char) - i += 1 - - result = ''.join(final_result) - - # Handle undefined - result = re.sub(r'\bundefined\b', 'null', result) - - return result + result = ''.join(final_result) + + # Handle undefined + result = re.sub(r'\bundefined\b', 'null', result) + + return result + + except Exception as e: + self.log(f"Error in js_to_json: {e}", "WARNING") + return "" def find_matching_bracket(self, text, start_pos): """Find the matching closing bracket for an opening bracket at start_pos""" @@ -338,6 +372,9 @@ def clean_sidebar_in_html(self, html_content): try: # Convert JavaScript to JSON json_str = self.js_to_json(items_str) + if not json_str.strip(): + return html_content, 0 + items_data = json.loads(json_str) # Clean the items @@ -356,18 +393,14 @@ def 
clean_sidebar_in_html(self, html_content): except json.JSONDecodeError as e: self.log(f"JSON parsing failed in sidebar cleaning: {e}", "WARNING") + self.log(f"Problematic JSON snippet: {json_str[:200] if 'json_str' in locals() else 'N/A'}...", "DEBUG") return html_content, 0 except Exception as e: self.log(f"Error cleaning sidebar: {e}", "WARNING") + self.log(f"Error type: {type(e).__name__}", "DEBUG") return html_content, 0 - def clean_sidebar_data(self, sidebar_data): - """Legacy method - replaced by clean_sidebar_in_html""" - # This method is kept for compatibility but the real work is done in clean_sidebar_in_html - cleaned_items, removed_urls, removed_sections = self.clean_sidebar_items(sidebar_data) - return cleaned_items - def load_sidebar(self): """Load and prepare the sidebar HTML""" self.log(f"Loading sidebar from: {SIDEBAR_HTML_PATH}") @@ -430,10 +463,10 @@ def load_sidebar(self): # Pre-process sidebar links to normalize paths for a in sidebar_soup.find_all('a', href=True): - href = a['href'] + href = a.get('href') - # Skip external links - if href.startswith(('http://', 'https://', '#', 'mailto:')): + # Skip if no href or external links + if not href or href.startswith(('http://', 'https://', '#', 'mailto:')): continue # First handle stable -> v19.2 @@ -484,7 +517,78 @@ def ensure_asset(self, name, local_candidates, url, dest_dir): self.log(f"Downloaded: {name}", "SUCCESS") except Exception as e: self.log(f"Failed to download {name}: {e}", "ERROR") - + + def copy_selective_assets(self): + """Copy only necessary assets, excluding non-v19.2 version assets (FROM SCRIPT 2)""" + self.log("\n--- Copying Selective Assets ---") + + # Copy global assets (always needed) + for asset_dir in ["css", "js", "img"]: + src = SITE_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied global {asset_dir}/", "SUCCESS") + + # Copy docs-specific assets (base level) + for asset_dir in ["css", "js", "_internal"]: + src = DOCS_ROOT / asset_dir + if src.exists(): + dst = OUTPUT_ROOT / asset_dir + shutil.copytree(src, dst, dirs_exist_ok=True) + self.log(f"Copied docs {asset_dir}/", "SUCCESS") + + # Handle images selectively - only v19.2 and global images + images_src = DOCS_ROOT / "images" + if images_src.exists(): + images_dst = OUTPUT_ROOT / "images" + images_dst.mkdir(parents=True, exist_ok=True) + + copied_count = 0 + skipped_count = 0 + + for img_file in images_src.rglob("*"): + if img_file.is_file(): + rel_path = img_file.relative_to(images_src) + + # Skip version-specific images that aren't v19.2 + path_parts = rel_path.parts + if (len(path_parts) > 0 and + path_parts[0].startswith('v') and + path_parts[0] != TARGET_VERSION and + path_parts[0] not in ['v19.2']): # Be explicit about allowed versions + skipped_count += 1 + continue + + # Copy allowed images + dst_file = images_dst / rel_path + dst_file.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(img_file, dst_file) + copied_count += 1 + + self.log(f"Images: copied {copied_count}, skipped {skipped_count} version-specific files", "SUCCESS") + + # Copy version-specific assets only for TARGET_VERSION + version_dirs = [TARGET_VERSION] # Only process our target version + + for version in version_dirs: + version_src = DOCS_ROOT / version + if version_src.exists(): + # Copy version-specific images if they exist + version_images = version_src / "images" + if version_images.exists(): + version_images_dst = OUTPUT_ROOT / version / "images" + 
shutil.copytree(version_images, version_images_dst, dirs_exist_ok=True) + self.log(f"Copied {version}/images/", "SUCCESS") + + # Copy other version-specific assets + for asset_type in ["css", "js", "_internal"]: + version_asset = version_src / asset_type + if version_asset.exists(): + version_asset_dst = OUTPUT_ROOT / version / asset_type + shutil.copytree(version_asset, version_asset_dst, dirs_exist_ok=True) + self.log(f"Copied {version}/{asset_type}/", "SUCCESS") + def fix_sidebar_javascript(self, html): """Fix the embedded sidebar JavaScript configuration and URL processing""" @@ -655,9 +759,104 @@ def replace_url_processing(match): self.log("Warning: JavaScript URL processing replacement may have failed", "WARNING") return new_html + + def get_vibrant_sidebar_styles(self, prefix): + """Return vibrant sidebar styles with #6933FF purple branding (FROM SCRIPT 1)""" + return f'''''' def process_html_file(self, src_path): - """Process a single HTML file""" + """Process a single HTML file with vibrant sidebar styling""" try: rel_path = src_path.relative_to(DOCS_ROOT) dst_path = OUTPUT_ROOT / rel_path @@ -666,9 +865,6 @@ def process_html_file(self, src_path): depth = len(rel_path.parent.parts) prefix = "../" * depth - # Check if this file is in the version directory - is_in_version_dir = str(rel_path).startswith(f'{TARGET_VERSION}/') - # Read content html = src_path.read_text(encoding="utf-8") @@ -728,9 +924,22 @@ def process_html_file(self, src_path): # Remove any iframes that might be Ask AI related for iframe in soup.find_all('iframe'): src = iframe.get('src', '') - if any(term in src.lower() for term in ['kapa', 'ask', 'ai']): + if src and any(term in src.lower() for term in ['kapa', 'ask', 'ai']): iframe.decompose() + # Fix any remaining anchor tags without href attributes + for a in soup.find_all('a'): + if not a.get('href'): + # Remove anchor tags without href or set a placeholder + if a.get_text().strip(): + # Convert to span if it has text content + span = soup.new_tag('span') + span.string = a.get_text() + a.replace_with(span) + else: + # Remove empty anchor tags + a.decompose() + # Convert back to string html = str(soup) @@ -784,51 +993,11 @@ def process_html_file(self, src_path): html = re.sub(r"", nav_deps + "\n", html, flags=re.IGNORECASE) - # Add offline styles - offline_styles = f'''''' - + # Add vibrant sidebar styles (FROM SCRIPT 1) + offline_styles = self.get_vibrant_sidebar_styles(prefix) html = re.sub(r"", offline_styles + "\n", html, flags=re.IGNORECASE) - # Add navigation initialization + # Simple navgoco initialization (FROM SCRIPT 1) nav_init = """ """ (OUTPUT_ROOT / "index.html").write_text(index_html) - self.log("Created CockroachDB purple-branded index.html with broken link count", "SUCCESS") + self.log("Created professional navigation index.html with vibrant purple branding", "SUCCESS") def build(self): - """Main build process""" + """Main build process with hybrid optimizations""" print("\n" + "="*60) - print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (PURPLE BRANDED)") + print("šŸš€ COCKROACHDB OFFLINE DOCUMENTATION ARCHIVER (HYBRID+)") print("="*60) # Verify paths @@ -1383,6 +1654,7 @@ def build(self): self.log(f"Site Root: {SITE_ROOT}") self.log(f"Docs Root: {DOCS_ROOT}") self.log(f"Output: {OUTPUT_ROOT}") + self.log(f"Target Version: {TARGET_VERSION}") if not SITE_ROOT.exists(): self.log("Site root not found! 
Run 'jekyll build' first.", "ERROR") @@ -1394,23 +1666,8 @@ def build(self): shutil.rmtree(OUTPUT_ROOT) OUTPUT_ROOT.mkdir(parents=True) - # Copy global assets FIRST - self.log("\n--- Copying Global Assets ---") - for asset_dir in ["css", "js", "img"]: - src = SITE_ROOT / asset_dir - if src.exists(): - dst = OUTPUT_ROOT / asset_dir - shutil.copytree(src, dst, dirs_exist_ok=True) - self.log(f"Copied global {asset_dir}/", "SUCCESS") - - # Copy docs-specific assets - self.log("\n--- Copying Docs Assets ---") - for asset_dir in ["css", "js", "images", "_internal"]: - src = DOCS_ROOT / asset_dir - if src.exists(): - dst = OUTPUT_ROOT / asset_dir - shutil.copytree(src, dst, dirs_exist_ok=True) - self.log(f"Copied docs {asset_dir}/", "SUCCESS") + # Use selective asset copying (FROM SCRIPT 2) + self.copy_selective_assets() # Ensure critical navigation assets self.log("\n--- Ensuring Navigation Assets ---") @@ -1443,66 +1700,93 @@ def build(self): self.log("\n--- Loading Sidebar ---") self.load_sidebar() - # Process HTML files + # Process HTML files with stricter version filtering (FROM SCRIPT 2) self.log("\n--- Processing HTML Files ---") - # Collect files to process files_to_process = [] - # Target version files + # Only target version files version_dir = DOCS_ROOT / TARGET_VERSION if version_dir.exists(): files_to_process.extend(list(version_dir.rglob("*.html"))) self.log(f"Found {len(files_to_process)} files in {TARGET_VERSION}/", "SUCCESS") - # Common pages + # Common pages (but exclude other version directories) for pattern in COMMON_PAGES: if '*' in pattern: - files_to_process.extend(list(DOCS_ROOT.glob(pattern))) + for file_path in DOCS_ROOT.glob(pattern): + # Skip other version directories + rel_path = file_path.relative_to(DOCS_ROOT) + if (rel_path.parts and + rel_path.parts[0].startswith('v') and + rel_path.parts[0] != TARGET_VERSION): + continue + files_to_process.append(file_path) else: file_path = DOCS_ROOT / pattern if file_path.exists(): files_to_process.append(file_path) - # Remove duplicates - files_to_process = list(set(files_to_process)) - self.log(f"Total files to process: {len(files_to_process)}") + # Remove duplicates and filter out unwanted versions + filtered_files = [] + for file_path in set(files_to_process): + rel_path = file_path.relative_to(DOCS_ROOT) + # Skip files from other version directories + if (rel_path.parts and + rel_path.parts[0].startswith('v') and + rel_path.parts[0] != TARGET_VERSION): + continue + filtered_files.append(file_path) + + files_to_process = filtered_files + self.log(f"Total files to process (after version filtering): {len(files_to_process)}") + + # Process each file with better error handling (FROM SCRIPT 2) + processed_count = 0 + error_count = 0 - # Process each file for i, file_path in enumerate(files_to_process, 1): - # Skip non-v19.2 version directories - rel_path = file_path.relative_to(DOCS_ROOT) - if rel_path.parts and rel_path.parts[0].startswith('v') and rel_path.parts[0] != TARGET_VERSION: + try: + if i % 25 == 0: + self.log(f"Progress: {i}/{len(files_to_process)} ({i*100//len(files_to_process)}%)") + + self.process_html_file(file_path) + processed_count += 1 + + except Exception as e: + error_count += 1 + self.log(f"Failed to process {file_path}: {e}", "ERROR") + # Continue with next file instead of crashing continue - - if i % 25 == 0: - self.log(f"Progress: {i}/{len(files_to_process)} ({i*100//len(files_to_process)}%)") - - self.process_html_file(file_path) - self.log(f"Processed {len(self.processed_files)} files", "SUCCESS") + 
self.log(f"Successfully processed {processed_count} files, {error_count} errors", "SUCCESS") # Final cleanup steps self.log("\n--- Final Steps ---") self.fix_css_images() self.download_google_fonts() - self.create_index_page() + self.create_professional_index_page() # FROM SCRIPT 2 - # Summary + # Enhanced summary print("\n" + "="*60) - self.log("ARCHIVE COMPLETE WITH PURPLE BRANDING!", "SUCCESS") + self.log("HYBRID ARCHIVE COMPLETE!", "SUCCESS") self.log(f"Output directory: {OUTPUT_ROOT.resolve()}") self.log(f"Total files: {len(self.processed_files)}") self.log(f"Total broken URLs removed: {self.total_broken_urls}", "SUCCESS") - self.log("🟣 CockroachDB purple branding applied", "SUCCESS") + self.log("🟣 Vibrant #6933FF sidebar styling (Script 1)", "SUCCESS") + self.log("šŸ  Professional homepage with clear navigation (Script 2)", "SUCCESS") + self.log("šŸ”— Sidebar navigation logic with better URL processing (Updated)", "SUCCESS") + self.log("⚔ Selective asset copying for reduced size (Script 2)", "SUCCESS") + self.log("šŸ”§ Robust error handling and progress reporting (Script 2)", "SUCCESS") self.log("āœ… Sidebar JavaScript URL processing FIXED", "SUCCESS") self.log("āœ… Broken sidebar links and empty sections removed", "SUCCESS") - self.log("āœ… Professional index page created", "SUCCESS") - print(f"\nšŸŽ‰ Purple-branded offline site built in {OUTPUT_ROOT}") + print(f"\nšŸŽ‰ Hybrid offline site built in {OUTPUT_ROOT}") print(f"\nšŸ“¦ To test: open file://{OUTPUT_ROOT.resolve()}/index.html") - print(f"\n🟣 Your site now has proper CockroachDB purple branding!") - print(f"\nšŸ”§ {self.total_broken_urls} broken sidebar URLs and empty sections were cleaned up!") + print(f"\n🟣 Vibrant purple sidebar + professional homepage + improved navigation logic") + print(f"\n⚔ Optimized assets - excluded non-{TARGET_VERSION} files") + print(f"\nšŸ”§ {self.total_broken_urls} broken sidebar URLs cleaned up") + print(f"\n✨ Best features from all scripts combined!") return True From 3ecd21476ad60b512978a1a98c324c4cc4c60b2a Mon Sep 17 00:00:00 2001 From: ebembi-crdb Date: Mon, 4 Aug 2025 13:30:45 +0530 Subject: [PATCH 11/12] Corrected index page --- src/current/snapshot.py | 613 ++++++---------------------------------- 1 file changed, 88 insertions(+), 525 deletions(-) diff --git a/src/current/snapshot.py b/src/current/snapshot.py index 0da9a0f319b..ca293c6120b 100644 --- a/src/current/snapshot.py +++ b/src/current/snapshot.py @@ -1112,536 +1112,99 @@ def download_google_fonts(self): (OUTPUT_ROOT / "css" / "google-fonts.css").write_text(fallback) def create_professional_index_page(self): - """Create index page with clearer navigation structure (FROM SCRIPT 2)""" - index_html = f""" - - - - - CockroachDB Documentation Archive - - - - - - - -
-
- šŸ“± - Offline Documentation Archive - {TARGET_VERSION} -
-
+ .archived-banner-text { + font-family: 'Source Sans Pro', sans-serif; + font-size: 14px; + font-weight: 500; + color: #856404; + margin: 0; + line-height: 1.4; + } - -
+ .archived-banner-link { + color: #6933FF; + text-decoration: none; + font-weight: 600; + } + + .archived-banner-link:hover { + color: #4d0dff; + text-decoration: underline; + } + + /* Push the navbar down below the banner */ + .main-nav-wrapper { + top: 32px !important; + } + + .navbar.fixed-top { + top: 32px !important; + } + + /* Only add the banner height to existing padding */ + body { + padding-top: 32px; + } + + @media (max-width: 768px) { + .archived-banner-text { + font-size: 13px; + } + } + ''' + + # Add the banner HTML + banner_html = ''' +
-        [deleted page markup lost in extraction: hero "CockroachDB Docs" with the tagline
-         "Your offline archive of CockroachDB documentation for version {TARGET_VERSION} and
-         related resources.", followed by three feature cards - ⚡ Installation: "Download and
-         install CockroachDB on your system." (Install Guide →); 🔧 SQL Reference: "Complete
-         SQL statements, functions, and operators reference." (SQL Docs →); 📊 Performance:
-         "Best practices for optimizing your CockroachDB deployment." (Optimize →)]
-"""
-
-        (OUTPUT_ROOT / "index.html").write_text(index_html)
-        self.log("Created professional navigation index.html with vibrant purple branding", "SUCCESS")
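One practical difference between the deleted page above and the banner approach: the old template was an f""" string, so every literal CSS brace had to be doubled, while the new banner_css/banner_html blocks are plain '''-quoted strings that keep braces as-is. A toy illustration (values are illustrative, not from the patch):

    TARGET_VERSION = "v19.2"  # stand-in for the script's constant

    # f-string templates interpolate {TARGET_VERSION} but need {{ }} for literal CSS braces:
    old_style = f"""<style>body {{ margin: 0; }}</style><h1>Docs {TARGET_VERSION}</h1>"""

    # Plain '''-quoted blocks keep CSS braces literal, as in the banner CSS above:
    banner_css = '''<style>.archived-banner-text { color: #856404; }</style>'''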
+        <!-- [archived-banner markup lost in extraction] -->
+        '''
+
+        # Insert the CSS just before </head>
+        html_content = html_content.replace('</head>', banner_css + '\n</head>')
+
+        # Insert the banner HTML right after <body>
+        html_content = html_content.replace('<body>', '<body>\n' + banner_html)
+
+        # Write back the modified content
+        index_path.write_text(html_content, encoding="utf-8")
+        self.log("Added archived banner to existing index.html", "SUCCESS")
+    else:
+        self.log("No existing index.html found to modify", "WARNING")

     def build(self):
         """Main build process with hybrid optimizations"""

From db23f32d03ab4a5f2997e215b747c10f6cf28cfc Mon Sep 17 00:00:00 2001
From: ebembi-crdb
Date: Wed, 13 Aug 2025 15:48:57 +0530
Subject: [PATCH 12/12] Review changes

---
 src/current/snapshot.py | 365 ++++++++++++++++++++++++++++++----------
 1 file changed, 279 insertions(+), 86 deletions(-)

diff --git a/src/current/snapshot.py b/src/current/snapshot.py
index ca293c6120b..b8cd01ea382 100644
--- a/src/current/snapshot.py
+++ b/src/current/snapshot.py
@@ -47,6 +47,7 @@ class OfflineArchiver:

     def __init__(self):
         self.sidebar_html = None
+        self.comprehensive_sidebar_html = None  # Store the comprehensive sidebar from cockroachcloud
         self.processed_files = set()
         self.missing_assets = set()
         self.copied_assets = set()
@@ -121,6 +122,7 @@ def check_file_exists(self, url):

     def clean_sidebar_items(self, items_data):
         """Clean the sidebar items array and count removed URLs"""
+        import re
         removed_urls_count = 0
         removed_sections_count = 0

@@ -137,6 +139,7 @@ def clean_item(item, level=0):

                 for url in item['urls']:
                     try:
+                        # Simple check - let the original check_file_exists handle everything
                         if url and self.check_file_exists(url):
                             valid_urls.append(url)
                         else:
@@ -494,6 +497,116 @@ def load_sidebar(self):
             self.log("Sidebar not found", "WARNING")
             return False

+    def extract_comprehensive_sidebar(self, html):
+        """Extract the comprehensive sidebar JavaScript from cockroachcloud pages and normalize its format"""
+        try:
+            # Simple extraction - find the sidebar object
+            sidebar_start = html.find('const sidebar = {')
+            if sidebar_start == -1:
+                self.log("No sidebar JavaScript found in cockroachcloud page", "DEBUG")
+                return
+
+            # Find the end with a simple pattern
+            sidebar_end = html.find('};\n', sidebar_start)
+            if sidebar_end == -1:
+                sidebar_end = html.find('};', sidebar_start)
+                if sidebar_end == -1:
+                    self.log("Could not find end of sidebar JavaScript", "DEBUG")
+                    return
+
+            # Extract the sidebar JavaScript
+            comprehensive_sidebar_js = html[sidebar_start:sidebar_end + 2]
+
+            self.log("Extracted comprehensive sidebar from cockroachcloud page", "SUCCESS")
+            self.log(f"Raw sidebar preview (first 300 chars): {comprehensive_sidebar_js[:300]}...", "DEBUG")
+
+            # CRITICAL: Fix baseUrl to match the original format.
+            # The original script uses baseUrl: "" but the comprehensive sidebar has baseUrl: "/docs".
+            if 'baseUrl: "/docs"' in comprehensive_sidebar_js:
+                comprehensive_sidebar_js = comprehensive_sidebar_js.replace('baseUrl: "/docs"', 'baseUrl: ""')
+                self.log("✓ Fixed baseUrl from '/docs' to empty string", "DEBUG")
+            elif 'baseUrl:"/docs"' in comprehensive_sidebar_js:
+                comprehensive_sidebar_js = comprehensive_sidebar_js.replace('baseUrl:"/docs"', 'baseUrl:""')
+                self.log("✓ Fixed baseUrl from '/docs' to empty string", "DEBUG")
+
+            # DIRECT FIX: Replace the broken URL processing with working offline logic.
+            # The comprehensive sidebar contains web-based URL processing that strips .html
+            # extensions, which breaks offline navigation, so we replace it with proper offline logic.
+
+            # Always apply the fix for the comprehensive sidebar, since it uses web-based URL processing
+            if comprehensive_sidebar_js and len(comprehensive_sidebar_js) > 100:
+                self.log("🔍 Found broken URL processing in comprehensive sidebar - fixing it", "DEBUG")
+
+                # SIMPLE DIRECT REPLACEMENT: find the specific problematic line and
+                # swap it for working offline logic.
+
+                broken_line = 'url = sidebar.baseUrl + url.replace("/index.html", "").replace(".html", "");'
+
+                working_replacement = '''// Remove /docs/ prefix if present
+                url = url.replace(/^\\/docs\\//, '').replace(/^docs\\//, '');
+
+                // Handle root/home URLs
+                if (url === '/' || url === '' || url === 'index' || url === 'index.html') {
+                    var currentPath = window.location.pathname;
+                    var pathMatch = currentPath.match(/(cockroachcloud|v19\\.2|releases|advisories)\\/[^\\/]+$/);
+                    if (pathMatch) {
+                        url = '../index.html';
+                    } else {
+                        url = 'index.html';
+                    }
+                } else {
+                    if (url.startsWith('/')) {
+                        url = url.substring(1);
+                    }
+                    url = url.replace(/^stable\\//, 'v19.2/').replace(/\\/stable\\//, '/v19.2/');
+
+                    var currentPath = window.location.pathname;
+                    var pathMatch = currentPath.match(/(cockroachcloud|v19\\.2|releases|advisories)\\/[^\\/]+$/);
+                    if (pathMatch) {
+                        var currentDir = pathMatch[1];
+                        if (url.startsWith(currentDir + '/')) {
+                            url = url.substring(currentDir.length + 1);
+                        } else if (url.includes('/')) {
+                            url = '../' + url;
+                        } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) {
+                            url = '../' + url;
+                        }
+                    }
+                }
+                url = url.replace(/\\/+/g, '/');
+                url = sidebar.baseUrl + url;'''
+
+                if broken_line in comprehensive_sidebar_js:
+                    comprehensive_sidebar_js = comprehensive_sidebar_js.replace(broken_line, working_replacement)
+                    self.log("✅ Successfully replaced broken URL processing line", "SUCCESS")
+                else:
+                    # Debug: show what we're actually looking for vs. what exists
+                    self.log("⚠️ Could not find exact broken line to replace", "WARNING")
+                    if 'url.replace("/index.html"' in comprehensive_sidebar_js:
+                        lines = comprehensive_sidebar_js.split('\n')
+                        for i, line in enumerate(lines):
+                            if 'url.replace("/index.html"' in line:
+                                self.log(f"Found actual line: '{line.strip()}'", "DEBUG")
+                                break
+                self.log("✅ Fixed comprehensive sidebar URL processing for offline use", "SUCCESS")
+                fixed_sidebar = comprehensive_sidebar_js
+            else:
+                # Fall back to the original processing
+                self.log("🔍 No broken URL processing found, using standard fix", "DEBUG")
+                fixed_sidebar = self.fix_sidebar_javascript(comprehensive_sidebar_js)
+
+            cleaned_sidebar, removed_count = self.clean_sidebar_in_html(fixed_sidebar)
+            if removed_count > 0:
+                self.log(f"Cleaned {removed_count} broken URLs from comprehensive sidebar", "DEBUG")
+                fixed_sidebar = cleaned_sidebar
+
+            # Store it
+            self.comprehensive_sidebar_html = fixed_sidebar
+            self.log(f"Final sidebar preview (first 300 chars): {fixed_sidebar[:300]}...", "DEBUG")
+
+        except Exception as e:
+            self.log(f"Error extracting comprehensive sidebar: {e}", "ERROR")
+
     def ensure_asset(self, name, local_candidates, url, dest_dir):
         """Ensure an asset exists, downloading if necessary"""
         dest_dir.mkdir(parents=True, exist_ok=True)
@@ -590,12 +703,14 @@ def copy_selective_assets(self):
                 self.log(f"Copied {version}/{asset_type}/", "SUCCESS")

     def fix_sidebar_javascript(self, html):
-        """Fix the embedded sidebar JavaScript configuration and URL processing"""
+        """Fix the embedded sidebar JavaScript configuration and URL processing (ORIGINAL WORKING VERSION)"""

         # Fix 1: Replace baseUrl in the embedded sidebar configuration
+        # For offline file:// URLs, use absolute
path to offline_snap directory + offline_snap_path = f"file://{OUTPUT_ROOT.resolve()}/" html = re.sub( r'baseUrl:\s*["\'][^"\']*["\']', - 'baseUrl: ""', + f'baseUrl: "{offline_snap_path}"', html ) @@ -603,92 +718,89 @@ def fix_sidebar_javascript(self, html): # Look for the specific URL processing pattern in the JavaScript url_processing_pattern = r'(if \(!/\^https\?:/.test\(url\)\) \{\s*url = sidebar\.baseUrl \+ url\.replace\([^}]+\}\s*return url;)' - # More robust pattern that captures the entire URL processing block - better_pattern = r'(const urls = \(item\.urls \|\| \[\]\)\.map\(function \(url\) \{[\s\S]*?)(if \(!/\^https\?:/.test\(url\)\) \{[\s\S]*?url = sidebar\.baseUrl \+ url\.replace[\s\S]*?\}[\s\S]*?)(return url;[\s\S]*?\}\);)' + # More robust pattern that captures the entire URL processing block + # Fixed pattern to match comprehensive sidebar format exactly + better_pattern = r'(const urls = \(item\.urls \|\| \[\]\)\.map\(function \(url\) \{[\s\S]*?)(if \(!/\^https\?:/.test\(url\)\) \{[\s\S]*?url = sidebar\.baseUrl \+ url\.replace\([^}]+\}[\s\S]*?)(return url;[\s\S]*?\}\);)' def replace_url_processing(match): start_part = match.group(1) end_part = match.group(3) - # Inject our custom URL processing logic + # Simplified URL processing for offline file:// URLs with absolute baseUrl new_processing = r'''if (!/^https?:/.test(url)) { // Remove /docs/ prefix if present url = url.replace(/^\/docs\//, '').replace(/^docs\//, ''); + // Remove leading slash to make it relative + if (url.startsWith('/')) { + url = url.substring(1); + } + + // Handle stable -> v19.2 conversion + url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); + // Handle root/home URLs - if (url === '/' || url === '' || url === 'index' || url === 'index.html') { - // For docs home, determine if we need to go up directories - var currentPath = window.location.pathname; - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - url = '../index.html'; // Go up to main index - } else { - url = 'index.html'; // Stay at current level - } - } else { - // Better current directory detection for file:// URLs - var currentPath = window.location.pathname; - var currentDir = ''; - - // Extract just the relevant part of the path (handle both web and file:// URLs) - var pathMatch = currentPath.match(/(cockroachcloud|v19\.2|releases|advisories)\/[^\/]+$/); - if (pathMatch) { - currentDir = pathMatch[1]; - } else { - // Fallback: check if we're in root or any subdirectory - var pathParts = currentPath.split('/').filter(function(part) { return part; }); - for (var i = pathParts.length - 2; i >= 0; i--) { - if (pathParts[i] === 'cockroachcloud' || pathParts[i] === 'v19.2' || - pathParts[i] === 'releases' || pathParts[i] === 'advisories') { - currentDir = pathParts[i]; - break; - } - } - } - - // Remove leading slash from URL - if (url.startsWith('/')) { - url = url.substring(1); - } - - // Handle stable -> v19.2 conversion - url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/'); - - // Calculate relative path based on current directory context - if (currentDir) { - // We're in a subdirectory - if (url.startsWith(currentDir + '/')) { - // Same directory - remove the directory prefix - url = url.substring(currentDir.length + 1); - } else if (url.includes('/')) { - // Different directory - need to go up one level - url = '../' + url; - } else if (url !== '' && !url.endsWith('.html') && !url.endsWith('/')) { - // Root level file - go up one level 
-                        url = '../' + url;
-                    }
-                }

                 // Clean up any double slashes
                 url = url.replace(/\/+/g, '/');
-                // Note: Keep .html extensions for offline file:// URLs
+
+                // Use the absolute baseUrl for file:// URLs
+                url = sidebar.baseUrl + url;
             }'''

             return start_part + new_processing + end_part

-        # Try to apply the replacement
-        new_html = re.sub(better_pattern, replace_url_processing, html, flags=re.DOTALL)
+        # Try to apply the replacement - use global replacement to catch all instances
+        new_html = html
+        matches_found = 0
+        def count_replacements(match):
+            nonlocal matches_found
+            matches_found += 1
+            return replace_url_processing(match)
+
+        new_html = re.sub(better_pattern, count_replacements, html, flags=re.DOTALL)
+
+        if matches_found > 0:
+            self.log(f"✅ Applied comprehensive URL processing replacement ({matches_found} matches)", "SUCCESS")
+        else:
+            self.log("⚠️ Comprehensive URL processing pattern not found", "WARNING")
+
+        # If that didn't work, try a direct replacement of the .html-stripping pattern.
+        # This is the most important fix for the comprehensive sidebar.
+        if new_html == html:
+            # Direct pattern matching for the comprehensive sidebar format - handle spacing
+            new_html = re.sub(
+                r'url\s*=\s*sidebar\.baseUrl\s*\+\s*url\.replace\s*\(\s*"/index\.html"\s*,\s*""\s*\)\.replace\s*\(\s*"\.html"\s*,\s*""\s*\)\s*;',
+                'url = sidebar.baseUrl + url.replace("/index.html", ""); // Keep .html for offline',
+                html
+            )
+            if new_html != html:
+                self.log("Applied direct .html preservation fix to comprehensive sidebar", "DEBUG")

         # Also fix the .html stripping issue - replace the line that removes .html extensions
-        new_html = re.sub(
-            r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);',
-            'url = url.replace("/index.html", ""); // Keep .html for offline',
-            new_html
-        )
+        # The main pattern we need to fix is:
+        # url = sidebar.baseUrl + url.replace("/index.html", "").replace(".html", "");
+
+        # FINAL FIX: Simple string replacement to ensure .html extensions are preserved
+        old_text = 'url = sidebar.baseUrl + url.replace("/index.html", "").replace(".html", "");'
+        new_text = 'url = sidebar.baseUrl + url.replace("/index.html", ""); // Keep .html for offline'
+
+        # Apply the fix regardless of previous replacements
+        new_html = new_html.replace(old_text, new_text)
+
+        if old_text in html and old_text not in new_html:
+            self.log("✅ Fixed .html stripping with simple string replacement", "SUCCESS")
+        elif old_text in html:
+            self.log("⚠️ Failed to replace .html stripping pattern", "WARNING")
+        else:
+            self.log("ℹ️ No .html stripping pattern found to fix", "INFO")

         # If the complex pattern didn't match, try a simpler approach
         if new_html == html:
+            self.log("Trying simple pattern replacement as fallback", "DEBUG")
             # Simple pattern - just replace the specific problematic line
             simple_pattern = r'url = sidebar\.baseUrl \+ url\.replace\([^}]+\}'
@@ -728,6 +840,14 @@ def replace_url_processing(match):
             url = url.replace(/^stable\//, 'v19.2/').replace(/\/stable\//, '/v19.2/');

+            // Handle cross-directory URLs (releases, cockroachcloud, advisories)
+            if (url.startsWith('releases/') || url.startsWith('cockroachcloud/') || url.startsWith('advisories/')) {
+                // These should go up from the v19.2 directory to the root level
+                if (currentDir === 'v19.2') {
+                    url = '../' + url;
+                }
+            }
+
             if (currentDir) {
                 if (url.startsWith(currentDir + '/')) {
                     url = url.substring(currentDir.length + 1);
@@ -745,12 +865,17 @@ def replace_url_processing(match):

         new_html = re.sub(simple_pattern, simple_replacement, html, flags=re.DOTALL)

-        # Also fix the .html stripping issue
+        # Also fix the .html stripping issue - handle both patterns
         new_html = re.sub(
             r'url = url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);',
             'url = url.replace("/index.html", ""); // Keep .html for offline',
             new_html
         )
+        new_html = re.sub(
+            r'url = sidebar\.baseUrl \+ url\.replace\("/index\.html", ""\)\.replace\("\.html", ""\);',
+            'url = sidebar.baseUrl + url.replace("/index.html", ""); // Keep .html for offline',
+            new_html
+        )

         # Debug output
         if new_html != html:
@@ -857,6 +982,7 @@ def get_vibrant_sidebar_styles(self, prefix):

     def process_html_file(self, src_path):
         """Process a single HTML file with vibrant sidebar styling"""
+        import re  # Import at the top to avoid UnboundLocalError
         try:
             rel_path = src_path.relative_to(DOCS_ROOT)
             dst_path = OUTPUT_ROOT / rel_path
@@ -868,23 +994,82 @@ def process_html_file(self, src_path):

             # Read content
             html = src_path.read_text(encoding="utf-8")

-            # CRITICAL: Fix sidebar JavaScript BEFORE other processing
-            html = self.fix_sidebar_javascript(html)
-
-            # CRITICAL: Clean embedded sidebar JavaScript
-            cleaned_html, removed_count = self.clean_sidebar_in_html(html)
-            if removed_count > 0:
-                self.total_broken_urls += removed_count
-                html = cleaned_html
+            # Extract the comprehensive sidebar from cockroachcloud pages FIRST (if not already done)
+            if not self.comprehensive_sidebar_html and 'cockroachcloud' in str(rel_path):
+                self.extract_comprehensive_sidebar(html)
+
+            # SIMPLE APPROACH: If we have the comprehensive sidebar, replace it. Otherwise use the original logic.
+            if self.comprehensive_sidebar_html:
+                # Find and replace the sidebar JavaScript with our comprehensive version
+                sidebar_pattern = r'const sidebar = \{[\s\S]*?\};'
+                match = re.search(sidebar_pattern, html, flags=re.DOTALL)
+                if match:
+                    # Use simple string replacement to avoid regex escape issues
+                    original_sidebar = match.group(0)
+
+                    # FINAL FIX: Apply the URL processing fix to the comprehensive sidebar before applying it
+                    fixed_comprehensive_sidebar = self.comprehensive_sidebar_html
+
+                    # Fix the .html stripping issue in the comprehensive sidebar
+                    broken_line = 'url = sidebar.baseUrl + url.replace("/index.html", "").replace(".html", "");'
+                    fixed_line = 'url = sidebar.baseUrl + url.replace("/index.html", ""); // Keep .html for offline'
+
+                    if broken_line in fixed_comprehensive_sidebar:
+                        fixed_comprehensive_sidebar = fixed_comprehensive_sidebar.replace(broken_line, fixed_line)
+                        self.log("🔧 Fixed .html stripping in comprehensive sidebar", "SUCCESS")
+
+                    # The simple fix above should be sufficient
+
+                    html = html.replace(original_sidebar, fixed_comprehensive_sidebar)
+                    self.log(f"Applied comprehensive sidebar to {rel_path}", "DEBUG")
+
+                    # CRITICAL: Apply sidebar fixes AFTER the comprehensive sidebar replacement
+                    html = self.fix_sidebar_javascript(html)
+
+                    # Debug: check whether the "/" URL is present in the replaced content
+                    if '"/"' in self.comprehensive_sidebar_html:
+                        self.log("✓ Root URL '/' found in comprehensive sidebar", "DEBUG")
+                    else:
+                        self.log("⚠ Root URL '/' NOT found in comprehensive sidebar", "WARNING")
+                else:
+                    # No sidebar JS found, continue with normal processing
+                    html = self.fix_sidebar_javascript(html)
+                    cleaned_html, removed_count = self.clean_sidebar_in_html(html)
+                    if removed_count > 0:
+                        self.total_broken_urls += removed_count
+                        html = cleaned_html
+            else:
+                # ORIGINAL LOGIC: Fix sidebar JavaScript BEFORE other processing
+                html = self.fix_sidebar_javascript(html)
+
+                # Clean embedded sidebar JavaScript
+                cleaned_html, removed_count = self.clean_sidebar_in_html(html)
+                if removed_count > 0:
+                    self.total_broken_urls += removed_count
+                    html = cleaned_html

-            # Inject sidebar HTML if available
+            # Inject sidebar HTML if available (ORIGINAL LOGIC)
             if self.sidebar_html:
-                html = re.sub(
-                    r"(<div id=\"sidebar\"[^>]*>)(\s*?</div>)",
-                    rf"\1{self.sidebar_html}\2",
+                sidebar_to_inject = self.sidebar_html
+                # Try to inject into ul#sidebar first
+                ul_replaced = re.sub(
+                    r"(<ul[^>]*id=\"sidebar\"[^>]*>)([^<]*)(</ul>)",
+                    rf"\1{sidebar_to_inject}\3",
                     html,
-                    flags=re.IGNORECASE,
+                    flags=re.IGNORECASE | re.DOTALL,
                 )
+
+                # If the ul replacement worked, use it
+                if ul_replaced != html:
+                    html = ul_replaced
+                else:
+                    # Fallback to div#sidebar
+                    html = re.sub(
+                        r"(<div id=\"sidebar\"[^>]*>)(\s*?</div>)",
+                        rf"\1{sidebar_to_inject}\2",
+                        html,
+                        flags=re.IGNORECASE,
+                    )

             # Parse with BeautifulSoup for additional cleanup
             soup = BeautifulSoup(html, "html.parser")
@@ -1336,13 +1521,21 @@ def build(self):
         self.log(f"Output directory: {OUTPUT_ROOT.resolve()}")
         self.log(f"Total files: {len(self.processed_files)}")
         self.log(f"Total broken URLs removed: {self.total_broken_urls}", "SUCCESS")
-        self.log("🟣 Vibrant #6933FF sidebar styling (Script 1)", "SUCCESS")
-        self.log("🏠 Professional homepage with clear navigation (Script 2)", "SUCCESS")
-        self.log("🔗 Sidebar navigation logic with better URL processing (Updated)", "SUCCESS")
-        self.log("⚡ Selective asset copying for reduced size (Script 2)", "SUCCESS")
-        self.log("🔧 Robust error handling and progress reporting (Script 2)", "SUCCESS")
-        self.log("✅ Sidebar JavaScript URL processing FIXED", "SUCCESS")
-        self.log("✅ Broken sidebar links and empty sections removed", "SUCCESS")
+
+        # Navigation summary
+        if self.comprehensive_sidebar_html:
+            self.log("✅ Comprehensive sidebar extracted and applied to all pages", "SUCCESS")
+        else:
+            self.log("⚠️ No comprehensive sidebar found - using original individual processing", "WARNING")
+
+        self.log("🟣 Vibrant #6933FF sidebar styling", "SUCCESS")
+        self.log("🏠 Professional homepage with archived banner", "SUCCESS")
+        self.log("🔗 ORIGINAL working navigation logic restored", "SUCCESS")
+        self.log("⚡ Selective asset copying for reduced size", "SUCCESS")
+        self.log("🔧 Robust error handling and progress reporting", "SUCCESS")
+        self.log("✅ JavaScript URL processing: ORIGINAL working version", "SUCCESS")
+        self.log("✅ Filtered out non-v19.2 version links (v25.1, v24.x, etc.)", "SUCCESS")
+        self.log("✅ Broken sidebar links removed from comprehensive sidebar", "SUCCESS")

         print(f"\n🎉 Hybrid offline site built in {OUTPUT_ROOT}")
         print(f"\n📦 To test: open file://{OUTPUT_ROOT.resolve()}/index.html")