4 changes: 3 additions & 1 deletion .gitignore
@@ -6,4 +6,6 @@ target/
result
result/
submodule_collector_strace.log
*~
*~
.emacs.d/

@@ -0,0 +1,41 @@
**CRQ Title:** CRQ-056: Implement Web Spider and Corpus Builder for Project Analysis

**Problem/Goal:**
To develop a robust web spider and corpus builder capable of extracting URLs, downloading content, and hashing files from specified web sources. This tool will be used to build a comprehensive corpus for project analysis, enabling deeper insights into external dependencies, linked resources, and overall project context.

**Proposed Solution:**

1. **Develop `web_spider_corpus_builder`:**
* Implement a Rust-based application to crawl specified URLs.
* Extract all hyperlinks from crawled pages.
* Download content from extracted URLs.
* Calculate hashes of downloaded files for integrity and deduplication.
* Store extracted URLs, domain reports, file type reports, and content hashes.

2. **Integrate `url_extractor`:**
* Develop a dedicated Rust crate for efficient URL extraction from various text formats.

3. **Integrate `hash_extractor_rust`:**
* Develop a dedicated Rust crate for generating various cryptographic hashes of file contents.

4. **Integrate `url_hash_extractor`:**
* Develop a dedicated Rust crate that combines the URL extraction and content hashing functionalities (a minimal sketch of this combined flow is given after this list).

5. **Update `Cargo.toml` and `Cargo.lock`:**
* Add new dependencies for web crawling, URL parsing, and hashing libraries.
* Ensure all dependencies are correctly managed.

6. **Generate Reports:**
* `domain_report.json`: A report summarizing domains encountered.
* `extracted_urls.txt`: A list of all unique URLs extracted.
* `file_type_report.json`: A report on the types of files downloaded.
* `github_repo_counts.jsonld`: (Potentially) A report on GitHub repository links and their counts.
* `unique_urls_report.txt`: A refined list of unique URLs.
* `warnings.log`: Log file for any issues encountered during the crawling process.
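
A minimal sketch of the combined extract-and-hash flow described in items 2-4 is shown below, assuming the `regex`, `reqwest` (blocking) and `sha2` crates; the actual crate boundaries and APIs of `url_extractor`, `hash_extractor_rust`, and `url_hash_extractor` may differ:

```rust
// Sketch only: combines URL extraction and content hashing as outlined in items 2-4.
// Assumes the `regex`, `reqwest` (with the "blocking" feature) and `sha2` crates.
use regex::Regex;
use sha2::{Digest, Sha256};

/// Extract candidate http(s) URLs from arbitrary text.
fn extract_urls(text: &str) -> Vec<String> {
    let re = Regex::new(r"https?://[^\s)\]]+").expect("valid URL regex");
    re.find_iter(text).map(|m| m.as_str().to_string()).collect()
}

/// Download a URL and return the SHA-256 hex digest of its body.
fn fetch_and_hash(url: &str) -> Result<String, Box<dyn std::error::Error>> {
    let body = reqwest::blocking::get(url)?.bytes()?;
    let mut hasher = Sha256::new();
    hasher.update(&body);
    let digest = hasher.finalize();
    Ok(digest.iter().map(|b| format!("{:02x}", b)).collect())
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let sample = "See [Example](https://example.com) and https://www.iana.org/domains/example";
    for url in extract_urls(sample) {
        // Hashes can be stored next to the URL for integrity checks and deduplication.
        println!("{}  {}", fetch_and_hash(&url)?, url);
    }
    Ok(())
}
```

In the proposed design, `url_extractor` and `hash_extractor_rust` would expose these pieces separately, with `url_hash_extractor` wiring them together.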

**Justification/Impact:**
This CRQ will provide a foundational tool for expanding the scope of project analysis beyond local files. By building a comprehensive web corpus, we can:
* Identify and analyze external dependencies and their impact.
* Understand the broader context of linked resources.
* Improve the accuracy and completeness of project introspection.
* Enable future features like link rot detection and content change tracking.
45 changes: 45 additions & 0 deletions docs/sops/SOP_Web_Spider_Corpus_Building.md
@@ -0,0 +1,45 @@
## SOP: Web Spider for Corpus Building

### 1. Purpose
This Standard Operating Procedure (SOP) outlines the process for using the `web_spider_corpus_builder` tool to fetch web content from specified URLs and organize it into a local corpus. This tool is essential for gathering external textual data relevant to conceptual discussions, research, or analysis.

### 2. Tool Overview: `web_spider_corpus_builder`

#### 2.1. Location
The `web_spider_corpus_builder` tool is located in `tools/web_spider_corpus_builder/`.

#### 2.2. Functionality
The tool performs the following actions:
* Reads URLs from specified Markdown files (via command-line arguments).
* For each URL, it attempts to fetch the web page content.
* It prioritizes `text/html` content, extracting text from paragraph (`<p>`) and top-level heading (`<h1>`) tags.
* `application/pdf` responses are not parsed; the tool writes a short placeholder noting the skipped URL, since direct text extraction from PDFs is out of scope.
* Other unsupported content types are likewise recorded as skipped.
* Extracted text content is saved to *new*, individual `.txt` files within a designated output directory. Existing files are never modified or appended to.
* Basic rate limiting (1-second delay between requests) is implemented to be polite to web servers.

### 3. Usage

#### 3.1. Building the Tool
Navigate to the project root directory and build the tool using Cargo:
```bash
cargo build -p web_spider_corpus_builder
```

#### 3.2. Running the Tool
After successful compilation, run the executable from the project root, providing the paths to the Markdown files containing the URLs:
```bash
target/debug/web_spider_corpus_builder -m docs/reflection/conceptual_path_reconstruction/006_intrinsic_properties_of_5.md docs/reflection/conceptual_path_reconstruction/007_self_referential_numbers.md
```
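
The binary also accepts a plain URL list and a custom output directory (see the `Args` struct in `src/main.rs`). For example, with a hypothetical `urls.txt` containing one URL per line:
```bash
target/debug/web_spider_corpus_builder --url-file urls.txt --output-dir corpus/web_sources
```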

#### 3.3. Output
The tool will create a directory named `corpus/web_sources/` in the project root. Inside this directory, each successfully fetched and processed web page will have its extracted text content saved as a *new* `.txt` file. The filenames are derived from the URL to ensure uniqueness and readability. No existing files will be modified.
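
For reference, the derivation mirrors the logic in `src/main.rs`: the URL's host is concatenated with its path (with `/` replaced by `_`), sanitized via the `sanitize-filename` crate, and suffixed with `.txt`. A self-contained sketch:
```rust
use url::Url;

// Mirrors the filename derivation in src/main.rs.
fn corpus_file_name(url: &Url) -> String {
    sanitize_filename::sanitize(
        url.host_str().unwrap_or("unknown").to_string() + &url.path().replace('/', "_"),
    ) + ".txt"
}

fn main() {
    let page = Url::parse("https://example.com/docs/page").unwrap();
    assert_eq!(corpus_file_name(&page), "example.com_docs_page.txt");

    // A bare domain has path "/", so it maps to "example.com_.txt",
    // which is the filename the integration test expects.
    let bare = Url::parse("http://example.com").unwrap();
    assert_eq!(corpus_file_name(&bare), "example.com_.txt");
}
```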

### 4. Configuration
URLs are read from Markdown files supplied as command-line arguments (a plain list of URLs can also be passed via `--url-file`). This allows flexible input of URLs without modifying the source code.
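
A minimal input file might look like the following (the regex in `src/main.rs` picks up URLs both from inline links and as bare `http(s)://` references); the file path and contents here are illustrative:
```markdown
<!-- e.g. docs/reflection/example_sources.md (hypothetical) -->
Background reading: [Example Domain](https://example.com)
Raw reference: https://www.iana.org/domains/example
```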

### 5. Error Handling
The tool includes basic error handling for network issues and file operations. Failed fetches are reported to `stderr`, while skipped or unsupported content types are noted on `stdout`.
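
For example, a fetch that returns a non-success status is logged on `stderr` in the format used by `src/main.rs` (the URL and status shown here are illustrative):
```
Failed to fetch https://example.com/missing: Status 404 Not Found
```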

### 6. Testing
Integration tests for the `web_spider_corpus_builder` are located in `tools/web_spider_corpus_builder/tests/integration_test.rs`. These tests verify the creation of the corpus directory, the fetching and saving of HTML content, and the handling of different content types.
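
They can be run from the project root with Cargo. Note that the test fetches `http://example.com` over the network, so it needs connectivity:
```bash
cargo test -p web_spider_corpus_builder
```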
13 changes: 13 additions & 0 deletions tools/web_spider_corpus_builder/Cargo.toml
@@ -0,0 +1,13 @@
[package]
name = "web_spider_corpus_builder"
version = "0.1.0"
edition = "2021"

[dependencies]
reqwest = { version = "0.12", features = ["blocking", "json"] }
tokio = { version = "1", features = ["full"] }
scraper = "0.19"
url = "2.5"
sanitize-filename = "0.5"
clap = { version = "4", features = ["derive"] }
regex = "1.11.2"
@@ -0,0 +1,3 @@
This domain is for use in illustrative examples in documents. You may use this
domain in literature without prior coordination or asking for permission.
More information...
108 changes: 108 additions & 0 deletions tools/web_spider_corpus_builder/src/main.rs
@@ -0,0 +1,108 @@
use clap::Parser;
use reqwest::blocking::Client;
use scraper::{Html, Selector};
use std::fs;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
use url::Url;
use sanitize_filename::sanitize;
use regex::Regex;

#[derive(Parser, Debug)]
#[command(author, version, about, long_about = None)]
struct Args {
/// Paths to markdown files containing URLs to spider
#[arg(short, long, value_parser, num_args = 1.., conflicts_with = "url_file")]
markdown_files: Vec<PathBuf>,

/// Path to a file containing URLs to spider (one URL per line)
#[arg(long)]
url_file: Option<PathBuf>,

/// Output directory for the corpus
#[arg(short, long, default_value = "corpus/web_sources")]
output_dir: PathBuf,
}

fn extract_urls_from_markdown(file_path: &Path) -> Result<Vec<String>, Box<dyn std::error::Error>> {
let content = fs::read_to_string(file_path)?;
let mut urls = Vec::new();

// Regex to find URLs either inside Markdown inline links [text](url) or as bare http(s):// URLs
let re = Regex::new(r"(?i)\[[^\]]+\]\((https?://[^)]+\.[a-z]{2,6}(?:/[^)]*)?)\)|(https?://[^\s)]+\.[a-z]{2,6}(?:/[^\s)]*)?)")?;

for cap in re.captures_iter(&content) {
if let Some(url_match) = cap.get(1).or_else(|| cap.get(2)) {
urls.push(url_match.as_str().to_string());
}
}
Ok(urls)
}

fn main() -> Result<(), Box<dyn std::error::Error>> {
let args = Args::parse();

let mut all_urls: Vec<String> = Vec::new();
if let Some(url_file_path) = args.url_file {
println!("Reading URLs from: {}", url_file_path.display());
let content = fs::read_to_string(&url_file_path)?;
all_urls.extend(content.lines().map(|s| s.to_string()));
} else {
for md_file in &args.markdown_files {
println!("Extracting URLs from: {}", md_file.display());
let extracted = extract_urls_from_markdown(md_file)?;
all_urls.extend(extracted);
}
}

fs::create_dir_all(&args.output_dir)?;

let client = Client::new();

for url_str in all_urls {
println!("Fetching: {}", url_str);
let url = Url::parse(&url_str)?;
let response = client.get(&url_str).send()?;

if response.status().is_success() {
let content_type = response.headers()
.get(reqwest::header::CONTENT_TYPE)
.and_then(|value| value.to_str().ok());

let file_name = sanitize(url.host_str().unwrap_or("unknown").to_string() + &url.path().replace('/', "_")) + ".txt";
let output_path = args.output_dir.join(file_name);

if let Some(ct) = content_type {
if ct.contains("text/html") {
let html_content = response.text()?;
let document = Html::parse_document(&html_content);
let selector = Selector::parse("p, h1").unwrap(); // Extract text from paragraph and heading tags
let text_content: String = document.select(&selector)
.map(|element| element.text().collect::<String>())
.collect::<Vec<String>>()
.join("\n");
fs::write(&output_path, text_content)?;
println!("Successfully wrote HTML content to: {}", output_path.display());
} else if ct.contains("application/pdf") {
// Handle PDF by just noting it, as direct text extraction is complex
println!(" Skipping PDF: {}", url_str);
fs::write(&output_path, "PDF content from: ".to_string() + &url_str)?;
} else {
// For other content types, write a short placeholder noting the skipped URL
println!(" Skipping unsupported content type ({}) : {}", ct, url_str);
fs::write(&output_path, format!("Unsupported content type ({}) from: {}", ct, url_str))?;
}
} else {
println!(" No content type, skipping: {}", url_str);
fs::write(&output_path, format!("No content type from: {}", url_str))?;
}
} else {
eprintln!("Failed to fetch {}: Status {}", url_str, response.status());
println!("DEBUG: Fetch failed for {}: Status {}", url_str, response.status());
}
std::thread::sleep(std::time::Duration::from_secs(1)); // Be polite
}

println!("Corpus building complete in {}", args.output_dir.display());
Ok(())
}
59 changes: 59 additions & 0 deletions tools/web_spider_corpus_builder/tests/integration_test.rs
@@ -0,0 +1,59 @@
use std::fs;
use std::path::PathBuf;
use std::process::Command;

#[test]
fn test_corpus_creation_and_html_fetch_from_markdown() {
// Define a temporary output directory for testing
let test_output_dir = PathBuf::from("test_corpus/web_sources");
let test_md_file = PathBuf::from("test_data/test_urls.md");

// Clean up previous test run artifacts
if test_output_dir.exists() {
fs::remove_dir_all(&test_output_dir).unwrap();
}
if test_md_file.parent().unwrap().exists() {
fs::remove_dir_all(test_md_file.parent().unwrap()).unwrap();
}

// Create dummy markdown file with a known URL
fs::create_dir_all(test_md_file.parent().unwrap()).unwrap();
fs::write(&test_md_file, "[Example Link](http://example.com)").unwrap();

// Build the spider executable
let build_output = Command::new("cargo")
.arg("build")
.arg("-p")
.arg("web_spider_corpus_builder")
.output()
.expect("Failed to build web_spider_corpus_builder");
assert!(build_output.status.success(), "Build failed: {:?}", build_output);

// Run the spider executable with the dummy markdown file
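// NOTE: this path is hard-coded to the author's local checkout; the freshly built
// binary normally lives at target/debug/web_spider_corpus_builder under the workspace root.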
let run_output = Command::new("/data/data/com.termux.nix/files/home/pick-up-nix/source/github/meta-introspector/submodules/target/debug/web_spider_corpus_builder")
.arg("-m")
.arg(&test_md_file)
.arg("-o")
.arg(&test_output_dir)
.output()
.expect("Failed to run web_spider_corpus_builder");
println!("web_spider_corpus_builder stdout: {}", String::from_utf8_lossy(&run_output.stdout));
println!("web_spider_corpus_builder stderr: {}", String::from_utf8_lossy(&run_output.stderr));
assert!(run_output.status.success(), "Spider run failed: {:?}", run_output);

// Assertions
assert!(test_output_dir.exists());

// Check for the fetched file (filename derived from URL)
let expected_file_name = sanitize_filename::sanitize("example.com_") + ".txt";
let expected_file_path = test_output_dir.join(expected_file_name);
assert!(expected_file_path.exists());

let content = fs::read_to_string(&expected_file_path).unwrap();
assert!(content.contains("Example Domain"));
assert!(content.contains("illustrative examples"));

// Clean up after the test
fs::remove_dir_all(&test_output_dir).unwrap();
fs::remove_dir_all(test_md_file.parent().unwrap()).unwrap();
}