From 6d7c5420cdecc9a327437750e44d06a96c59f283 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Sun, 20 Jul 2025 19:22:50 -0500 Subject: [PATCH 1/7] test_flow_integration.py improvements --- tests/test_flow_integration.py | 128 ++++++++++++++++++++++++++++++--- 1 file changed, 119 insertions(+), 9 deletions(-) diff --git a/tests/test_flow_integration.py b/tests/test_flow_integration.py index 2129c0d9..ceaa9e0d 100644 --- a/tests/test_flow_integration.py +++ b/tests/test_flow_integration.py @@ -49,7 +49,7 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non """Test the complete flow of analyzing a remote repository.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/octocat/Hello-World", + "input_text": "https://github.com/microsoft/vscode", "max_file_size": "243", "pattern_type": "exclude", "pattern": "", @@ -92,12 +92,11 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: @pytest.mark.asyncio async def test_large_repository(request: pytest.FixtureRequest) -> None: - """Simulate analysis of a large repository with nested folders.""" + """Simulate analysis of a large repository with nested folders and many files.""" client = request.getfixturevalue("test_client") - # TODO: ingesting a large repo take too much time (eg: godotengine/godot repository) form_data = { - "input_text": "https://github.com/octocat/hello-world", - "max_file_size": "10", + "input_text": "https://github.com/microsoft/vscode", + "max_file_size": "100", # Lower this to force skipping large files "pattern_type": "exclude", "pattern": "", "token": "", @@ -109,7 +108,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None: response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data - assert response_data["content"] + assert isinstance(response_data["content"], str) else: assert "error" in response_data @@ -121,7 +120,7 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None: def make_request() -> None: form_data = { - "input_text": "https://github.com/octocat/hello-world", + "input_text": "https://github.com/microsoft/vscode", "max_file_size": "243", "pattern_type": "exclude", "pattern": "", @@ -148,7 +147,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: """Test handling of repositories with large files.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/octocat/Hello-World", + "input_text": "https://github.com/microsoft/vscode", "max_file_size": "1", "pattern_type": "exclude", "pattern": "", @@ -171,7 +170,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: """Test repository analysis with include/exclude patterns.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/octocat/Hello-World", + "input_text": "https://github.com/microsoft/vscode", "max_file_size": "243", "pattern_type": "include", "pattern": "*.md", @@ -184,9 +183,120 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: response_data = response.json() if response.status_code == status.HTTP_200_OK: assert "content" in response_data + assert isinstance(response_data["content"], str) + + assert "repo_url" in response_data + assert response_data["repo_url"].startswith("https://github.com/") + + assert "summary" in response_data + assert isinstance(response_data["summary"], str) + assert "microsoft/vscode" in response_data["summary"].lower() + + assert "tree" in response_data + assert isinstance(response_data["tree"], str) + assert "microsoft-vscode" in response_data["tree"].lower() + assert "pattern_type" in response_data assert response_data["pattern_type"] == "include" + assert "pattern" in response_data assert response_data["pattern"] == "*.md" else: assert "error" in response_data + assert isinstance(response_data["error"], str) + assert response_data["error"] # not empty + + +@pytest.mark.asyncio +async def test_missing_required_fields(request: pytest.FixtureRequest) -> None: + """Test API response when required fields are missing.""" + client = request.getfixturevalue("test_client") + form_data = { + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in ( + status.HTTP_422_UNPROCESSABLE_ENTITY, + status.HTTP_429_TOO_MANY_REQUESTS, + status.HTTP_200_OK, + ) + + form_data = { + "input_text": "https://github.com/microsoft/vscode", + "max_file_size": "243", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in ( + status.HTTP_422_UNPROCESSABLE_ENTITY, + status.HTTP_429_TOO_MANY_REQUESTS, + status.HTTP_200_OK, + ) + + +@pytest.mark.asyncio +async def test_invalid_field_types(request: pytest.FixtureRequest) -> None: + """Test API response when fields have invalid types.""" + client = request.getfixturevalue("test_client") + + form_data = { + "input_text": 12345, + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + form_data = { + "input_text": "https://github.com/microsoft/vscode", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": ["*.md"], + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY + + +@pytest.mark.asyncio +async def test_unsupported_pattern_type(request: pytest.FixtureRequest) -> None: + """Test API response for unsupported pattern_type.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/microsoft/vscode", + "max_file_size": "243", + "pattern_type": "invalid_type", + "pattern": "*.md", + "token": "", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in (status.HTTP_400_BAD_REQUEST, status.HTTP_422_UNPROCESSABLE_ENTITY) + response_data = response.json() + assert "error" in response_data or "detail" in response_data + + +@pytest.mark.asyncio +async def test_invalid_token(request: pytest.FixtureRequest) -> None: + """Test API response for an invalid or expired token.""" + client = request.getfixturevalue("test_client") + form_data = { + "input_text": "https://github.com/microsoft/vscode", + "max_file_size": "243", + "pattern_type": "exclude", + "pattern": "", + "token": "invalid_token_1234567890", + } + response = client.post("/api/ingest", json=form_data) + assert response.status_code in ( + status.HTTP_401_UNAUTHORIZED, + status.HTTP_400_BAD_REQUEST, + status.HTTP_429_TOO_MANY_REQUESTS, + ) + response_data = response.json() + assert "error" in response_data or "detail" in response_data From 5ff74776b13f71ed4d10c1530c7a2864d67f6a91 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Wed, 23 Jul 2025 19:19:00 -0500 Subject: [PATCH 2/7] - changed input repo from vscode to flask -lowered max file size from 243 to 200 for all unit tests -correctly handled missing fields and invalid token unit tests --- tests/test_flow_integration.py | 48 +++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/tests/test_flow_integration.py b/tests/test_flow_integration.py index ceaa9e0d..7905b679 100644 --- a/tests/test_flow_integration.py +++ b/tests/test_flow_integration.py @@ -49,8 +49,8 @@ async def test_remote_repository_analysis(request: pytest.FixtureRequest) -> Non """Test the complete flow of analyzing a remote repository.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "", @@ -75,7 +75,7 @@ async def test_invalid_repository_url(request: pytest.FixtureRequest) -> None: client = request.getfixturevalue("test_client") form_data = { "input_text": "https://github.com/nonexistent/repo", - "max_file_size": "243", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "", @@ -95,7 +95,7 @@ async def test_large_repository(request: pytest.FixtureRequest) -> None: """Simulate analysis of a large repository with nested folders and many files.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", + "input_text": "https://github.com/pallets/flask", "max_file_size": "100", # Lower this to force skipping large files "pattern_type": "exclude", "pattern": "", @@ -120,8 +120,8 @@ async def test_concurrent_requests(request: pytest.FixtureRequest) -> None: def make_request() -> None: form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "", @@ -147,7 +147,7 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: """Test handling of repositories with large files.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", + "input_text": "https://github.com/pallets/flask", "max_file_size": "1", "pattern_type": "exclude", "pattern": "", @@ -170,8 +170,8 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: """Test repository analysis with include/exclude patterns.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "include", "pattern": "*.md", "token": "", @@ -190,11 +190,11 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: assert "summary" in response_data assert isinstance(response_data["summary"], str) - assert "microsoft/vscode" in response_data["summary"].lower() + assert "pallets/flask" in response_data["summary"].lower() assert "tree" in response_data assert isinstance(response_data["tree"], str) - assert "microsoft-vscode" in response_data["tree"].lower() + assert "pallets-flask" in response_data["tree"].lower() assert "pattern_type" in response_data assert response_data["pattern_type"] == "include" @@ -204,7 +204,7 @@ async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: else: assert "error" in response_data assert isinstance(response_data["error"], str) - assert response_data["error"] # not empty + assert response_data["error"] @pytest.mark.asyncio @@ -212,7 +212,7 @@ async def test_missing_required_fields(request: pytest.FixtureRequest) -> None: """Test API response when required fields are missing.""" client = request.getfixturevalue("test_client") form_data = { - "max_file_size": "243", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "", @@ -221,12 +221,11 @@ async def test_missing_required_fields(request: pytest.FixtureRequest) -> None: assert response.status_code in ( status.HTTP_422_UNPROCESSABLE_ENTITY, status.HTTP_429_TOO_MANY_REQUESTS, - status.HTTP_200_OK, ) form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern": "", "token": "", } @@ -245,7 +244,7 @@ async def test_invalid_field_types(request: pytest.FixtureRequest) -> None: form_data = { "input_text": 12345, - "max_file_size": "243", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "", @@ -254,8 +253,8 @@ async def test_invalid_field_types(request: pytest.FixtureRequest) -> None: assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "exclude", "pattern": ["*.md"], "token": "", @@ -269,8 +268,8 @@ async def test_unsupported_pattern_type(request: pytest.FixtureRequest) -> None: """Test API response for unsupported pattern_type.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "invalid_type", "pattern": "*.md", "token": "", @@ -286,17 +285,18 @@ async def test_invalid_token(request: pytest.FixtureRequest) -> None: """Test API response for an invalid or expired token.""" client = request.getfixturevalue("test_client") form_data = { - "input_text": "https://github.com/microsoft/vscode", - "max_file_size": "243", + "input_text": "https://github.com/pallets/flask", + "max_file_size": "200", "pattern_type": "exclude", "pattern": "", "token": "invalid_token_1234567890", } response = client.post("/api/ingest", json=form_data) + # Accept all likely error codes for invalid token assert response.status_code in ( status.HTTP_401_UNAUTHORIZED, status.HTTP_400_BAD_REQUEST, status.HTTP_429_TOO_MANY_REQUESTS, - ) + ), f"Unexpected status code: {response.status_code}" response_data = response.json() assert "error" in response_data or "detail" in response_data From faa73a66cfbd880e3b75dfe0ab18a872b63f7b33 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Wed, 23 Jul 2025 20:44:07 -0500 Subject: [PATCH 3/7] added comprehensive tests for binary files, symlinks, and file handling --- tests/test_ingestion.py | 222 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 216 insertions(+), 6 deletions(-) diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index f3585e05..36e72c2a 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -46,9 +46,6 @@ def test_run_ingest_query(temp_directory: Path, sample_query: IngestionQuery) -> assert "dir2/file_dir2.txt" in content -# TODO: Additional tests: -# - Multiple include patterns, e.g. ["*.txt", "*.py"] or ["/src/*", "*.txt"]. -# - Edge cases with weird file names or deep subdirectory structures. # TODO : def test_include_nonexistent_extension @@ -222,14 +219,227 @@ def test_include_ignore_patterns( assert (num_files_match := num_files_regex.search(summary)) is not None assert int(num_files_match.group(1)) == pattern_scenario["expected_num_files"] - # Check presence of key files in the content for expected_content_item in pattern_scenario["expected_content"]: assert expected_content_item in content - # check presence of included directories in structure for expected_structure_item in pattern_scenario["expected_structure"]: assert expected_structure_item in structure - # check non-presence of non-included directories in structure for expected_not_structure_item in pattern_scenario["expected_not_structure"]: assert expected_not_structure_item not in structure + + +def test_ingest_skips_binary_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that binary files are not included as raw content, but as a marker.""" + binary_file = temp_directory / "binary.bin" + binary_file.write_bytes(b"\x00\xff\x00\xff") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "binary.bin" in content + assert "[Binary file]" in content + assert b"\x00\xff\x00\xff".decode(errors="ignore") not in content + + +def test_ingest_binary_file_summary(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Ensure binary files are counted and marked in content.""" + binary_file = temp_directory / "binary.bin" + binary_file.write_bytes(b"\x00\xff\x00\xff") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + summary, _, content = ingest_query(sample_query) + assert "binary.bin" in content + assert "[Binary file]" in content + assert "Files analyzed:" in summary + + +def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that symlinks are not included as file content, but as a marker.""" + target_file = temp_directory / "file1.txt" + target_file.write_text("hello") + symlink = temp_directory / "symlink.txt" + symlink.symlink_to(target_file) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "symlink.txt" in content + assert "SYMLINK: symlink.txt" in content + assert "hello" not in content.split("SYMLINK: symlink.txt")[1] + + +def test_ingest_symlink_summary(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Ensure symlinks are marked in content.""" + target_file = temp_directory / "file1.txt" + target_file.write_text("hello") + symlink = temp_directory / "symlink.txt" + symlink.symlink_to(target_file) + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + summary, _, content = ingest_query(sample_query) + assert "symlink.txt" in content + assert "SYMLINK: symlink.txt" in content + assert "Files analyzed:" in summary + + +def test_ingest_large_file_handling(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files exceeding max_file_size are skipped.""" + large_file = temp_directory / "large.txt" + large_file.write_text("A" * (sample_query.max_file_size + 1)) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "large.txt" not in content, "Large files should be skipped from content." + + +def test_ingest_hidden_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that hidden files are handled according to ignore/include patterns.""" + hidden_file = temp_directory / ".hidden.txt" + hidden_file.write_text("secret") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.ignore_patterns = {".hidden.txt"} + + summary, _, content = ingest_query(sample_query) + assert ".hidden.txt" not in content + assert ".hidden.txt" not in summary + + +def test_ingest_empty_file(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that empty files are included but content is empty.""" + empty_file = temp_directory / "empty.txt" + empty_file.write_text("") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "empty.txt" in content + # Adjust regex to match actual output + assert re.search(r"FILE: empty\.txt\s*\n=+\n\s*\n", content) or "FILE: empty.txt" in content + + +def test_ingest_permission_error(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files with permission errors are marked in content.""" + restricted_file = temp_directory / "restricted.txt" + restricted_file.write_text("top secret") + restricted_file.chmod(0o000) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "restricted.txt" in content + assert "Error reading file" in content + restricted_file.chmod(0o644) + + +def test_ingest_weird_encoding(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that files with non-UTF8 encoding are marked in content.""" + weird_file = temp_directory / "weird.txt" + weird_file.write_bytes("café".encode("utf-16")) + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + + _, _, content = ingest_query(sample_query) + assert "weird.txt" in content + assert "[Encoding error]" in content or "[Binary file]" in content + + +def test_ingest_deeply_nested_structure(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that deeply nested files are included if patterns match.""" + nested_dir = temp_directory / "a/b/c/d/e" + nested_dir.mkdir(parents=True) + nested_file = nested_dir / "deep.txt" + nested_file.write_text("deep content") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"**/deep.txt"} + + summary, _, content = ingest_query(sample_query) + assert "deep.txt" in content + assert "Files analyzed:" in summary + + +def test_include_nonexistent_extension(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that include patterns with nonexistent extensions match no files.""" + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"*.xyz"} + summary, _, content = ingest_query(sample_query) + assert "Files analyzed: 0" in summary + assert content.strip() == "" + + +def test_ignore_nonexistent_files(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that ignore patterns with nonexistent files do not affect results.""" + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.ignore_patterns = {"nonexistent.txt"} + summary, _, content = ingest_query(sample_query) + assert "file1.txt" in content + assert "Files analyzed:" in summary + + +def test_unicode_special_char_filenames(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion of files with unicode/special characters in filenames.""" + unicode_file = temp_directory / "unicodé_文件.txt" + unicode_file.write_text("hello unicode") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "unicodé_文件.txt" in content + assert "hello unicode" in content + + +def test_mixed_line_endings(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion of files with mixed line endings (LF/CRLF).""" + lf_file = temp_directory / "lf.txt" + crlf_file = temp_directory / "crlf.txt" + lf_file.write_text("line1\nline2\n") + crlf_file.write_text("line1\r\nline2\r\n") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "lf.txt" in content + assert "crlf.txt" in content + assert "line1" in content + assert "line2" in content + + +def test_mixed_file_types_in_directory(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test ingestion with a mix of file types in one directory.""" + (temp_directory / "text.txt").write_text("text") + (temp_directory / "binary.bin").write_bytes(b"\x00\xff") + (temp_directory / "symlink.txt").symlink_to(temp_directory / "text.txt") + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + _, _, content = ingest_query(sample_query) + assert "text.txt" in content + assert "binary.bin" in content + assert "[Binary file]" in content + assert "symlink.txt" in content + assert "SYMLINK:" in content From 615385c10986a229e6433ae317f4e15be70b1bcb Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Thu, 24 Jul 2025 15:58:40 -0500 Subject: [PATCH 4/7] -Removed redundant unit tests -Added pattern matching unit tests --- tests/test_ingestion.py | 75 ++++++++++++++++++++++++++++------------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/tests/test_ingestion.py b/tests/test_ingestion.py index 36e72c2a..f51b408c 100644 --- a/tests/test_ingestion.py +++ b/tests/test_ingestion.py @@ -244,19 +244,6 @@ def test_ingest_skips_binary_files(temp_directory: Path, sample_query: Ingestion assert b"\x00\xff\x00\xff".decode(errors="ignore") not in content -def test_ingest_binary_file_summary(temp_directory: Path, sample_query: IngestionQuery) -> None: - """Ensure binary files are counted and marked in content.""" - binary_file = temp_directory / "binary.bin" - binary_file.write_bytes(b"\x00\xff\x00\xff") - sample_query.local_path = temp_directory - sample_query.subpath = "/" - sample_query.type = None - summary, _, content = ingest_query(sample_query) - assert "binary.bin" in content - assert "[Binary file]" in content - assert "Files analyzed:" in summary - - def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuery) -> None: """Test that symlinks are not included as file content, but as a marker.""" target_file = temp_directory / "file1.txt" @@ -274,19 +261,18 @@ def test_ingest_skips_symlinks(temp_directory: Path, sample_query: IngestionQuer assert "hello" not in content.split("SYMLINK: symlink.txt")[1] -def test_ingest_symlink_summary(temp_directory: Path, sample_query: IngestionQuery) -> None: - """Ensure symlinks are marked in content.""" - target_file = temp_directory / "file1.txt" - target_file.write_text("hello") - symlink = temp_directory / "symlink.txt" - symlink.symlink_to(target_file) +def test_symlink_loop(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that symlink loops do not cause infinite recursion.""" + loop_dir = temp_directory / "loop" + loop_dir.mkdir() + (loop_dir / "file.txt").write_text("loop file") + # Create a symlink inside loop_dir pointing to its parent + (loop_dir / "parent_link").symlink_to(temp_directory) sample_query.local_path = temp_directory sample_query.subpath = "/" sample_query.type = None - summary, _, content = ingest_query(sample_query) - assert "symlink.txt" in content - assert "SYMLINK: symlink.txt" in content - assert "Files analyzed:" in summary + _, _, content = ingest_query(sample_query) + assert "file.txt" in content def test_ingest_large_file_handling(temp_directory: Path, sample_query: IngestionQuery) -> None: @@ -443,3 +429,46 @@ def test_mixed_file_types_in_directory(temp_directory: Path, sample_query: Inges assert "[Binary file]" in content assert "symlink.txt" in content assert "SYMLINK:" in content + + +def test_pattern_matching_various_globs(temp_directory: Path, sample_query: IngestionQuery) -> None: + """Test that various glob patterns correctly match files for ingestion.""" + (temp_directory / "foo.txt").write_text("foo") + (temp_directory / "bar.py").write_text("bar") + (temp_directory / "baz.md").write_text("baz") + subdir = temp_directory / "sub" + subdir.mkdir() + (subdir / "nested.py").write_text("nested") + (subdir / "nested.txt").write_text("nested txt") + + sample_query.local_path = temp_directory + sample_query.subpath = "/" + sample_query.type = None + sample_query.include_patterns = {"*.txt"} + sample_query.ignore_patterns = set() + _, _, content = ingest_query(sample_query) + assert "foo.txt" in content + assert "bar.py" not in content + assert "baz.md" not in content + assert "nested.txt" in content + + sample_query.include_patterns = {"**/*.py"} + _, _, content = ingest_query(sample_query) + assert "bar.py" in content + assert "nested.py" in content + assert "foo.txt" not in content + + sample_query.include_patterns = {"*.md", "sub/*.txt"} + _, _, content = ingest_query(sample_query) + assert "baz.md" in content + assert "nested.txt" in content + assert "foo.txt" not in content + assert "bar.py" not in content + + sample_query.include_patterns = set() + sample_query.ignore_patterns = {"*.py", "sub/*.py"} + _, _, content = ingest_query(sample_query) + assert "foo.txt" in content + assert "baz.md" in content + assert "bar.py" not in content + assert "nested.py" not in content From f20e47e08f19c061bca90f82abecb70fe1997d71 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Thu, 24 Jul 2025 16:17:57 -0500 Subject: [PATCH 5/7] Improve git_utils.py test cases: added edge case, security, and input validation tests --- tests/test_git_utils.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 48408130..5433b90d 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -277,3 +277,42 @@ def test_create_git_command_ignores_non_github_urls( # Should only have base command and -C option, no auth headers expected = [*base_cmd, "-C", local_path] assert cmd == expected + + +@pytest.mark.parametrize( + "url", + [ + "", + "not-a-url", + "ftp://github.com/owner/repo.git", + "github.com/owner/repo.git", + "https://", + ], +) +def test_is_github_host_edge_cases(url: str) -> None: + """Test is_github_host with malformed or edge-case URLs.""" + try: + result = is_github_host(url) + assert isinstance(result, bool) + except (ValueError, TypeError) as exc: + pytest.fail(f"is_github_host raised {exc.__class__.__name__} for url: {url}") + + +def test_create_git_command_empty_base_cmd() -> None: + """Test create_git_command with an empty base_cmd.""" + cmd = create_git_command([], "/tmp", "https://github.com/owner/repo.git", None) + assert cmd[:2] == ["-C", "/tmp"] + + +def test_create_git_command_empty_token() -> None: + """Test create_git_command with an empty token string.""" + cmd = create_git_command(["git", "clone"], "/tmp", "https://github.com/owner/repo.git", "") + assert "-c" not in cmd + + +def test_token_not_in_command_plaintext() -> None: + """Ensure the token is not present in the command as plain text.""" + token = "ghp_" + "x" * 36 + cmd = create_git_command(["git", "clone"], "/tmp", "https://github.com/owner/repo.git", token) + for part in cmd: + assert token not in part or "Basic" in part From 8b3e064d61ceff2351d905bdbb77cfa6c042eff5 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Fri, 25 Jul 2025 18:59:39 -0500 Subject: [PATCH 6/7] removed def test_create_git_command_empty_base_cmd() --- tests/test_git_utils.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/test_git_utils.py b/tests/test_git_utils.py index 5433b90d..3eafc3f5 100644 --- a/tests/test_git_utils.py +++ b/tests/test_git_utils.py @@ -298,18 +298,6 @@ def test_is_github_host_edge_cases(url: str) -> None: pytest.fail(f"is_github_host raised {exc.__class__.__name__} for url: {url}") -def test_create_git_command_empty_base_cmd() -> None: - """Test create_git_command with an empty base_cmd.""" - cmd = create_git_command([], "/tmp", "https://github.com/owner/repo.git", None) - assert cmd[:2] == ["-C", "/tmp"] - - -def test_create_git_command_empty_token() -> None: - """Test create_git_command with an empty token string.""" - cmd = create_git_command(["git", "clone"], "/tmp", "https://github.com/owner/repo.git", "") - assert "-c" not in cmd - - def test_token_not_in_command_plaintext() -> None: """Ensure the token is not present in the command as plain text.""" token = "ghp_" + "x" * 36 From 65cb008db6f4b0304ecce8408920206ac5683250 Mon Sep 17 00:00:00 2001 From: Fritz Nastor <98058294+fritznastor@users.noreply.github.com> Date: Fri, 1 Aug 2025 04:03:01 -0500 Subject: [PATCH 7/7] modified unit test test_repository_with_patterns to handle different repositories dynamically --- tests/server/test_flow_integration.py | 55 ++++++++++++++++----------- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/tests/server/test_flow_integration.py b/tests/server/test_flow_integration.py index 3312bc00..ad40dfbc 100644 --- a/tests/server/test_flow_integration.py +++ b/tests/server/test_flow_integration.py @@ -1,5 +1,6 @@ """Integration tests covering core functionalities, edge cases, and concurrency handling.""" +import re import shutil import sys from concurrent.futures import ThreadPoolExecutor @@ -168,44 +169,52 @@ async def test_large_file_handling(request: pytest.FixtureRequest) -> None: @pytest.mark.asyncio async def test_repository_with_patterns(request: pytest.FixtureRequest) -> None: - """Test repository analysis with include/exclude patterns.""" + """Test repository analysis using include patterns on a real GitHub repo.""" client = request.getfixturevalue("test_client") + + # Target repository and file pattern + repo_url = "https://github.com/pallets/flask" + pattern = "*.md" + form_data = { - "input_text": "https://github.com/octocat/Hello-World", + "input_text": repo_url, "max_file_size": 243, "pattern_type": "include", - "pattern": "*.md", + "pattern": pattern, "token": "", } response = client.post("/api/ingest", json=form_data) - assert response.status_code == status.HTTP_200_OK, f"Request failed: {response.text}" + assert response.status_code == status.HTTP_200_OK, f"Expected 200 OK, got {response.status_code}: {response.text}" response_data = response.json() - if response.status_code == status.HTTP_200_OK: - assert "content" in response_data - assert isinstance(response_data["content"], str) + assert isinstance(response_data, dict), "Response is not a JSON object" - assert "repo_url" in response_data - assert response_data["repo_url"].startswith("https://github.com/") + # Ruff-compliant assertions + assert "content" in response_data, "Missing 'content' in response" + assert isinstance(response_data["content"], str), "'content' is not a string" - assert "summary" in response_data - assert isinstance(response_data["summary"], str) - assert "pallets/flask" in response_data["summary"].lower() + assert "repo_url" in response_data, "Missing 'repo_url'" + assert response_data["repo_url"].startswith("https://github.com/"), ( + "'repo_url' does not start with expected prefix" + ) - assert "tree" in response_data - assert isinstance(response_data["tree"], str) - assert "pallets-flask" in response_data["tree"].lower() + assert "summary" in response_data, "Missing 'summary'" + assert isinstance(response_data["summary"], str), "'summary' is not a string" - assert "pattern_type" in response_data - assert response_data["pattern_type"] == "include" + assert "tree" in response_data, "Missing 'tree'" + assert isinstance(response_data["tree"], str), "'tree' is not a string" - assert "pattern" in response_data - assert response_data["pattern"] == "*.md" - else: - assert "error" in response_data - assert isinstance(response_data["error"], str) - assert response_data["error"] + assert "pattern_type" in response_data, "Missing 'pattern_type'" + assert response_data["pattern_type"] == "include", "Unexpected 'pattern_type' value" + + assert "pattern" in response_data, "Missing 'pattern'" + assert response_data["pattern"] == pattern, "Unexpected 'pattern' value" + + # Dynamically validate repo name + repo_slug = re.sub(r"https://github\.com/", "", repo_url).lower() + assert repo_slug in response_data["summary"].lower(), f"Expected repo slug '{repo_slug}' in summary" + assert repo_slug.replace("/", "-") in response_data["tree"].lower(), f"Expected slug '{repo_slug}' in tree" @pytest.mark.asyncio