Skip to content

Commit

Permalink
prepare tests for examples and run them
Browse files Browse the repository at this point in the history
  • Loading branch information
sh-rp committed Mar 25, 2024
1 parent 78ae021 commit 4f5fe0b
Show file tree
Hide file tree
Showing 5 changed files with 81 additions and 26 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ lint-and-test-snippets:
lint-and-test-examples:
poetry run mypy --config-file mypy.ini docs/examples
poetry run flake8 --max-line-length=200 docs/examples
cd docs/tools && poetry run python prepare_examples_tests.py
cd docs/examples && poetry run pytest


Expand Down
1 change: 1 addition & 0 deletions docs/examples/chess_production/.dlt/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
chess_url="https://api.chess.com/pub/"
1 change: 0 additions & 1 deletion docs/examples/chess_production/chess_production.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from dlt.common.runtime.slack import send_slack_message



@dlt.source
def chess(
chess_url: str = dlt.config.value,
Expand Down
51 changes: 26 additions & 25 deletions docs/examples/pdf_to_weaviate/pdf_to_weaviate.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,28 +57,29 @@ def pdf_to_text(file_item, separate_pages: bool = False):
yield page_item


pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")

# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
# (3) sends them to pdf_to_text transformer with pipe (|) operator
pdf_pipeline = list_files("assets/invoices").add_filter(
lambda item: item["file_name"].endswith(".pdf")
) | pdf_to_text(separate_pages=True)

# set the name of the destination table to receive pages
# NOTE: in Weaviate, dlt's tables are mapped to classes
pdf_pipeline.table_name = "InvoiceText"

# use weaviate_adapter to tell destination to vectorize "text" column
load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text"))
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

import weaviate

client = weaviate.Client("http://localhost:8080")
# get text of all the invoices in InvoiceText class we just created above
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
assert_load_info(load_info)
if __name__ == "__main__":
pipeline = dlt.pipeline(pipeline_name="pdf_to_text", destination="weaviate")

# this constructs a simple pipeline that: (1) reads files from "invoices" folder (2) filters only those ending with ".pdf"
# (3) sends them to pdf_to_text transformer with pipe (|) operator
pdf_pipeline = list_files("assets/invoices").add_filter(
lambda item: item["file_name"].endswith(".pdf")
) | pdf_to_text(separate_pages=True)

# set the name of the destination table to receive pages
# NOTE: in Weaviate, dlt's tables are mapped to classes
pdf_pipeline.table_name = "InvoiceText"

# use weaviate_adapter to tell destination to vectorize "text" column
load_info = pipeline.run(weaviate_adapter(pdf_pipeline, vectorize="text"))
row_counts = pipeline.last_trace.last_normalize_info
print(row_counts)
print("------")
print(load_info)

import weaviate

client = weaviate.Client("http://localhost:8080")
# get text of all the invoices in InvoiceText class we just created above
print(client.query.get("InvoiceText", ["text", "file_name", "mtime", "page_id"]).do())
assert_load_info(load_info)
53 changes: 53 additions & 0 deletions docs/tools/prepare_examples_tests.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""
Creates the pytest files for our example tests. The generated files will not be committed.
"""
import os
import dlt.cli.echo as fmt

EXAMPLES_DIR = "../examples"

# settings
SKIP_FOLDERS = ["archive", ".dlt", "__pycache__"]

# the entry point for the script
MAIN_CLAUSE = 'if __name__ == "__main__":'

# some stuff to insert for setting up and tearing down fixtures
TEST_HEADER = """
from tests.utils import skipifgithubfork
"""


if __name__ == "__main__":
count = 0
for example in next(os.walk(EXAMPLES_DIR))[1]:
if example in SKIP_FOLDERS:
continue
count += 1

example_file = f"{EXAMPLES_DIR}/{example}/{example}.py"
test_example_file = f"{EXAMPLES_DIR}/{example}/test_{example}.py"

with open(example_file, "r", encoding="utf-8") as f:
lines = f.read().split("\n")

processed_lines = TEST_HEADER.split("\n")
main_clause_found = False

for line in lines:
if line.startswith(MAIN_CLAUSE):
main_clause_found = True
processed_lines.append("@skipifgithubfork")
processed_lines.append(f"def test_{example}():")
else:
processed_lines.append(line)

if not main_clause_found:
fmt.error(f"No main clause defined for example {example}")
exit(1)

with open(test_example_file, "w", encoding="utf-8") as f:
f.write("\n".join(processed_lines))

fmt.note(f"Prepared {count} examples for testing.")

0 comments on commit 4f5fe0b

Please sign in to comment.