tests: base test examples and strategies

datasciencecampus · Aug 20, 2024 · 0c7659f · 0c7659f
1 parent 1f83415
commit 0c7659f
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 0 deletions.
diff --git a/tests/readers/base/strategies.py b/tests/readers/base/strategies.py
@@ -0,0 +1,37 @@
+"""Composite strategies for testing the base reader."""
+
+from hypothesis import strategies as st
+from langchain.docstore.document import Document
+
+from ...common import SEARCH_TERMS, ST_FREE_TEXT
+
+
+@st.composite
+def st_terms_and_texts(draw, terms=SEARCH_TERMS):
+    """Create a possibly term-ridden string."""
+
+    term = draw(st.lists(st.sampled_from(terms), max_size=1))
+    string = draw(ST_FREE_TEXT)
+    add_in = draw(st.booleans())
+
+    text = " ".join((string, *term)) if add_in else string
+
+    return term, text
+
+
+@st.composite
+def st_chunks_contains_responses(draw):
+    """Create a set of chunks, booleans, and responses for a test."""
+
+    chunks = draw(
+        st.lists(
+            ST_FREE_TEXT.map(lambda x: Document(page_content=x)),
+            min_size=1,
+            max_size=5,
+        )
+    )
+
+    contains = [True, *(draw(st.booleans()) for _ in chunks[1:])]
+    responses = [draw(ST_FREE_TEXT) for con in contains if con is True]
+
+    return chunks, contains, responses
diff --git a/tests/readers/base/test_examples.py b/tests/readers/base/test_examples.py
@@ -0,0 +1,31 @@
+"""Example tests for the base reader class."""
+
+import requests
+from bs4 import BeautifulSoup
+
+from ...common import ToyReader
+
+
+def test_does_not_match_for_extra_abbreviations():
+    """Ensure the string checker does not flag ONS+ abbreviations."""
+
+    reader = ToyReader(urls=[], terms=["ONS"])
+    strings = (
+        "The ONSR is the Only National Sandwich Ranking.",
+        "I AM UNLUCKY! SOME MIGHT SAY I AM DONSY!",
+    )
+
+    for string in strings:
+        assert not reader.check_contains_terms(string)
+
+
+def test_81_add_ons_not_matched():
+    """Ensure the example from #81 does not match."""
+
+    reader = ToyReader([], terms=["ONS"])
+    url = "https://theyworkforyou.com/wrans/?id=2024-04-12.21381.h"
+
+    response = requests.get(url)
+    soup = BeautifulSoup(response.content, "html.parser")
+
+    assert not reader.check_contains_terms(soup.get_text())