Skip to content

Commit

Permalink
allow to use multiple retrieval backends
Browse files Browse the repository at this point in the history
  • Loading branch information
mam10eks committed Nov 12, 2024
1 parent 22d1864 commit 94b0803
Show file tree
Hide file tree
Showing 7 changed files with 104 additions and 0 deletions.
17 changes: 17 additions & 0 deletions chatnoir_api/cache.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,3 +33,20 @@ def cache_contents(
)
response.raise_for_status()
return response.text

def term_vectors(
        trec_id: str,
        index: Index,
        timeout: int = DEFAULT_TIMEOUT,
) -> dict:
    """Fetch the term vectors of a single document.

    Issues a GET request against the ``_termvectors`` endpoint of the
    ChatNoir web-content host and returns the decoded JSON body.

    :param trec_id: TREC identifier of the document, sent as the
        ``trec-id`` query parameter.
    :param index: Index that contains the document; it is translated with
        ``index_id`` and sent as the ``index`` query parameter.
    :param timeout: Timeout forwarded to the HTTP request.
    :return: The parsed JSON payload of the response (not the raw text).
    :raises: Whatever ``response.raise_for_status()`` raises when the
        server answers with an error status.
    """
    response: Response = get(
        urljoin('https://chatnoir-webcontent.web.webis.de/', "_termvectors"),
        params={
            "trec-id": trec_id,
            "index": index_id(index),
        },
        timeout=timeout,
    )
    response.raise_for_status()
    # The payload is JSON, so hand back the decoded object rather than text.
    return response.json()
1 change: 1 addition & 0 deletions chatnoir_api/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@
DEFAULT_BACKOFF_SECONDS: int = 1
DEFAULT_TIMEOUT: int = 60
# Retrieval system requested when the caller does not select one explicitly.
DEFAULT_RETRIEVAL_SYSTEM: str = 'default'
# Note: This public API key has a small request budget. If you want to use ChatNoir more extensively, please request an API key at https://chatnoir.web.webis.de/apikey/
DEFAULT_API_KEY: str = "LTmnNLQeQvBlNjwWeuNxz1vdya3HpSzN"
2 changes: 2 additions & 0 deletions chatnoir_api/v1/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Request(DataClassJsonMixin):
explain: bool
minimal: bool
extended_meta: bool
search_method: str


@dataclass(frozen=True)
Expand Down Expand Up @@ -103,6 +104,7 @@ class MetaResponse(Meta, DataClassJsonMixin):
)
query_time: int
total_results: int
search_method: str


@dataclass(frozen=True)
Expand Down
5 changes: 5 additions & 0 deletions chatnoir_api/v1/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
DEFAULT_RETRIES,
DEFAULT_BACKOFF_SECONDS,
DEFAULT_EXTENDED_META,
DEFAULT_RETRIEVAL_SYSTEM,
)
from chatnoir_api.v1.model import (
MinimalSearchResponse,
Expand Down Expand Up @@ -180,6 +181,7 @@ def search(
retries: int = DEFAULT_RETRIES,
backoff_seconds: float = DEFAULT_BACKOFF_SECONDS,
api_key: str = DEFAULT_API_KEY,
retrieval_system: str = DEFAULT_RETRIEVAL_SYSTEM
) -> Results[
Union[Meta, ExtendedMeta],
Union[
Expand Down Expand Up @@ -209,6 +211,7 @@ def load_page(start: int, size: int) -> Results[
size=size,
retries=retries,
backoff_seconds=backoff_seconds,
retrieval_system=retrieval_system,
)

return LazyResultSequence(
Expand Down Expand Up @@ -371,6 +374,7 @@ def search_page(
retries: int = DEFAULT_RETRIES,
backoff_seconds: float = DEFAULT_BACKOFF_SECONDS,
api_key: str = DEFAULT_API_KEY,
retrieval_system: str = DEFAULT_RETRIEVAL_SYSTEM
) -> Results[
Union[Meta, ExtendedMeta],
Union[
Expand Down Expand Up @@ -406,6 +410,7 @@ def search_page(
explain=explain,
minimal=minimal,
extended_meta=extended_meta,
search_method=retrieval_system,
)
if not extended_meta:
if minimal:
Expand Down
1 change: 1 addition & 0 deletions chatnoir_api/v1/search_phrases.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,6 +431,7 @@ def search_phrases_page(
minimal=minimal,
extended_meta=extended_meta,
slop=slop,
search_method='default'
)
if not extended_meta:
if minimal:
Expand Down
40 changes: 40 additions & 0 deletions tests/test_search_for_different_systems.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
from typing import Sequence

from pytest import fixture

from chatnoir_api import Index, Meta, Results, MinimalResult, \
Result, ExplainedMinimalResult, ExtendedMeta
from chatnoir_api.v1 import search_page, search


@fixture(params=[True, False])
def minimal(request) -> bool:
    """Parametrized fixture that supplies ``True`` and ``False`` in turn."""
    flag = request.param
    return flag


def test_default_retrieval_model(api_key: str) -> None:
    """Search with the ``default`` retrieval system and check the metadata."""
    # NOTE(review): the ``api_key`` fixture is requested but not forwarded to
    # ``search``, so the library's default key is used - confirm intended.
    meta = search(
        "how to find a test dataset",
        index='msmarco-document-v2.1',
        retrieval_system='default',
    ).meta

    assert meta is not None
    assert isinstance(meta, Meta)

    # The expected hit count is pinned to the state of the live index.
    assert meta.total_results is not None
    assert isinstance(meta.total_results, int)
    assert meta.total_results == 5517
    assert meta.search_method == 'default'

def test_bm25_retrieval_model(api_key: str) -> None:
    """Search with the ``bm25`` retrieval system and check the metadata."""
    # NOTE(review): the ``api_key`` fixture is requested but not forwarded to
    # ``search``, so the library's default key is used - confirm intended.
    meta = search(
        "how to find a test dataset",
        index='msmarco-document-v2.1',
        retrieval_system='bm25',
    ).meta

    assert meta is not None
    assert isinstance(meta, Meta)

    # The expected hit count is pinned to the state of the live index.
    assert meta.total_results is not None
    assert isinstance(meta.total_results, int)
    assert meta.total_results == 2800000
    assert meta.search_method == 'bm25'

38 changes: 38 additions & 0 deletions tests/test_term_vectors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from chatnoir_api.cache import term_vectors

def test_ms_marco_document_1() -> None:
    """Check the statistics of the term ``hidden`` in the first document."""
    payload = term_vectors(
        trec_id='msmarco_doc_16_4126868268',
        index='msmarco-document-v2',
    )

    # Walk down the nested response one level at a time so that a missing
    # key fails on its own assertion.
    assert payload is not None
    assert 'term_vectors' in payload
    vectors = payload['term_vectors']
    assert 'body_lang_en' in vectors
    body = vectors['body_lang_en']
    assert 'terms' in body
    body_terms = body['terms']

    assert 'hidden' in body_terms
    stats = body_terms['hidden']
    assert stats['doc_freq'] == 25730
    assert stats['ttf'] == 37614
    assert stats['term_freq'] == 1

def test_ms_marco_document_2() -> None:
    """Check the statistics of the term ``classifier`` in the second document."""
    payload = term_vectors(
        trec_id='msmarco_doc_56_310927995',
        index='msmarco-document-v2',
    )

    # Walk down the nested response one level at a time so that a missing
    # key fails on its own assertion.
    assert payload is not None
    assert 'term_vectors' in payload
    vectors = payload['term_vectors']
    assert 'body_lang_en' in vectors
    body = vectors['body_lang_en']
    assert 'terms' in body
    body_terms = body['terms']

    assert 'classifier' in body_terms
    stats = body_terms['classifier']
    assert stats['doc_freq'] == 132
    assert stats['ttf'] == 442
    assert stats['term_freq'] == 6



0 comments on commit 94b0803

Please sign in to comment.