Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed ZeroDivisionError in JoinDocuments #7972

Merged
merged 35 commits into from
Jul 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
35 commits
Select commit Hold shift + click to select a range
73dd817
added new strategy DBRF
nickprock Jun 23, 2024
92efec5
fix hook
nickprock Jun 23, 2024
1988d57
fix typos
nickprock Jun 23, 2024
310d88d
added test for DBRF
nickprock Jun 23, 2024
3c8dc3a
fix format
nickprock Jun 23, 2024
5332d67
new release note
nickprock Jun 23, 2024
967c441
reformatted with black
nickprock Jun 24, 2024
fb1ad6b
Update haystack/components/joiners/document_joiner.py
nickprock Jun 25, 2024
af48479
updated comments
nickprock Jun 25, 2024
12e2113
added type-hint and return type
nickprock Jun 25, 2024
7268cbe
Merge branch 'main' into dev/dbrf
nickprock Jun 25, 2024
50a0f6b
fix
nickprock Jun 25, 2024
87f04e9
revert for lint problems
nickprock Jun 25, 2024
d1e9b9b
fix
nickprock Jul 1, 2024
72ce083
fix
nickprock Jul 1, 2024
edcc311
fix
nickprock Jul 1, 2024
eddf64e
fix
nickprock Jul 1, 2024
6b65606
another tentative
nickprock Jul 1, 2024
66607ef
dict out file
nickprock Jul 1, 2024
d5c1d5f
only output
nickprock Jul 1, 2024
13ff2f0
fix output
nickprock Jul 1, 2024
892b4e7
revert
nickprock Jul 1, 2024
6e632bb
removed unused imports
nickprock Jul 1, 2024
b287977
fix typing
anakin87 Jul 3, 2024
e802d06
Merge branch 'main' into dev/dbrf
nickprock Jul 3, 2024
dd03d51
fixed ZeroDivisionError
nickprock Jul 3, 2024
14e4d74
added test
nickprock Jul 3, 2024
5b700ea
add release note
nickprock Jul 3, 2024
f4084cd
removed try - except
nickprock Jul 3, 2024
8981938
renamed test
nickprock Jul 3, 2024
866bf8e
Update test/components/joiners/test_document_joiner.py
nickprock Jul 3, 2024
eb37fb2
Update haystack/components/joiners/document_joiner.py
nickprock Jul 3, 2024
dbd0173
fix format error
nickprock Jul 3, 2024
f4070c1
removed releasenotes/notes/release-note-9b2bc03a8a398078.yaml
nickprock Jul 3, 2024
192cae4
added comment
nickprock Jul 3, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion haystack/components/joiners/document_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,9 +195,11 @@ def _distribution_based_rank_fusion(self, document_lists: List[List[Document]])
std_dev = (sum((x - mean_score) ** 2 for x in scores_list) / len(scores_list)) ** 0.5
min_score = mean_score - 3 * std_dev
max_score = mean_score + 3 * std_dev
delta_score = max_score - min_score

for doc in documents:
doc.score = (doc.score - min_score) / (max_score - min_score)
doc.score = (doc.score - min_score) / delta_score if delta_score != 0.0 else 0.0
# if all docs have the same score delta_score is 0, the docs are uninformative for the query

output = self._concatenate(document_lists=document_lists)

Expand Down
30 changes: 30 additions & 0 deletions test/components/joiners/test_document_joiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,36 @@ def test_run_with_distribution_based_rank_fusion_join_mode(self):
]
assert all(doc.id in expected_document_ids for doc in output["documents"])

def test_run_with_distribution_based_rank_fusion_join_mode_same_scores(self):
    """
    Join two document lists with distribution-based rank fusion when one list has uniform scores.

    Every document in the first list carries the same score (0.2), so the spread of that
    list's scores is zero. The run must still complete and return the 7 distinct documents.
    Only document ids are compared against the expected documents.
    """
    fusion_joiner = DocumentJoiner(join_mode="distribution_based_rank_fusion")

    # First list: all scores are identical.
    uniform_score_docs = [Document(content=text, score=0.2) for text in ("a", "b", "c")]
    # Second list: varied scores; "a" also appears here, duplicating content from the first list.
    varied_score_docs = [
        Document(content="d", score=0.5),
        Document(content="e", score=0.8),
        Document(content="f", score=1.1, meta={"key": "value"}),
        Document(content="g", score=0.3),
        Document(content="a", score=0.3),
    ]

    result = fusion_joiner.run([uniform_score_docs, varied_score_docs])
    joined_docs = result["documents"]
    assert len(joined_docs) == 7

    expected_docs = [
        Document(content="a", score=0),
        Document(content="b", score=0),
        Document(content="c", score=0),
        Document(content="d", score=0.44),
        Document(content="e", score=0.60),
        Document(content="f", score=0.76, meta={"key": "value"}),
        Document(content="g", score=0.33),
    ]
    expected_ids = {expected.id for expected in expected_docs}
    for joined in joined_docs:
        assert joined.id in expected_ids

def test_run_with_top_k_in_run_method(self):
joiner = DocumentJoiner()
documents_1 = [Document(content="a"), Document(content="b"), Document(content="c")]
Expand Down