Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

HTML Search: omit anchor reference from document titles in the search index. #12047

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e951f65
searchindex: omit the section reference when indexing each document's…
jayaddison Mar 4, 2024
5e05a27
refactor: move the fix to the wordcollector node visitor
jayaddison Mar 4, 2024
d44dc11
typing: emit empty-string instead of null for document title anchor v…
jayaddison Mar 4, 2024
a978110
tests: add failing test to catch a regression; other top-level sectio…
jayaddison Mar 4, 2024
ed47ab9
fixup: only omit the anchor for the first-encountered title
jayaddison Mar 4, 2024
26d6e3c
Revert "typing: emit empty-string instead of null for document title …
jayaddison Mar 4, 2024
5c75c81
linting: add ignore-E501 to overlong line
jayaddison Mar 4, 2024
3c557cd
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Mar 15, 2024
b8177c5
Add CHANGES.rst entry
jayaddison Mar 15, 2024
d3a40d8
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Mar 25, 2024
f794764
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Apr 13, 2024
7c06fe9
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jun 10, 2024
e947f6b
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jun 13, 2024
9fef43b
[tests] search: regenerate test fixtures using 'utils/generate_js_fix…
jayaddison Jun 13, 2024
ad3488e
[docs] Move changelog entry to correct release, and remove preceding …
jayaddison Jun 13, 2024
dcdd743
[search] refactor title link ID logic to avoid E501 lint error.
jayaddison Jun 14, 2024
30d626e
[search] lint fixup: allow node-id reference in WordStore.titles to b…
jayaddison Jun 14, 2024
86fd24c
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jun 18, 2024
c7245d2
Fixup: remove excess heading delineation characters.
jayaddison Jun 18, 2024
2857c7b
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jun 24, 2024
520a205
[tests] JavaScript: remove workaround test data from mainline that is…
jayaddison Jun 24, 2024
64387c4
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jul 4, 2024
b349764
Code review feedback: add explanatory comments regarding runtime vs s…
jayaddison Jul 6, 2024
b256267
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
jayaddison Jul 6, 2024
964d179
Merge branch 'master' into issue-11961/searchindex-omit-doctitle-anchors
picnixz Jul 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.rst
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ Bugs fixed
* #12494: Fix invalid genindex.html file produced with translated docs
(regression in 7.1.0).
Patch by Nicolas Peugnet.
* #11961: Omit anchor references from document title entries in the search index,
removing duplication of search results.
Patch by James Addison.

Testing
-------
Expand Down
4 changes: 2 additions & 2 deletions sphinx/environment/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,15 +253,15 @@ def __init__(self, app: Sphinx) -> None:
# search index data

# docname -> title
self._search_index_titles: dict[str, str] = {}
self._search_index_titles: dict[str, str | None] = {}
# docname -> filename
self._search_index_filenames: dict[str, str] = {}
# stemmed words -> set(docname)
self._search_index_mapping: dict[str, set[str]] = {}
# stemmed words in titles -> set(docname)
self._search_index_title_mapping: dict[str, set[str]] = {}
# docname -> all titles in document
self._search_index_all_titles: dict[str, list[tuple[str, str]]] = {}
self._search_index_all_titles: dict[str, list[tuple[str, str | None]]] = {}
# docname -> list(index entry)
self._search_index_index_entries: dict[str, list[tuple[str, str, str]]] = {}
# objtype -> index
Expand Down
20 changes: 14 additions & 6 deletions sphinx/search/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ def _is_meta_keywords(
@dataclasses.dataclass
class WordStore:
words: list[str] = dataclasses.field(default_factory=list)
titles: list[tuple[str, str]] = dataclasses.field(default_factory=list)
titles: list[tuple[str, str | None]] = dataclasses.field(default_factory=list)
title_words: list[str] = dataclasses.field(default_factory=list)


Expand Down Expand Up @@ -253,15 +253,15 @@ class IndexBuilder:
def __init__(self, env: BuildEnvironment, lang: str, options: dict[str, str], scoring: str) -> None:
self.env = env
# docname -> title
self._titles: dict[str, str] = env._search_index_titles
self._titles: dict[str, str | None] = env._search_index_titles
# docname -> filename
self._filenames: dict[str, str] = env._search_index_filenames
# stemmed words -> set(docname)
self._mapping: dict[str, set[str]] = env._search_index_mapping
# stemmed words in titles -> set(docname)
self._title_mapping: dict[str, set[str]] = env._search_index_title_mapping
# docname -> all titles in document
self._all_titles: dict[str, list[tuple[str, str]]] = env._search_index_all_titles
self._all_titles: dict[str, list[tuple[str, str | None]]] = env._search_index_all_titles
# docname -> list(index entry)
self._index_entries: dict[str, list[tuple[str, str, str]]] = env._search_index_index_entries
# objtype -> index
Expand Down Expand Up @@ -369,6 +369,13 @@ def get_objects(self, fn2index: dict[str, int]
return rv

def get_terms(self, fn2index: dict[str, int]) -> tuple[dict[str, list[int] | int], dict[str, list[int] | int]]:
"""
Return a mapping of document and title terms to their corresponding sorted document IDs.

When a term is only found within a single document, then the value for that term will be
an integer value. When a term is found within multiple documents, the value will be a list
of integers.
"""
rvs: tuple[dict[str, list[int] | int], dict[str, list[int] | int]] = ({}, {})
for rv, mapping in zip(rvs, (self._mapping, self._title_mapping)):
for k, v in mapping.items():
Expand All @@ -391,7 +398,7 @@ def freeze(self) -> dict[str, Any]:
objtypes = {v: k[0] + ':' + k[1] for (k, v) in self._objtypes.items()}
objnames = self._objnames

alltitles: dict[str, list[tuple[int, str]]] = {}
alltitles: dict[str, list[tuple[int, str | None]]] = {}
Copy link
Member

@picnixz picnixz Jun 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you explain the shape of this value, i.e., to what corresponds list[tuple[int, str | None] (what is the integer, what is the str | None and) and what is the key str of the dict, and why does it change from the all_titles ? (the list has pairs of int and strings and not pairs of strings anymore).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Mm, this is not very clear at the moment, but I believe that the integer values here are what I'd call documentId values - sequential IDs assigned to each of the source .rst files at build-time. The second value in each tuple -- the str or empty value -- is a target reference (aka anchor when talking about HTML) -- it's an indicator about what destination on the page/document the user should be taken to were they to navigate using this entry.

Perhaps this is a situation where we should refactor and extract out meaningful type aliases (a bit like the recent Intersphinx refactorings).

Copy link
Member

@picnixz picnixz Jun 18, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would not use aliases, but the int always seem to me like corresponding to the index in the list, so that you could possibly have multiple times the same element but with different indices. However, I'd be happy to know if this is really the reason why we duplicate that or if we could just live with a list of strings.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess an example to exlain that could be a title like Recent Changes that could appear in multiple documents and might have different headings (or perhaps even be the document title in some cases).

alltitles: {"Recent Changes": [0, None], [1, "coolmodule-recentchanges"], [5, "anothermodule-recentchanges"], ...}

(a bit hypothetical, but it's to handle situations like that I believe. again, a good case for checking and documenting)

for docname, titlelist in sorted(self._all_titles.items()):
for title, titleid in titlelist:
alltitles.setdefault(title, []).append((fn2index[docname], titleid))
Expand Down Expand Up @@ -502,9 +509,10 @@ def _visit_nodes(node):
elif isinstance(node, nodes.Text):
word_store.words.extend(split(node.astext()))
elif isinstance(node, nodes.title):
title = node.astext()
title, is_main_title = node.astext(), len(word_store.titles) == 0
ids = node.parent['ids']
word_store.titles.append((title, ids[0] if ids else None))
title_node_id = None if is_main_title else ids[0] if ids else None
word_store.titles.append((title, title_node_id))
word_store.title_words.extend(split(title))
for child in node.children:
_visit_nodes(child)
Expand Down
2 changes: 1 addition & 1 deletion tests/js/fixtures/multiterm/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion tests/js/fixtures/partial/searchindex.js

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 0 additions & 9 deletions tests/js/searchtools.js
Original file line number Diff line number Diff line change
Expand Up @@ -70,21 +70,12 @@ describe('Basic html theme search', function() {

searchParameters = Search._parseQuery('main page');

// fixme: duplicate result due to https://github.com/sphinx-doc/sphinx/issues/11961
hits = [
[
'index',
'Main Page',
'',
null,
15,
'index.rst'
],
[
'index',
'Main Page',
'#main-page',
null,
100,
'index.rst'
]
Expand Down
40 changes: 33 additions & 7 deletions tests/test_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ def is_registered_term(index, keyword):

.. test that comments are not indexed: boson

another_title
=============

test that non-comments are indexed: fermion
'''

Expand Down Expand Up @@ -168,6 +171,10 @@ def test_IndexBuilder():
'docname2_1': 'title2_1', 'docname2_2': 'title2_2'}
assert index._filenames == {'docname1_1': 'filename1_1', 'docname1_2': 'filename1_2',
'docname2_1': 'filename2_1', 'docname2_2': 'filename2_2'}
# note: element iteration order (sort order) is important when the index
# is frozen (serialized) during build -- however, the _mapping-related
# dictionaries below may be iterated in arbitrary order by Python at
# runtime.
assert index._mapping == {
'ar': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
'fermion': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
Expand All @@ -176,7 +183,10 @@ def test_IndexBuilder():
'index': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
'test': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
}
assert index._title_mapping == {'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'}}
assert index._title_mapping == {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here, I see that the values are sets and thus can be unordered. Is it possible to actually have a more reliable guarantee on the title_mapping actually?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To check that I understand correctly: use an assertion that checks not only the contents of the title mapping, but also the ordering of the elements inside it?

(makes sense to me, order can be significant for index data)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If my understanding there was correct: I'd prefer to handle that in a separate issue thread and PR, if that's OK. Or, if I misunderstood: let's make sure to resolve that first.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there is more than one occurrence of that, yes we could discuss it in another thread.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep; I think it would be applicable for both _title_mapping and _mapping; because both of those currently use a pattern of <mapping>.setdefault(..., set()).add(...) (implying that insertion-order may not match subsequent access iteration order):

for word in word_store.title_words:
# add stemmed and unstemmed as the stemmer must not remove words
# from search index.
stemmed_word = stem(word)
if _filter(stemmed_word):
self._title_mapping.setdefault(stemmed_word, set()).add(docname)
elif _filter(word):
self._title_mapping.setdefault(word, set()).add(docname)
for word in word_store.words:
# add stemmed and unstemmed as the stemmer must not remove words
# from search index.
stemmed_word = stem(word)
if not _filter(stemmed_word) and _filter(word):
stemmed_word = word
already_indexed = docname in self._title_mapping.get(stemmed_word, ())
if _filter(stemmed_word) and not already_indexed:
self._mapping.setdefault(stemmed_word, set()).add(docname)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correction: any runtime (in-memory) differences in the ordering of objects within the _title_mapping and _mapping set entries are later resolved into deterministic order by an explicit sort that occurs during freezing (serialization) of the search index (thanks to d24bd73).

I'm not aware of any other current bugs causing nondeterminism in our searchindex.js output since v7.3.0 (95fb0e5 included in that release handled a similar situation).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So _title_mapping are unsorted in the tests, but in production they are sorted? if so, a comment is needed (at least to remember the reason).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep, pretty much. I'll be a bit verbose:

  • In-memory (and therefore in the tests), each value in _title_mapping may store elements in arbitrary ordering.
  • The tests use Python set equality comparison, and that is order-agnostic (as it should be) -- so the tests aren't really checking the order, even though it might appear from the code that they would.
  • The output of a Sphinx production build to output a searchindex.js sorts these entries.

What you suggest about a comment makes sense...

...however.. perhaps I could attempt a separate PR to refactor the code so that sorting occurs in-place and in-memory, and to adjust the unit tests to assert on that. Those would provide stronger properties than a comment in the code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No need for that. It's better that at runtime, the objects are represented in sets since they are more efficient. It's only for serialization that you need to sort to ensure reproducibility.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, agreed (storing the lists in-place for ordered iteration would probably add various types of overhead).

I've pushed a commit to add some commentary as a docstring on the relevant method and also in the unit tests.

'another_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
jayaddison marked this conversation as resolved.
Show resolved Hide resolved
'section_titl': {'docname1_1', 'docname1_2', 'docname2_1', 'docname2_2'},
}
assert index._objtypes == {}
assert index._objnames == {}

Expand All @@ -196,8 +206,14 @@ def test_IndexBuilder():
'non': [0, 1, 2, 3],
'test': [0, 1, 2, 3]},
'titles': ('title1_1', 'title1_2', 'title2_1', 'title2_2'),
'titleterms': {'section_titl': [0, 1, 2, 3]},
'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title'), (2, 'section-title'), (3, 'section-title')]},
'titleterms': {
'another_titl': [0, 1, 2, 3],
'section_titl': [0, 1, 2, 3],
},
'alltitles': {
'another_title': [(0, 'another-title'), (1, 'another-title'), (2, 'another-title'), (3, 'another-title')],
'section_title': [(0, None), (1, None), (2, None), (3, None)],
},
'indexentries': {},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
Expand Down Expand Up @@ -238,7 +254,10 @@ def test_IndexBuilder():
'index': {'docname1_2', 'docname2_2'},
'test': {'docname1_2', 'docname2_2'},
}
assert index._title_mapping == {'section_titl': {'docname1_2', 'docname2_2'}}
assert index._title_mapping == {
'another_titl': {'docname1_2', 'docname2_2'},
'section_titl': {'docname1_2', 'docname2_2'},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
assert index._objnames == {0: ('dummy1', 'objtype1', 'objtype1'), 1: ('dummy2', 'objtype1', 'objtype1')}

Expand All @@ -257,8 +276,14 @@ def test_IndexBuilder():
'non': [0, 1],
'test': [0, 1]},
'titles': ('title1_2', 'title2_2'),
'titleterms': {'section_titl': [0, 1]},
'alltitles': {'section_title': [(0, 'section-title'), (1, 'section-title')]},
'titleterms': {
'another_titl': [0, 1],
'section_titl': [0, 1],
},
'alltitles': {
'another_title': [(0, 'another-title'), (1, 'another-title')],
'section_title': [(0, None), (1, None)],
},
'indexentries': {},
}
assert index._objtypes == {('dummy1', 'objtype1'): 0, ('dummy2', 'objtype1'): 1}
Expand Down Expand Up @@ -347,7 +372,8 @@ def assert_is_sorted(item, path: str):
assert_is_sorted(value, f'{path}.{key}')
elif isinstance(item, list):
if not is_title_tuple_type(item) and path not in lists_not_to_sort:
assert item == sorted(item), f'{err_path} is not sorted'
# sort nulls last; http://stackoverflow.com/questions/19868767/
assert item == sorted(item, key=lambda x: (x is None, x)), f'{err_path} is not sorted'
for i, child in enumerate(item):
assert_is_sorted(child, f'{path}[{i}]')

Expand Down