From 47938842e7659cfd3b99e604f1e8f385635d1151 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 23 Oct 2023 13:30:49 +0300 Subject: [PATCH 001/210] refactor(linker): refactor RefResolver so the entire resolver is language specific. This greatly simplifies the input parameters and generally makes more sense. --- sefaria/helper/linker.py | 6 +- sefaria/model/linker/ref_resolver.py | 134 ++++++++---------- sefaria/model/linker/tests/linker_test.py | 15 +- .../model/linker/tests/linker_test_utils.py | 6 +- sefaria/model/text.py | 29 ++-- 5 files changed, 87 insertions(+), 103 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 998f346a05..3b454102f0 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -119,10 +119,10 @@ def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _F def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: - resolver = library.get_ref_resolver() - resolved_title = resolver.bulk_resolve_refs(request_text.lang, [None], [request_text.title]) + resolver = library.get_ref_resolver(request_text.lang) + resolved_title = resolver.bulk_resolve_refs([None], [request_text.title]) context_ref = resolved_title[0][0].ref if (len(resolved_title[0]) == 1 and not resolved_title[0][0].is_ambiguous) else None - resolved_body = resolver.bulk_resolve_refs(request_text.lang, [context_ref], [request_text.body], with_failures=True) + resolved_body = resolver.bulk_resolve_refs([context_ref], [request_text.body], with_failures=True) response = { "title": _make_find_refs_response_inner(resolved_title, options), diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index e0721e0d95..b87ba9a153 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -241,15 +241,15 @@ def get_ref_by_title(self, title: str) -> Optional[text.Ref]: class RefResolver: - def __init__(self, raw_ref_model_by_lang: Dict[str, Language], raw_ref_part_model_by_lang: Dict[str, Language], - ref_part_title_trie_by_lang: Dict[str, MatchTemplateTrie], - term_matcher_by_lang: Dict[str, TermMatcher]) -> None: + def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language, + ref_part_title_trie: MatchTemplateTrie, term_matcher: TermMatcher) -> None: from sefaria.helper.normalization import NormalizerByLang, NormalizerComposer - self._raw_ref_model_by_lang = raw_ref_model_by_lang - self._raw_ref_part_model_by_lang = raw_ref_part_model_by_lang - self._ref_part_title_trie_by_lang = ref_part_title_trie_by_lang - self._term_matcher_by_lang = term_matcher_by_lang + self._lang = lang + self._raw_ref_model = raw_ref_model + self._raw_ref_part_model = raw_ref_part_model + self._ref_part_title_trie = ref_part_title_trie + self._term_matcher = term_matcher self._ibid_history = IbidHistory() self._thoroughness = ResolutionThoroughness.NORMAL @@ -264,19 +264,19 @@ def __init__(self, raw_ref_model_by_lang: Dict[str, Language], raw_ref_part_mode def reset_ibid_history(self): self._ibid_history = IbidHistory() - def _normalize_input(self, lang: str, input: List[str]): + def _normalize_input(self, input: List[str]): """ Normalize input text to match normalization that happened at training time """ - return [self._normalizer.normalize(s, lang=lang) for s in input] + return [self._normalizer.normalize(s, lang=self._lang) for s in input] - def _map_normal_output_to_original_input(self, lang: str, input: List[str], resolved: 
List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]) -> None: + def _map_normal_output_to_original_input(self, input: List[str], resolved: List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]) -> None: """ Ref resolution ran on normalized input. Remap resolved refs to original (non-normalized) input """ for temp_input, temp_resolved in zip(input, resolved): - unnorm_doc = self.get_raw_ref_model(lang).make_doc(temp_input) - mapping = self._normalizer.get_mapping_after_normalization(temp_input, lang=lang) + unnorm_doc = self.get_raw_ref_model().make_doc(temp_input) + mapping = self._normalizer.get_mapping_after_normalization(temp_input, lang=self._lang) conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices # this function name is waaay too long norm_inds = [rr.raw_ref.char_indices for rr in temp_resolved] unnorm_inds = conv(norm_inds, mapping) @@ -286,10 +286,9 @@ def _map_normal_output_to_original_input(self, lang: str, input: List[str], reso for resolved_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(temp_resolved, unnorm_inds, unnorm_part_inds): resolved_ref.raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) - def bulk_resolve_refs(self, lang: str, book_context_refs: List[Optional[text.Ref]], input: List[str], with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: + def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: List[str], with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: """ Main function for resolving refs in text. Given a list of texts, returns ResolvedRefs for each - @param lang: @param book_context_refs: @param input: @param with_failures: @@ -300,8 +299,8 @@ def bulk_resolve_refs(self, lang: str, book_context_refs: List[Optional[text.Ref """ self._thoroughness = thoroughness self.reset_ibid_history() - normalized_input = self._normalize_input(lang, input) - all_raw_refs = self._bulk_get_raw_refs(lang, normalized_input) + normalized_input = self._normalize_input(input) + all_raw_refs = self._bulk_get_raw_refs(normalized_input) resolved = [] iter = zip(book_context_refs, all_raw_refs) if verbose: @@ -311,7 +310,7 @@ def bulk_resolve_refs(self, lang: str, book_context_refs: List[Optional[text.Ref self.reset_ibid_history() inner_resolved = [] for raw_ref in raw_refs: - temp_resolved = self.resolve_raw_ref(lang, book_context_ref, raw_ref) + temp_resolved = self.resolve_raw_ref(book_context_ref, raw_ref) if len(temp_resolved) == 0: self.reset_ibid_history() if with_failures: @@ -324,13 +323,13 @@ def bulk_resolve_refs(self, lang: str, book_context_refs: List[Optional[text.Ref self._ibid_history.last_refs = temp_resolved[-1].ref inner_resolved += temp_resolved resolved += [inner_resolved] - self._map_normal_output_to_original_input(lang, input, resolved) + self._map_normal_output_to_original_input(input, resolved) return resolved - def _bulk_get_raw_refs(self, lang: str, input: List[str]) -> List[List[RawRef]]: - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(lang, input)) + def _bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: + all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(input)) ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) - all_raw_ref_part_spans = 
list(self._bulk_get_raw_ref_part_spans(lang, ref_part_input, as_tuples=True)) + all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) all_raw_ref_part_span_map = defaultdict(list) for ref_part_span, input_idx in all_raw_ref_part_spans: all_raw_ref_part_span_map[input_idx] += [ref_part_span] @@ -347,7 +346,7 @@ def _bulk_get_raw_refs(self, lang: str, input: List[str]) -> List[List[RawRef]]: if part_type == RefPartType.DH: dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - raw_refs += [RawRef(lang, raw_ref_parts, span)] + raw_refs += [RawRef(self._lang, raw_ref_parts, span)] all_raw_refs += [raw_refs] return all_raw_refs @@ -368,50 +367,43 @@ def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken return dh_cont - def __get_attr_by_lang(self, lang: str, by_lang_attr: dict, error_msg: str): - try: - return by_lang_attr[lang] - except KeyError as e: - raise KeyError(f"{error_msg} for lang `{lang}`") - - def get_raw_ref_model(self, lang: str) -> Language: - return self.__get_attr_by_lang(lang, self._raw_ref_model_by_lang, 'No Raw Ref Model') + def get_raw_ref_model(self) -> Language: + return self._raw_ref_model - def get_raw_ref_part_model(self, lang: str) -> Language: - return self.__get_attr_by_lang(lang, self._raw_ref_part_model_by_lang, 'No Raw Ref Model') + def get_raw_ref_part_model(self) -> Language: + return self._raw_ref_part_model - def get_ref_part_title_trie(self, lang: str) -> MatchTemplateTrie: - return self.__get_attr_by_lang(lang, self._ref_part_title_trie_by_lang, 'No Raw Ref Part Title Trie') + def get_ref_part_title_trie(self) -> MatchTemplateTrie: + return self._ref_part_title_trie - def get_term_matcher(self, lang: str) -> TermMatcher: - return self.__get_attr_by_lang(lang, self._term_matcher_by_lang, 'No Term Matcher') + def get_term_matcher(self) -> TermMatcher: + return self._term_matcher - def _get_raw_ref_spans_in_string(self, lang: str, st: str) -> List[Span]: - doc = self.get_raw_ref_model(lang)(st) + def _get_raw_ref_spans_in_string(self, st: str) -> List[Span]: + doc = self.get_raw_ref_model()(st) return doc.ents - def _bulk_get_raw_ref_spans(self, lang: str, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_model(lang).pipe(input, batch_size=batch_size, **kwargs): + def _bulk_get_raw_ref_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: + for doc in self.get_raw_ref_model().pipe(input, batch_size=batch_size, **kwargs): if kwargs.get('as_tuples', False): doc, context = doc yield doc.ents, context else: yield doc.ents - def _get_raw_ref_part_spans_in_string(self, lang: str, st: str) -> List[Span]: - doc = self.get_raw_ref_part_model(lang)(st) + def _get_raw_ref_part_spans_in_string(self, st: str) -> List[Span]: + doc = self.get_raw_ref_part_model()(st) return doc.ents - def _bulk_get_raw_ref_part_spans(self, lang: str, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_part_model(lang).pipe(input, batch_size=batch_size or len(input), **kwargs): + def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: + for doc in self.get_raw_ref_part_model().pipe(input, batch_size=batch_size or len(input), **kwargs): if kwargs.get('as_tuples', False): doc, 
context = doc yield doc.ents, context else: yield doc.ents - @staticmethod - def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: + def split_non_cts_parts(self, raw_ref: RawRef) -> List[RawRef]: if not any(part.type == RefPartType.NON_CTS for part in raw_ref.raw_ref_parts): return [raw_ref] split_raw_refs = [] curr_parts = [] @@ -426,7 +418,7 @@ def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: try: raw_ref_span = raw_ref.subspan(slice(curr_part_start, curr_part_end)) curr_parts = [p.realign_to_new_raw_ref(raw_ref.span, raw_ref_span) for p in curr_parts] - split_raw_refs += [RawRef(lang, curr_parts, raw_ref_span)] + split_raw_refs += [RawRef(self._lang, curr_parts, raw_ref_span)] except AssertionError: pass curr_parts = [] @@ -436,8 +428,8 @@ def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: def set_thoroughness(self, thoroughness: ResolutionThoroughness) -> None: self._thoroughness = thoroughness - def resolve_raw_ref(self, lang: str, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[Union[ResolvedRef, AmbiguousResolvedRef]]: - split_raw_refs = self.split_non_cts_parts(lang, raw_ref) + def resolve_raw_ref(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[Union[ResolvedRef, AmbiguousResolvedRef]]: + split_raw_refs = self.split_non_cts_parts(raw_ref) resolved_list = [] for i, temp_raw_ref in enumerate(split_raw_refs): is_non_cts = i > 0 and len(resolved_list) > 0 @@ -446,8 +438,8 @@ def resolve_raw_ref(self, lang: str, book_context_ref: Optional[text.Ref], raw_r book_context_ref = resolved_list[0].ref context_swap_map = None if book_context_ref is None else getattr(book_context_ref.index.nodes, 'ref_resolver_context_swaps', None) - self._apply_context_swaps(lang, raw_ref, context_swap_map) - unrefined_matches = self.get_unrefined_ref_part_matches(lang, book_context_ref, temp_raw_ref) + self._apply_context_swaps(raw_ref, context_swap_map) + unrefined_matches = self.get_unrefined_ref_part_matches(book_context_ref, temp_raw_ref) if is_non_cts: # filter unrefined matches to matches that resolved previously resolved_titles = {r.ref.index.title for r in resolved_list} @@ -458,7 +450,7 @@ def resolve_raw_ref(self, lang: str, book_context_ref: Optional[text.Ref], raw_r match.ref = match.ref.subref(book_context_ref.sections[:-len(temp_raw_ref.raw_ref_parts)]) except (InputError, AttributeError): continue - temp_resolved_list = self.refine_ref_part_matches(lang, book_context_ref, unrefined_matches) + temp_resolved_list = self.refine_ref_part_matches(book_context_ref, unrefined_matches) if len(temp_resolved_list) > 1: resolved_list += [AmbiguousResolvedRef(temp_resolved_list)] else: @@ -477,25 +469,25 @@ def resolve_raw_ref_using_ref_instantiation(raw_ref: RawRef) -> List[ResolvedRef except: return [] - def get_unrefined_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[ + def get_unrefined_ref_part_matches(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[ 'ResolvedRef']: - context_free_matches = self._get_unrefined_ref_part_matches_recursive(lang, raw_ref, ref_parts=raw_ref.parts_to_match) + context_free_matches = self._get_unrefined_ref_part_matches_recursive(raw_ref, ref_parts=raw_ref.parts_to_match) contexts = [(book_context_ref, ContextType.CURRENT_BOOK)] + [(ibid_ref, ContextType.IBID) for ibid_ref in self._ibid_history.last_refs] matches = context_free_matches if len(matches) == 0: context_full_matches = [] for context_ref, context_type in contexts: - 
context_full_matches += self._get_unrefined_ref_part_matches_for_title_context(lang, context_ref, raw_ref, context_type) + context_full_matches += self._get_unrefined_ref_part_matches_for_title_context(context_ref, raw_ref, context_type) matches = context_full_matches + context_free_matches return matches - def _get_unrefined_ref_part_matches_for_title_context(self, lang: str, context_ref: Optional[text.Ref], raw_ref: RawRef, context_type: ContextType) -> List[ResolvedRef]: + def _get_unrefined_ref_part_matches_for_title_context(self, context_ref: Optional[text.Ref], raw_ref: RawRef, context_type: ContextType) -> List[ResolvedRef]: matches = [] if context_ref is None: return matches term_contexts = self._get_term_contexts(context_ref.index.nodes) if len(term_contexts) == 0: return matches temp_ref_parts = raw_ref.parts_to_match + term_contexts - temp_matches = self._get_unrefined_ref_part_matches_recursive(lang, raw_ref, ref_parts=temp_ref_parts) + temp_matches = self._get_unrefined_ref_part_matches_recursive(raw_ref, ref_parts=temp_ref_parts) for match in temp_matches: if match.num_resolved(include={TermContext}) == 0: continue match.context_ref = context_ref @@ -504,7 +496,7 @@ def _get_unrefined_ref_part_matches_for_title_context(self, lang: str, context_r matches += [match] return matches - def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dict[str, str]=None): + def _apply_context_swaps(self, raw_ref: RawRef, context_swap_map: Dict[str, str]=None): """ Use `context_swap_map` to swap matching element of `ref_parts` Allows us to redefine how a ref part is interpreted depending on the context @@ -513,7 +505,7 @@ def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dic Modifies `raw_ref` with updated ref_parts """ swapped_ref_parts = [] - term_matcher = self.get_term_matcher(lang) + term_matcher = self.get_term_matcher() if context_swap_map is None: return for part in raw_ref.raw_ref_parts: # TODO assumes only one match in term_matches @@ -527,8 +519,8 @@ def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dic if not found_match: swapped_ref_parts += [part] raw_ref.parts_to_match = swapped_ref_parts - def _get_unrefined_ref_part_matches_recursive(self, lang: str, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]: - title_trie = title_trie or self.get_ref_part_title_trie(lang) + def _get_unrefined_ref_part_matches_recursive(self, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]: + title_trie = title_trie or self.get_ref_part_title_trie() prev_ref_parts = prev_ref_parts or [] matches = [] for part in ref_parts: @@ -555,16 +547,16 @@ def _get_unrefined_ref_part_matches_recursive(self, lang: str, raw_ref: RawRef, continue matches += [ResolvedRef(temp_raw_ref, temp_prev_ref_parts, node, ref, _thoroughness=self._thoroughness)] temp_ref_parts = [temp_part for temp_part in ref_parts if temp_part != part] - matches += self._get_unrefined_ref_part_matches_recursive(lang, temp_raw_ref, temp_title_trie, ref_parts=temp_ref_parts, prev_ref_parts=temp_prev_ref_parts) + matches += self._get_unrefined_ref_part_matches_recursive(temp_raw_ref, temp_title_trie, ref_parts=temp_ref_parts, prev_ref_parts=temp_prev_ref_parts) return ResolvedRefPruner.prune_unrefined_ref_part_matches(matches) - def refine_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref], 
matches: List[ResolvedRef]) -> List[ResolvedRef]: + def refine_ref_part_matches(self, book_context_ref: Optional[text.Ref], matches: List[ResolvedRef]) -> List[ResolvedRef]: temp_matches = [] refs_matched = {match.ref.normal() for match in matches} for unrefined_match in matches: unused_parts = list(set(unrefined_match.raw_ref.parts_to_match) - set(unrefined_match.resolved_parts)) - context_free_matches = self._get_refined_ref_part_matches_recursive(lang, unrefined_match, unused_parts) + context_free_matches = self._get_refined_ref_part_matches_recursive(unrefined_match, unused_parts) # context # if unrefined_match already used context, make sure it continues to use it @@ -573,7 +565,7 @@ def refine_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref context_type_list = [ContextType.CURRENT_BOOK, ContextType.IBID] if unrefined_match.context_ref is None else [unrefined_match.context_type] context_full_matches = [] for context_ref, context_type in zip(context_ref_list, context_type_list): - context_full_matches += self._get_refined_ref_part_matches_for_section_context(lang, context_ref, context_type, unrefined_match, unused_parts) + context_full_matches += self._get_refined_ref_part_matches_for_section_context(context_ref, context_type, unrefined_match, unused_parts) # combine if len(context_full_matches) > 0: @@ -641,8 +633,7 @@ def _get_term_contexts(node: schema.SchemaNode) -> List[TermContext]: longest_template = min(match_templates, key=lambda x: len(list(x.terms))) return [TermContext(term) for term in longest_template.terms] - @staticmethod - def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Optional[text.Ref], context_type: ContextType, ref_part_match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: + def _get_refined_ref_part_matches_for_section_context(self, context_ref: Optional[text.Ref], context_type: ContextType, ref_part_match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: """ Tries to infer sections from context ref and uses them to refine `ref_part_match` """ @@ -655,7 +646,7 @@ def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Op sec_contexts = RefResolver._get_section_contexts(context_ref, ref_part_match.ref.index, common_index) term_contexts = RefResolver._get_all_term_contexts(context_ref.index_node, include_root=False) context_to_consider = sec_contexts + term_contexts - temp_matches = RefResolver._get_refined_ref_part_matches_recursive(lang, ref_part_match, ref_parts + context_to_consider) + temp_matches = self._get_refined_ref_part_matches_recursive(ref_part_match, ref_parts + context_to_consider) # remove matches which don't use context temp_matches = list(filter(lambda x: len(set(x.get_resolved_parts(include={ContextPart})) & set(context_to_consider)) > 0, temp_matches)) @@ -667,17 +658,16 @@ def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Op matches += temp_matches return matches - @staticmethod - def _get_refined_ref_part_matches_recursive(lang: str, match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: + def _get_refined_ref_part_matches_recursive(self, match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: fully_refined = [] children = match.get_node_children() for part in ref_parts: for child in children: resolved_ref_refiner = resolved_ref_refiner_factory.create(part, child, match) - temp_matches = resolved_ref_refiner.refine(lang) + temp_matches = resolved_ref_refiner.refine(self._lang) for 
temp_match in temp_matches: temp_ref_parts = list(set(ref_parts) - set(temp_match.resolved_parts)) - fully_refined += RefResolver._get_refined_ref_part_matches_recursive(lang, temp_match, temp_ref_parts) + fully_refined += self._get_refined_ref_part_matches_recursive(temp_match, temp_ref_parts) if len(fully_refined) == 0: # original match is better than no matches return [match] diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 75397f1280..60977fa5da 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -8,8 +8,6 @@ if not ENABLE_LINKER: pytest.skip("Linker not enabled", allow_module_level=True) -ref_resolver = library.get_ref_resolver() - def test_referenceable_child(): i = library.get_index("Rashi on Berakhot") @@ -233,8 +231,9 @@ def test_resolved_raw_ref_clone(): [crrd(['@Rashi on Genesis', '#1', '#1', '#1'], lang='en'), ["Rashi on Genesis 1:1:1"]], ]) def test_resolve_raw_ref(resolver_data, expected_trefs): - ref_resolver.reset_ibid_history() # reset from previous test runs raw_ref, context_ref, lang, prev_trefs = resolver_data + ref_resolver = library.get_ref_resolver(lang) + ref_resolver.reset_ibid_history() # reset from previous test runs if prev_trefs: for prev_tref in prev_trefs: if prev_tref is None: @@ -243,7 +242,7 @@ def test_resolve_raw_ref(resolver_data, expected_trefs): ref_resolver._ibid_history.last_refs = Ref(prev_tref) print_spans(raw_ref) ref_resolver.set_thoroughness(ResolutionThoroughness.HIGH) - matches = ref_resolver.resolve_raw_ref(lang, context_ref, raw_ref) + matches = ref_resolver.resolve_raw_ref(context_ref, raw_ref) matched_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in matches], []), key=lambda x: x.normal()) if len(expected_trefs) != len(matched_orefs): print(f"Found {len(matched_orefs)} refs instead of {len(expected_trefs)}") @@ -265,7 +264,8 @@ class TestResolveRawRef: ]) def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_trefs, expected_pretty_texts): context_oref = context_tref and Ref(context_tref) - resolved = ref_resolver.bulk_resolve_refs(lang, [context_oref], [input_str])[0] + ref_resolver = library.get_ref_resolver(lang) + resolved = ref_resolver.bulk_resolve_refs([context_oref], [input_str])[0] assert len(resolved) == len(expected_trefs) resolved_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in resolved], []), key=lambda x: x.normal()) if len(expected_trefs) != len(resolved_orefs): @@ -384,7 +384,8 @@ def test_map_new_indices(crrd_params): # unnorm data raw_ref, _, lang, _ = crrd(*crrd_params) text = raw_ref.text - doc = ref_resolver.get_raw_ref_model(lang).make_doc(text) + ref_resolver = library.get_ref_resolver(lang) + doc = ref_resolver.get_raw_ref_model().make_doc(text) indices = raw_ref.char_indices part_indices = [p.char_indices for p in raw_ref.raw_ref_parts] print_spans(raw_ref) @@ -392,7 +393,7 @@ def test_map_new_indices(crrd_params): # norm data n = ref_resolver._normalizer norm_text = n.normalize(text, lang=lang) - norm_doc = ref_resolver.get_raw_ref_model(lang).make_doc(norm_text) + norm_doc = ref_resolver.get_raw_ref_model().make_doc(norm_text) mapping = n.get_mapping_after_normalization(text, reverse=True, lang=lang) norm_part_indices = 
n.convert_normalized_indices_to_unnormalized_indices(part_indices, mapping, reverse=True) norm_part_spans = [norm_doc.char_span(s, e) for (s, e) in norm_part_indices] diff --git a/sefaria/model/linker/tests/linker_test_utils.py b/sefaria/model/linker/tests/linker_test_utils.py index 0651510b87..4b461cd8bb 100644 --- a/sefaria/model/linker/tests/linker_test_utils.py +++ b/sefaria/model/linker/tests/linker_test_utils.py @@ -8,8 +8,6 @@ if not ENABLE_LINKER: pytest.skip("Linker not enabled", allow_module_level=True) -ref_resolver = library.get_ref_resolver() - class RefPartTypeNone: """ @@ -58,7 +56,7 @@ def get_symbol_by_part_type(part_type): @staticmethod def convert_to_raw_encoded_part_list(lang, text, span_inds, part_types): - nlp = ref_resolver.get_raw_ref_part_model(lang) + nlp = library.get_ref_resolver(lang).get_raw_ref_part_model() doc = nlp.make_doc(text) span = doc[0:] raw_encoded_part_list = [] @@ -104,7 +102,7 @@ def part_types(self): @property def span(self): if not self._span: - nlp = ref_resolver.get_raw_ref_part_model(self.lang) + nlp = library.get_ref_resolver(self.lang).get_raw_ref_part_model() doc = nlp.make_doc(self.input_str) self._span = doc[0:] return self._span diff --git a/sefaria/model/text.py b/sefaria/model/text.py index d8484b59b7..2a2c00f558 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -4865,7 +4865,7 @@ def __init__(self): self._simple_term_mapping = {} self._full_term_mapping = {} self._simple_term_mapping_json = None - self._ref_resolver = None + self._ref_resolver = {} # Topics self._topic_mapping = {} @@ -5600,13 +5600,13 @@ def _build_topic_mapping(self): self._topic_mapping = {t.slug: {"en": t.get_primary_title("en"), "he": t.get_primary_title("he")} for t in TopicSet()} return self._topic_mapping - def get_ref_resolver(self, rebuild=False): - resolver = self._ref_resolver + def get_ref_resolver(self, lang: str, rebuild=False): + resolver = self._ref_resolver.get(lang) if not resolver or rebuild: - resolver = self.build_ref_resolver() + resolver = self.build_ref_resolver(lang) return resolver - def build_ref_resolver(self): + def build_ref_resolver(self, lang: str): from .linker.match_template import MatchTemplateTrie from .linker.ref_resolver import RefResolver, TermMatcher from sefaria.model.schema import NonUniqueTermSet @@ -5617,19 +5617,14 @@ def build_ref_resolver(self): root_nodes = list(filter(lambda n: getattr(n, 'match_templates', None) is not None, self.get_index_forest())) alone_nodes = reduce(lambda a, b: a + b.index.get_referenceable_alone_nodes(), root_nodes, []) non_unique_terms = NonUniqueTermSet() - self._ref_resolver = RefResolver( - {k: load_spacy_model(v) for k, v in RAW_REF_MODEL_BY_LANG_FILEPATH.items() if v is not None}, - {k: load_spacy_model(v) for k, v in RAW_REF_PART_MODEL_BY_LANG_FILEPATH.items() if v is not None}, - { - "en": MatchTemplateTrie('en', nodes=(root_nodes + alone_nodes), scope='alone'), - "he": MatchTemplateTrie('he', nodes=(root_nodes + alone_nodes), scope='alone') - }, - { - "en": TermMatcher('en', non_unique_terms), - "he": TermMatcher('he', non_unique_terms), - } + self._ref_resolver[lang] = RefResolver( + lang, + load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), + load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]), + MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), + TermMatcher(lang, non_unique_terms), ) - return self._ref_resolver + return self._ref_resolver[lang] def get_index_forest(self): """ From a86688f731a678e10200b22a0edd01db59930024 Mon 
Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 23 Oct 2023 15:19:10 +0300 Subject: [PATCH 002/210] refactor(linker): move model and related functions to NamedEntityRecognizer to help abstract this functionality and make it more reusable. --- sefaria/model/linker/named_entity_resolver.py | 130 ++++++++++++++++++ sefaria/model/linker/ref_resolver.py | 118 ++-------------- sefaria/model/linker/tests/linker_test.py | 11 +- .../model/linker/tests/linker_test_utils.py | 4 +- sefaria/model/text.py | 11 +- 5 files changed, 155 insertions(+), 119 deletions(-) create mode 100644 sefaria/model/linker/named_entity_resolver.py diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py new file mode 100644 index 0000000000..0a4094ad53 --- /dev/null +++ b/sefaria/model/linker/named_entity_resolver.py @@ -0,0 +1,130 @@ +from typing import List, Generator, Optional +from functools import reduce +from collections import defaultdict +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType +from sefaria.helper.normalization import NormalizerComposer + +try: + import spacy + from spacy.tokens import Span + from spacy.language import Language +except ImportError: + spacy = Doc = Span = Token = Language = None + + +class NamedEntityRecognizer: + + def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language): + self._lang = lang + self._raw_ref_model = raw_ref_model + self._raw_ref_part_model = raw_ref_part_model + self._normalizer = self.__init_normalizer() + + def __init_normalizer(self) -> NormalizerComposer: + # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization + # important that normalization is equivalent to normalization done at training time + normalizer_steps = ['unidecode', 'html', 'double-space'] + if self._lang == 'he': + normalizer_steps += ['maqaf', 'cantillation'] + return NormalizerComposer(normalizer_steps) + + def _normalize_input(self, input: List[str]): + """ + Normalize input text to match normalization that happened at training time + """ + return [self._normalizer.normalize(s) for s in input] + + @property + def raw_ref_model(self): + return self._raw_ref_model + + @property + def raw_ref_part_model(self): + return self._raw_ref_part_model + + def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): + for temp_input, raw_ref_list in zip(input, raw_ref_list_list): + self.map_normal_output_to_original_input(temp_input, raw_ref_list) + + def map_normal_output_to_original_input(self, input: str, raw_ref_list: List[RawRef]) -> None: + """ + Ref resolution ran on normalized input. 
Remap raw refs to original (non-normalized) input + """ + unnorm_doc = self._raw_ref_model.make_doc(input) + mapping = self._normalizer.get_mapping_after_normalization(input) + # this function name is waaay too long + conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices + norm_inds = [raw_ref.char_indices for raw_ref in raw_ref_list] + unnorm_inds = conv(norm_inds, mapping) + unnorm_part_inds = [] + for (raw_ref, (norm_raw_ref_start, _)) in zip(raw_ref_list, norm_inds): + unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] + for part in raw_ref.raw_ref_parts], mapping)] + for raw_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(raw_ref_list, unnorm_inds, unnorm_part_inds): + raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) + + def get_raw_ref_spans_in_string(self, st: str) -> List[Span]: + doc = self._raw_ref_model(st) + return doc.ents + + def _get_raw_ref_part_spans_in_string(self, st: str) -> List[Span]: + doc = self._raw_ref_part_model(st) + return doc.ents + + def _bulk_get_raw_ref_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: + for doc in self._raw_ref_model.pipe(input, batch_size=batch_size, **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: + for doc in self._raw_ref_part_model.pipe(input, batch_size=batch_size or len(input), **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: + normalized_input = self._normalize_input(input) + all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_input)) + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) + all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) + all_raw_ref_part_span_map = defaultdict(list) + for ref_part_span, input_idx in all_raw_ref_part_spans: + all_raw_ref_part_span_map[input_idx] += [ref_part_span] + + all_raw_refs = [] + for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): + raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] + raw_refs = [] + for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): + raw_ref_parts = [] + for ipart, part_span in enumerate(part_span_list): + part_type = RefPartType.span_label_to_enum(part_span.label_) + dh_continuation = None + if part_type == RefPartType.DH: + dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) + raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] + raw_refs += [RawRef(self._lang, raw_ref_parts, span)] + all_raw_refs += [raw_refs] + return all_raw_refs + + @staticmethod + def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: + if ipart == len(part_span_list) - 1: + curr_doc = span.doc + _, span_end = span_inds(span) + if ispan == len(raw_ref_spans) - 1: + dh_cont = curr_doc[span_end:] + else: + next_span_start, _ = span_inds(raw_ref_spans[ispan + 1]) + dh_cont = curr_doc[span_end:next_span_start] + else: + _, part_span_end = span_inds(part_span) + 
next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) + dh_cont = part_span.doc[part_span_end:next_part_span_start] + + return dh_cont diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index b87ba9a153..5ca8b89533 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -1,12 +1,12 @@ from collections import defaultdict -from typing import List, Union, Dict, Optional, Tuple, Generator, Iterable, Set +from typing import List, Union, Dict, Optional, Tuple, Iterable, Set from enum import IntEnum, Enum -from functools import reduce from tqdm import tqdm from sefaria.system.exceptions import InputError from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema +from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY @@ -241,51 +241,19 @@ def get_ref_by_title(self, title: str) -> Optional[text.Ref]: class RefResolver: - def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language, + def __init__(self, lang: str, named_entity_recognizer: NamedEntityRecognizer, ref_part_title_trie: MatchTemplateTrie, term_matcher: TermMatcher) -> None: - from sefaria.helper.normalization import NormalizerByLang, NormalizerComposer self._lang = lang - self._raw_ref_model = raw_ref_model - self._raw_ref_part_model = raw_ref_part_model + self._named_entity_recognizer = named_entity_recognizer self._ref_part_title_trie = ref_part_title_trie self._term_matcher = term_matcher self._ibid_history = IbidHistory() self._thoroughness = ResolutionThoroughness.NORMAL - # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization - # important that normalization is equivalent to normalization done at training time - base_normalizer_steps = ['unidecode', 'html', 'double-space'] - self._normalizer = NormalizerByLang({ - 'en': NormalizerComposer(base_normalizer_steps), - 'he': NormalizerComposer(base_normalizer_steps + ['maqaf', 'cantillation']), - }) - def reset_ibid_history(self): self._ibid_history = IbidHistory() - def _normalize_input(self, input: List[str]): - """ - Normalize input text to match normalization that happened at training time - """ - return [self._normalizer.normalize(s, lang=self._lang) for s in input] - - def _map_normal_output_to_original_input(self, input: List[str], resolved: List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]) -> None: - """ - Ref resolution ran on normalized input. 
Remap resolved refs to original (non-normalized) input - """ - for temp_input, temp_resolved in zip(input, resolved): - unnorm_doc = self.get_raw_ref_model().make_doc(temp_input) - mapping = self._normalizer.get_mapping_after_normalization(temp_input, lang=self._lang) - conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices # this function name is waaay too long - norm_inds = [rr.raw_ref.char_indices for rr in temp_resolved] - unnorm_inds = conv(norm_inds, mapping) - unnorm_part_inds = [] - for (rr, (norm_raw_ref_start, _)) in zip(temp_resolved, norm_inds): - unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] for part in rr.raw_ref.raw_ref_parts], mapping)] - for resolved_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(temp_resolved, unnorm_inds, unnorm_part_inds): - resolved_ref.raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) - def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: List[str], with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: """ Main function for resolving refs in text. Given a list of texts, returns ResolvedRefs for each @@ -299,8 +267,7 @@ def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: """ self._thoroughness = thoroughness self.reset_ibid_history() - normalized_input = self._normalize_input(input) - all_raw_refs = self._bulk_get_raw_refs(normalized_input) + all_raw_refs = self._named_entity_recognizer.bulk_get_raw_refs(input) resolved = [] iter = zip(book_context_refs, all_raw_refs) if verbose: @@ -323,55 +290,12 @@ def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: self._ibid_history.last_refs = temp_resolved[-1].ref inner_resolved += temp_resolved resolved += [inner_resolved] - self._map_normal_output_to_original_input(input, resolved) + raw_ref_list_list = [[rr.raw_ref for rr in inner_resolved] for inner_resolved in resolved] + self._named_entity_recognizer.bulk_map_normal_output_to_original_input(input, raw_ref_list_list) return resolved - def _bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(input)) - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) - all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) - all_raw_ref_part_span_map = defaultdict(list) - for ref_part_span, input_idx in all_raw_ref_part_spans: - all_raw_ref_part_span_map[input_idx] += [ref_part_span] - - all_raw_refs = [] - for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): - raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] - raw_refs = [] - for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): - raw_ref_parts = [] - for ipart, part_span in enumerate(part_span_list): - part_type = RefPartType.span_label_to_enum(part_span.label_) - dh_continuation = None - if part_type == RefPartType.DH: - dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) - raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - raw_refs += [RawRef(self._lang, raw_ref_parts, span)] - all_raw_refs += [raw_refs] - return all_raw_refs - - @staticmethod - def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken], part_span_list: 
List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: - if ipart == len(part_span_list) - 1: - curr_doc = span.doc - _, span_end = span_inds(span) - if ispan == len(raw_ref_spans) - 1: - dh_cont = curr_doc[span_end:] - else: - next_span_start, _ = span_inds(raw_ref_spans[ispan + 1]) - dh_cont = curr_doc[span_end:next_span_start] - else: - _, part_span_end = span_inds(part_span) - next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) - dh_cont = part_span.doc[part_span_end:next_part_span_start] - - return dh_cont - - def get_raw_ref_model(self) -> Language: - return self._raw_ref_model - - def get_raw_ref_part_model(self) -> Language: - return self._raw_ref_part_model + def get_ner(self) -> NamedEntityRecognizer: + return self._named_entity_recognizer def get_ref_part_title_trie(self) -> MatchTemplateTrie: return self._ref_part_title_trie @@ -379,30 +303,6 @@ def get_ref_part_title_trie(self) -> MatchTemplateTrie: def get_term_matcher(self) -> TermMatcher: return self._term_matcher - def _get_raw_ref_spans_in_string(self, st: str) -> List[Span]: - doc = self.get_raw_ref_model()(st) - return doc.ents - - def _bulk_get_raw_ref_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_model().pipe(input, batch_size=batch_size, **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents - - def _get_raw_ref_part_spans_in_string(self, st: str) -> List[Span]: - doc = self.get_raw_ref_part_model()(st) - return doc.ents - - def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_part_model().pipe(input, batch_size=batch_size or len(input), **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents - def split_non_cts_parts(self, raw_ref: RawRef) -> List[RawRef]: if not any(part.type == RefPartType.NON_CTS for part in raw_ref.raw_ref_parts): return [raw_ref] split_raw_refs = [] diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 60977fa5da..354d002893 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -385,16 +385,17 @@ def test_map_new_indices(crrd_params): raw_ref, _, lang, _ = crrd(*crrd_params) text = raw_ref.text ref_resolver = library.get_ref_resolver(lang) - doc = ref_resolver.get_raw_ref_model().make_doc(text) + nlp = ref_resolver.get_ner().raw_ref_model + doc = nlp.make_doc(text) indices = raw_ref.char_indices part_indices = [p.char_indices for p in raw_ref.raw_ref_parts] print_spans(raw_ref) # norm data - n = ref_resolver._normalizer - norm_text = n.normalize(text, lang=lang) - norm_doc = ref_resolver.get_raw_ref_model().make_doc(norm_text) - mapping = n.get_mapping_after_normalization(text, reverse=True, lang=lang) + n = ref_resolver.get_ner()._normalizer + norm_text = n.normalize(text) + norm_doc = nlp.make_doc(norm_text) + mapping = n.get_mapping_after_normalization(text, reverse=True) norm_part_indices = n.convert_normalized_indices_to_unnormalized_indices(part_indices, mapping, reverse=True) norm_part_spans = [norm_doc.char_span(s, e) for (s, e) in norm_part_indices] norm_part_token_inds = [] diff --git a/sefaria/model/linker/tests/linker_test_utils.py b/sefaria/model/linker/tests/linker_test_utils.py index 4b461cd8bb..5d18486e07 100644 --- 
a/sefaria/model/linker/tests/linker_test_utils.py +++ b/sefaria/model/linker/tests/linker_test_utils.py @@ -56,7 +56,7 @@ def get_symbol_by_part_type(part_type): @staticmethod def convert_to_raw_encoded_part_list(lang, text, span_inds, part_types): - nlp = library.get_ref_resolver(lang).get_raw_ref_part_model() + nlp = library.get_ref_resolver(lang).get_ner().raw_ref_part_model doc = nlp.make_doc(text) span = doc[0:] raw_encoded_part_list = [] @@ -102,7 +102,7 @@ def part_types(self): @property def span(self): if not self._span: - nlp = library.get_ref_resolver(self.lang).get_raw_ref_part_model() + nlp = library.get_ref_resolver(self.lang).get_ner().raw_ref_part_model doc = nlp.make_doc(self.input_str) self._span = doc[0:] return self._span diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 2a2c00f558..01ba48daab 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5609,6 +5609,7 @@ def get_ref_resolver(self, lang: str, rebuild=False): def build_ref_resolver(self, lang: str): from .linker.match_template import MatchTemplateTrie from .linker.ref_resolver import RefResolver, TermMatcher + from .linker.named_entity_resolver import NamedEntityRecognizer from sefaria.model.schema import NonUniqueTermSet from sefaria.helper.linker import load_spacy_model @@ -5617,10 +5618,14 @@ def build_ref_resolver(self, lang: str): root_nodes = list(filter(lambda n: getattr(n, 'match_templates', None) is not None, self.get_index_forest())) alone_nodes = reduce(lambda a, b: a + b.index.get_referenceable_alone_nodes(), root_nodes, []) non_unique_terms = NonUniqueTermSet() + ner = NamedEntityRecognizer( + lang, + load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), + load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]) + ) + self._ref_resolver[lang] = RefResolver( - lang, - load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), - load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]), + lang, ner, MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), TermMatcher(lang, non_unique_terms), ) From 575f28362cff0f6bf6910d02bc21eda35ca7071d Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 09:16:14 +0300 Subject: [PATCH 003/210] refactor(linker): break up functions a bit more so they're more readable. 
--- sefaria/model/linker/named_entity_resolver.py | 41 +++++++++++++------ 1 file changed, 29 insertions(+), 12 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 0a4094ad53..f0616439be 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -63,7 +63,7 @@ def map_normal_output_to_original_input(self, input: str, raw_ref_list: List[Raw for raw_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(raw_ref_list, unnorm_inds, unnorm_part_inds): raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) - def get_raw_ref_spans_in_string(self, st: str) -> List[Span]: + def _get_raw_ref_spans_in_string(self, st: str) -> List[Span]: doc = self._raw_ref_model(st) return doc.ents @@ -99,19 +99,36 @@ def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: all_raw_refs = [] for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] - raw_refs = [] - for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): - raw_ref_parts = [] - for ipart, part_span in enumerate(part_span_list): - part_type = RefPartType.span_label_to_enum(part_span.label_) - dh_continuation = None - if part_type == RefPartType.DH: - dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) - raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - raw_refs += [RawRef(self._lang, raw_ref_parts, span)] - all_raw_refs += [raw_refs] + all_raw_refs += [self._bulk_make_raw_refs(raw_ref_spans, raw_ref_part_spans)] return all_raw_refs + def _bulk_make_raw_refs(self, raw_ref_spans: List[SpanOrToken], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: + raw_refs = [] + dh_continuations = self._bulk_make_dh_continuations(raw_ref_spans, raw_ref_part_spans) + for span, part_span_list, temp_dh_continuations in zip(raw_ref_spans, raw_ref_part_spans, dh_continuations): + raw_refs += [self._make_raw_ref(span, part_span_list, temp_dh_continuations)] + return raw_refs + + def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh_continuations: List[SpanOrToken]) -> RawRef: + raw_ref_parts = [] + for part_span, dh_continuation in zip(part_span_list, dh_continuations): + part_type = RefPartType.span_label_to_enum(part_span.label_) + raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] + return RawRef(self._lang, raw_ref_parts, span) + + def _bulk_make_dh_continuations(self, raw_ref_spans, raw_ref_part_spans) -> List[List[SpanOrToken]]: + dh_continuations = [] + for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): + temp_dh_continuations = [] + for ipart, part_span in enumerate(part_span_list): + part_type = RefPartType.span_label_to_enum(part_span.label_) + dh_continuation = None + if part_type == RefPartType.DH: + dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) + temp_dh_continuations += [dh_continuation] + dh_continuations += [temp_dh_continuations] + return dh_continuations + @staticmethod def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: if ipart == len(part_span_list) - 1: From 86488c63adc082c32dc4449986877bfc55aeaba0 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: 
Tue, 24 Oct 2023 09:21:03 +0300 Subject: [PATCH 004/210] refactor(linker): reorganize functions so public ones are first --- sefaria/model/linker/named_entity_resolver.py | 54 +++++++++---------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index f0616439be..2471efb49e 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -28,19 +28,20 @@ def __init_normalizer(self) -> NormalizerComposer: normalizer_steps += ['maqaf', 'cantillation'] return NormalizerComposer(normalizer_steps) - def _normalize_input(self, input: List[str]): - """ - Normalize input text to match normalization that happened at training time - """ - return [self._normalizer.normalize(s) for s in input] - - @property - def raw_ref_model(self): - return self._raw_ref_model + def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: + normalized_input = self._normalize_input(input) + all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_input)) + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) + all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) + all_raw_ref_part_span_map = defaultdict(list) + for ref_part_span, input_idx in all_raw_ref_part_spans: + all_raw_ref_part_span_map[input_idx] += [ref_part_span] - @property - def raw_ref_part_model(self): - return self._raw_ref_part_model + all_raw_refs = [] + for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): + raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] + all_raw_refs += [self._bulk_make_raw_refs(raw_ref_spans, raw_ref_part_spans)] + return all_raw_refs def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): for temp_input, raw_ref_list in zip(input, raw_ref_list_list): @@ -63,6 +64,20 @@ def map_normal_output_to_original_input(self, input: str, raw_ref_list: List[Raw for raw_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(raw_ref_list, unnorm_inds, unnorm_part_inds): raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) + @property + def raw_ref_model(self): + return self._raw_ref_model + + @property + def raw_ref_part_model(self): + return self._raw_ref_part_model + + def _normalize_input(self, input: List[str]): + """ + Normalize input text to match normalization that happened at training time + """ + return [self._normalizer.normalize(s) for s in input] + def _get_raw_ref_spans_in_string(self, st: str) -> List[Span]: doc = self._raw_ref_model(st) return doc.ents @@ -87,21 +102,6 @@ def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwar else: yield doc.ents - def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: - normalized_input = self._normalize_input(input) - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_input)) - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) - all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) - all_raw_ref_part_span_map = defaultdict(list) - for ref_part_span, input_idx in all_raw_ref_part_spans: - all_raw_ref_part_span_map[input_idx] += [ref_part_span] - - all_raw_refs = [] - for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): - raw_ref_part_spans = 
all_raw_ref_part_span_map[input_idx] - all_raw_refs += [self._bulk_make_raw_refs(raw_ref_spans, raw_ref_part_spans)] - return all_raw_refs - def _bulk_make_raw_refs(self, raw_ref_spans: List[SpanOrToken], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: raw_refs = [] dh_continuations = self._bulk_make_dh_continuations(raw_ref_spans, raw_ref_part_spans) From f8f6a7745083bc257ffba83143a0579b36f57080 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 10:05:40 +0300 Subject: [PATCH 005/210] docs(linker): add docs to NamedEntityRecognizer --- sefaria/model/linker/named_entity_resolver.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 2471efb49e..d5e43678ed 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -13,6 +13,13 @@ class NamedEntityRecognizer: + """ + Given models, runs them and returns named entity results + Currently, named entities include: + - refs + - people + - groups of people + """ def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language): self._lang = lang @@ -29,6 +36,13 @@ def __init_normalizer(self) -> NormalizerComposer: return NormalizerComposer(normalizer_steps) def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: + """ + Runs models on input to locate all refs and ref parts + Note: takes advantage of bulk spaCy operations. It is more efficient to pass multiple strings in input than to + run this function multiple times + @param input: List of strings to search for refs in. + @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. + """ normalized_input = self._normalize_input(input) all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_input)) ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) From b8262f3f4ad339d9a04919a439232beb8b17dd48 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 10:07:26 +0300 Subject: [PATCH 006/210] refactor(linker): rename input to inputs --- sefaria/model/linker/named_entity_resolver.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index d5e43678ed..701a01d288 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -35,16 +35,16 @@ def __init_normalizer(self) -> NormalizerComposer: normalizer_steps += ['maqaf', 'cantillation'] return NormalizerComposer(normalizer_steps) - def bulk_get_raw_refs(self, input: List[str]) -> List[List[RawRef]]: + def bulk_get_raw_refs(self, inputs: List[str]) -> List[List[RawRef]]: """ - Runs models on input to locate all refs and ref parts + Runs models on inputs to locate all refs and ref parts Note: takes advantage of bulk spaCy operations. It is more efficient to pass multiple strings in input than to run this function multiple times - @param input: List of strings to search for refs in. + @param inputs: List of strings to search for refs in. @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. 
""" - normalized_input = self._normalize_input(input) - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_input)) + normalized_inputs = self._normalize_input(inputs) + all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_inputs)) ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) all_raw_ref_part_span_map = defaultdict(list) From c212a58ec95462b72cc257ec6af5923922ad3e9c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 11:13:14 +0300 Subject: [PATCH 007/210] feat(linker): make rawref a subclass of named entity --- scripts/catch_refs_yerushalmi_translation.py | 2 +- sefaria/model/linker/named_entity_resolver.py | 57 ++++++++++------ sefaria/model/linker/ref_part.py | 68 ++++++++++++++----- sefaria/model/linker/ref_resolver.py | 2 +- sefaria/model/linker/tests/linker_test.py | 2 +- .../model/linker/tests/linker_test_utils.py | 2 +- 6 files changed, 93 insertions(+), 40 deletions(-) diff --git a/scripts/catch_refs_yerushalmi_translation.py b/scripts/catch_refs_yerushalmi_translation.py index da277a76da..842981784c 100644 --- a/scripts/catch_refs_yerushalmi_translation.py +++ b/scripts/catch_refs_yerushalmi_translation.py @@ -292,7 +292,7 @@ def post_process_resolved_refs(self, resolved_refs: List[ResolvedRef], context_r if span_end is not None: subspan_slice = slice(0, span_end) subspan = raw_ref.subspan(subspan_slice) - new_raw_ref = RawRef('en', raw_ref.raw_ref_parts[subspan_slice], subspan) + new_raw_ref = RawRef(subspan, 'en', raw_ref.raw_ref_parts[subspan_slice]) temp_resolved_refs = self.resolver.resolve_raw_ref('en', context_ref, new_raw_ref) for temp_resolved_ref in temp_resolved_refs: temp_ref = temp_resolved_ref.ref diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 701a01d288..4150cb9aac 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,7 +1,7 @@ from typing import List, Generator, Optional from functools import reduce from collections import defaultdict -from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType from sefaria.helper.normalization import NormalizerComposer try: @@ -35,6 +35,23 @@ def __init_normalizer(self) -> NormalizerComposer: normalizer_steps += ['maqaf', 'cantillation'] return NormalizerComposer(normalizer_steps) + def bulk_get_raw_named_entities(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + normalized_inputs = self._normalize_input(inputs) + all_raw_ref_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) + all_raw_named_entities = [] + for raw_ref_spans in all_raw_ref_spans: + temp_raw_named_entities = [] + for span in raw_ref_spans: + type = NamedEntityType.span_label_to_enum(span.label_) + temp_raw_named_entities += [RawNamedEntity(span, type)] + all_raw_named_entities += [temp_raw_named_entities] + return all_raw_named_entities + + def bulk_get_raw_named_entities_by_type(self, inputs: List[str], type_filter: NamedEntityType): + all_raw_named_entities = self.bulk_get_raw_named_entities(inputs) + return [[named_entity for named_entity in sublist if named_entity.type == type_filter] + for sublist in all_raw_named_entities] + def 
bulk_get_raw_refs(self, inputs: List[str]) -> List[List[RawRef]]: """ Runs models on inputs to locate all refs and ref parts @@ -43,18 +60,17 @@ def bulk_get_raw_refs(self, inputs: List[str]) -> List[List[RawRef]]: @param inputs: List of strings to search for refs in. @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. """ - normalized_inputs = self._normalize_input(inputs) - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(normalized_inputs)) - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) + all_ref_entities = self.bulk_get_raw_named_entities_by_type(inputs, NamedEntityType.CITATION) + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_ref_entities), []) all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) all_raw_ref_part_span_map = defaultdict(list) for ref_part_span, input_idx in all_raw_ref_part_spans: all_raw_ref_part_span_map[input_idx] += [ref_part_span] all_raw_refs = [] - for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): + for input_idx, named_entities in enumerate(all_ref_entities): raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] - all_raw_refs += [self._bulk_make_raw_refs(raw_ref_spans, raw_ref_part_spans)] + all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] return all_raw_refs def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): @@ -92,15 +108,15 @@ def _normalize_input(self, input: List[str]): """ return [self._normalizer.normalize(s) for s in input] - def _get_raw_ref_spans_in_string(self, st: str) -> List[Span]: + def _get_raw_named_entity_spans(self, st: str) -> List[SpanOrToken]: doc = self._raw_ref_model(st) return doc.ents - def _get_raw_ref_part_spans_in_string(self, st: str) -> List[Span]: + def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: doc = self._raw_ref_part_model(st) return doc.ents - def _bulk_get_raw_ref_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: + def _bulk_get_raw_named_entity_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: for doc in self._raw_ref_model.pipe(input, batch_size=batch_size, **kwargs): if kwargs.get('as_tuples', False): doc, context = doc @@ -116,11 +132,11 @@ def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwar else: yield doc.ents - def _bulk_make_raw_refs(self, raw_ref_spans: List[SpanOrToken], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: + def _bulk_make_raw_refs(self, named_entities: List[RawNamedEntity], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: raw_refs = [] - dh_continuations = self._bulk_make_dh_continuations(raw_ref_spans, raw_ref_part_spans) - for span, part_span_list, temp_dh_continuations in zip(raw_ref_spans, raw_ref_part_spans, dh_continuations): - raw_refs += [self._make_raw_ref(span, part_span_list, temp_dh_continuations)] + dh_continuations = self._bulk_make_dh_continuations(named_entities, raw_ref_part_spans) + for named_entity, part_span_list, temp_dh_continuations in zip(named_entities, raw_ref_part_spans, dh_continuations): + raw_refs += [self._make_raw_ref(named_entity.span, part_span_list, temp_dh_continuations)] return raw_refs def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh_continuations: 
List[SpanOrToken]) -> RawRef: @@ -128,30 +144,31 @@ def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh for part_span, dh_continuation in zip(part_span_list, dh_continuations): part_type = RefPartType.span_label_to_enum(part_span.label_) raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - return RawRef(self._lang, raw_ref_parts, span) + return RawRef(span, self._lang, raw_ref_parts) - def _bulk_make_dh_continuations(self, raw_ref_spans, raw_ref_part_spans) -> List[List[SpanOrToken]]: + def _bulk_make_dh_continuations(self, named_entities: List[RawNamedEntity], raw_ref_part_spans) -> List[List[SpanOrToken]]: dh_continuations = [] - for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): + for ispan, (named_entity, part_span_list) in enumerate(zip(named_entities, raw_ref_part_spans)): temp_dh_continuations = [] for ipart, part_span in enumerate(part_span_list): part_type = RefPartType.span_label_to_enum(part_span.label_) dh_continuation = None if part_type == RefPartType.DH: - dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) + dh_continuation = self._get_dh_continuation(ispan, ipart, named_entities, part_span_list, + named_entity.span, part_span) temp_dh_continuations += [dh_continuation] dh_continuations += [temp_dh_continuations] return dh_continuations @staticmethod - def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: + def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEntity], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: if ipart == len(part_span_list) - 1: curr_doc = span.doc _, span_end = span_inds(span) - if ispan == len(raw_ref_spans) - 1: + if ispan == len(named_entities) - 1: dh_cont = curr_doc[span_end:] else: - next_span_start, _ = span_inds(raw_ref_spans[ispan + 1]) + next_span_start, _ = span_inds(named_entities[ispan + 1].span) dh_cont = curr_doc[span_end:next_span_start] else: _, part_span_end = span_inds(part_span) diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index 20e590b781..f43b712ebc 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -34,6 +34,18 @@ "non-cts": "NON_CTS", } + +# keys correspond named entity labels in spacy models +# values are properties in NamedEntityType +LABEL_TO_NAMED_ENTITY_TYPE_ATTR = { + # HE + "מקור": "CITATION", + # EN + "Person": "PERSON", + "Group": "GROUP", + "Citation": "CITATION", +} + SpanOrToken = Union[Span, Token] # convenience type since Spans and Tokens are very similar @@ -76,6 +88,19 @@ def span_char_inds(span: SpanOrToken) -> Tuple[int, int]: return idx, idx + len(span) +class NamedEntityType(Enum): + PERSON = "person" + GROUP = "group" + CITATION = "citation" + + @classmethod + def span_label_to_enum(cls, span_label: str) -> 'NamedEntityType': + """ + Convert span label from spacy named entity to NamedEntityType + """ + return getattr(cls, LABEL_TO_NAMED_ENTITY_TYPE_ATTR[span_label]) + + class RefPartType(Enum): NAMED = "named" NUMBERED = "numbered" @@ -282,12 +307,36 @@ def _get_full_span(sections, toSections): return start_span.doc[start_token_i:end_token_i] -class RawRef(abst.Cloneable): +class RawNamedEntity(abst.Cloneable): + """ + Span of text which represents a named entity before it has been 
identified with an object in Sefaria's DB + """ + + def __init__(self, span: SpanOrToken, type: NamedEntityType, **cloneable_kwargs) -> None: + self.span = span + self.type = type + + @property + def text(self): + """ + Return text of underlying span + """ + return self.span.text + + @property + def char_indices(self) -> Tuple[int, int]: + """ + Return start and end char indices of underlying text + """ + return span_char_inds(self.span) + + +class RawRef(RawNamedEntity): """ Span of text which may represent one or more Refs Contains RawRefParts """ - def __init__(self, lang: str, raw_ref_parts: list, span: SpanOrToken, **clonable_kwargs) -> None: + def __init__(self, span: SpanOrToken, lang: str, raw_ref_parts: list, **clonable_kwargs) -> None: """ @param lang: @@ -295,6 +344,7 @@ def __init__(self, lang: str, raw_ref_parts: list, span: SpanOrToken, **clonable @param span: @param clonable_kwargs: kwargs when running Clonable.clone() """ + super().__init__(span, NamedEntityType.CITATION) self.lang = lang self.raw_ref_parts = self._group_ranged_parts(raw_ref_parts) self.parts_to_match = self.raw_ref_parts # actual parts that will be matched. different when their are context swaps @@ -397,20 +447,6 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R new_parts_to_match = self.parts_to_match return self.clone(raw_ref_parts=new_parts, parts_to_match=new_parts_to_match), apart, bpart - @property - def text(self): - """ - Return text of underlying span - """ - return self.span.text - - @property - def char_indices(self) -> Tuple[int, int]: - """ - Return start and end char indices of underlying text - """ - return span_char_inds(self.span) - def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int], new_part_indices: List[Tuple[int, int]]) -> None: """ Remap self.span and all spans of parts to new indices diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 5ca8b89533..6e418f5245 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -318,7 +318,7 @@ def split_non_cts_parts(self, raw_ref: RawRef) -> List[RawRef]: try: raw_ref_span = raw_ref.subspan(slice(curr_part_start, curr_part_end)) curr_parts = [p.realign_to_new_raw_ref(raw_ref.span, raw_ref_span) for p in curr_parts] - split_raw_refs += [RawRef(self._lang, curr_parts, raw_ref_span)] + split_raw_refs += [RawRef(raw_ref_span, self._lang, curr_parts)] except AssertionError: pass curr_parts = [] diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 354d002893..04f0205aea 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -306,7 +306,7 @@ def test_get_all_possible_sections_from_string(input_addr_str, AddressClass, exp ]) def test_group_ranged_parts(raw_ref_params, expected_section_slices): lang, raw_ref_parts, span = raw_ref_params - raw_ref = RawRef(lang, raw_ref_parts, span) + raw_ref = RawRef(span, lang, raw_ref_parts) exp_sec_slice, exp2sec_slice = expected_section_slices if exp_sec_slice is None: expected_raw_ref_parts = raw_ref_parts diff --git a/sefaria/model/linker/tests/linker_test_utils.py b/sefaria/model/linker/tests/linker_test_utils.py index 5d18486e07..c6d9b0213f 100644 --- a/sefaria/model/linker/tests/linker_test_utils.py +++ b/sefaria/model/linker/tests/linker_test_utils.py @@ -129,7 +129,7 @@ def raw_ref_parts(self): return raw_ref_parts def get_raw_ref_params(self): - return self.lang, 
self.raw_ref_parts, self.span + return self.span, self.lang, self.raw_ref_parts def print_debug_info(self): print('Input:', self.input_str) From 0005b826f5021426cecba0242c91c380d594f263 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 12:07:08 +0300 Subject: [PATCH 008/210] feat(linker): basic NamedEntityResolver --- sefaria/model/linker/named_entity_resolver.py | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 4150cb9aac..2aaba5f9cc 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -3,6 +3,7 @@ from collections import defaultdict from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType from sefaria.helper.normalization import NormalizerComposer +from sefaria.model.topic import Topic, TopicSet try: import spacy @@ -176,3 +177,48 @@ def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEn dh_cont = part_span.doc[part_span_end:next_part_span_start] return dh_cont + + +class ResolvedNamedEntity: + + def __init__(self, raw_named_entity: RawNamedEntity, topic: Topic): + self.raw_named_entity = raw_named_entity + self.topic = topic + + +class TopicMatcher: + + def __init__(self, lang: str, topics=None): + topics = topics or TopicSet() + self._slug_topic_map = {t.slug: t for t in topics} + self._title_slug_map = {} + for topic in topics: + for title in topic.get_titles(lang=lang, with_disambiguation=False): + self._title_slug_map[title] = topic.slug + + def match(self, text) -> Optional[Topic]: + slug = self._title_slug_map.get(text) + if slug: + return self._slug_topic_map[slug] + + +class NamedEntityResolver: + + def __init__(self, named_entity_recognizer: NamedEntityRecognizer, topic_matcher: TopicMatcher): + self._named_entity_recognizer = named_entity_recognizer + self._topic_matcher = topic_matcher + + def bulk_resolve_named_entities(self, inputs: List[str]) -> List[List[ResolvedNamedEntity]]: + all_named_entities = self._named_entity_recognizer.bulk_get_raw_named_entities(inputs) + resolved = [] + for named_entities in all_named_entities: + temp_resolved = [] + for named_entity in named_entities: + matched_topic = self._topic_matcher.match(named_entity.text) + if matched_topic: + temp_resolved += [ResolvedNamedEntity(named_entity, matched_topic)] + resolved += [temp_resolved] + return resolved + + + From f8c915f57d024779dd86fdb2e84d147b5660d7a8 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 24 Oct 2023 12:31:21 +0300 Subject: [PATCH 009/210] feat(linker): initialize namedEntityResolver in library --- sefaria/model/text.py | 44 ++++++++++++++++++++++++++++++++----------- 1 file changed, 33 insertions(+), 11 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 01ba48daab..af3c3c91b8 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -4865,7 +4865,8 @@ def __init__(self): self._simple_term_mapping = {} self._full_term_mapping = {} self._simple_term_mapping_json = None - self._ref_resolver = {} + self._ref_resolver_by_lang = {} + self._named_entity_resolver_by_lang = {} # Topics self._topic_mapping = {} @@ -5600,8 +5601,35 @@ def _build_topic_mapping(self): self._topic_mapping = {t.slug: {"en": t.get_primary_title("en"), "he": t.get_primary_title("he")} for t in TopicSet()} return self._topic_mapping + def get_named_entity_resolver(self, lang: str, 
rebuild=False): + resolver = self._named_entity_resolver_by_lang.get(lang) + if not resolver or rebuild: + resolver = self.build_named_entity_resolver(lang) + return resolver + + def build_named_entity_resolver(self, lang: str): + from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver + from .topic import Topic + + ontology_topics = Topic.init('entity').topics_by_link_type_recursively() + self._named_entity_resolver_by_lang[lang] = NamedEntityResolver( + self._build_named_entity_recognizer(lang), TopicMatcher(lang, topics=ontology_topics) + ) + return self._named_entity_resolver_by_lang[lang] + + @staticmethod + def _build_named_entity_recognizer(lang: str): + from .linker.named_entity_resolver import NamedEntityRecognizer + from sefaria.helper.linker import load_spacy_model + + return NamedEntityRecognizer( + lang, + load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), + load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]) + ) + def get_ref_resolver(self, lang: str, rebuild=False): - resolver = self._ref_resolver.get(lang) + resolver = self._ref_resolver_by_lang.get(lang) if not resolver or rebuild: resolver = self.build_ref_resolver(lang) return resolver @@ -5609,27 +5637,21 @@ def get_ref_resolver(self, lang: str, rebuild=False): def build_ref_resolver(self, lang: str): from .linker.match_template import MatchTemplateTrie from .linker.ref_resolver import RefResolver, TermMatcher - from .linker.named_entity_resolver import NamedEntityRecognizer from sefaria.model.schema import NonUniqueTermSet - from sefaria.helper.linker import load_spacy_model logger.info("Loading Spacy Model") root_nodes = list(filter(lambda n: getattr(n, 'match_templates', None) is not None, self.get_index_forest())) alone_nodes = reduce(lambda a, b: a + b.index.get_referenceable_alone_nodes(), root_nodes, []) non_unique_terms = NonUniqueTermSet() - ner = NamedEntityRecognizer( - lang, - load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), - load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]) - ) + ner = self._build_named_entity_recognizer(lang) - self._ref_resolver[lang] = RefResolver( + self._ref_resolver_by_lang[lang] = RefResolver( lang, ner, MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), TermMatcher(lang, non_unique_terms), ) - return self._ref_resolver[lang] + return self._ref_resolver_by_lang[lang] def get_index_forest(self): """ From feb1f95c466e961d3ff516d6ed67b36f770ccbf9 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 25 Oct 2023 09:03:59 +0300 Subject: [PATCH 010/210] fix(linker): curr_removed can also be negative --- sefaria/helper/normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index 9969022838..e0d9945fb2 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -109,7 +109,7 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False start, end = removal.start(), removal.end() normalized_text_index = start if reverse else (start + min(len(subst), end-start) - total_removed) curr_removed = end - start - len(subst) - if curr_removed > 0: + if curr_removed != 0: total_removed += curr_removed removal_map[normalized_text_index] = total_removed return removal_map From fce8d9446ca4487f4bd4b3e71a3214f93bc0de22 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 25 Oct 2023 10:37:13 +0300 Subject: [PATCH 011/210] fix(linker): dont allow for partial overlaps when merging. 
This case seems to be either impossible or very unlikely. throwing and error for now unless we determine we want to handle this case. --- sefaria/helper/normalization.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index e0d9945fb2..dad6fbd9d3 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -295,7 +295,14 @@ def merge_removal_inds(*all_removal_inds): else: # some sort of overlap curr_merged_inds = (last_inds[0], max(last_inds[1], curr_inds[1])) - curr_merged_repl = last_repl[:curr_inds[0]-last_inds[0]] + curr_repl + last_repl[(curr_inds[1]+1)-last_inds[0]:] + if curr_inds[0] == last_inds[0] and last_inds[1] <= curr_inds[1]: + # last is subset. use curr_repl + curr_merged_repl = curr_repl + elif curr_inds[0] >= last_inds[0] and curr_inds[1] <= last_inds[1]: + # curr is subset. use last_repl + curr_merged_repl = last_repl + else: + raise Exception(f"partial overlap. not sure how to reconcile. curr_inds: {curr_inds}. last_inds: {last_inds}") merged_removal_inds[-1] = (curr_merged_inds, curr_merged_repl) return merged_removal_inds From a567d4e89291792756b6ae51d15c68b6d51ccfcc Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:20:34 +0300 Subject: [PATCH 012/210] fix: remove special casing for normalizer. It seems that the start indices from the mapping are already off-by-one, so end doesn't need to be -1. --- sefaria/helper/normalization.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index dad6fbd9d3..b1b450d97f 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -126,9 +126,7 @@ def convert_normalized_indices_to_unnormalized_indices(normalized_indices, remov sign = -1 if reverse else 1 for start, end in normalized_indices: unnorm_start_index = bisect_right(removal_keys, start) - 1 - # special case if range is zero-length. treat end as literal and not off-by-one. - bisect_end_index = end if end == start else (end - 1) - unnorm_end_index = bisect_right(removal_keys, bisect_end_index) - 1 + unnorm_end_index = bisect_right(removal_keys, end) - 1 unnorm_start = start if unnorm_start_index < 0 else start + (sign * removal_map[removal_keys[unnorm_start_index]]) unnorm_end = end if unnorm_end_index < 0 else end + (sign * removal_map[removal_keys[unnorm_end_index]]) From 4a3797a7f102abd5d730cc96049f2050477dffb4 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:21:20 +0300 Subject: [PATCH 013/210] test(normalizer): add and modify tests for new end calculation. 
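For context, the round trip that the new test_mapping case below exercises looks roughly like this sketch (the normalizer steps and method names are taken from the hunks in this series, and the sample string mirrors the new test):

    import django
    django.setup()
    from sefaria.helper.normalization import NormalizerComposer

    # Map a span located in normalized text back to the original, unnormalized string.
    nsc = NormalizerComposer(['html', 'double-space'])
    text = "<b> test</b>"
    normalized = nsc.normalize(text)
    mapping = nsc.get_mapping_after_normalization(text)
    start = normalized.index("test")
    norm_inds = (start, start + len("test"))
    unnorm_inds = nsc.convert_normalized_indices_to_unnormalized_indices([norm_inds], mapping)[0]
    assert text[slice(*unnorm_inds)] == normalized[slice(*norm_inds)]  # both slices read "test"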
--- sefaria/helper/tests/normalization_tests.py | 24 +++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/sefaria/helper/tests/normalization_tests.py b/sefaria/helper/tests/normalization_tests.py index 59e4ed7767..76d59b8f07 100644 --- a/sefaria/helper/tests/normalization_tests.py +++ b/sefaria/helper/tests/normalization_tests.py @@ -1,3 +1,4 @@ +import pytest import django django.setup() from sefaria.helper.normalization import * @@ -55,9 +56,9 @@ def test_simpler_normalizer_composer(): nsc = NormalizerComposer(['brackets', 'double-space']) assert nsc.normalize(text) == normalized text_to_remove = nsc.find_text_to_remove(text) - assert len(text_to_remove) == 2 + assert len(text_to_remove) == 1 (start0, end0), repl0 = text_to_remove[0] - assert text[start0:end0] == " " + assert text[start0:end0] == " [" assert repl0 == ' ' @@ -67,12 +68,26 @@ def test_complicated_normalizer_composer(): nsc = NormalizerComposer(['html', "parens-plus-contents", 'brackets', 'double-space']) assert nsc.normalize(text) == normalized text_to_remove = nsc.find_text_to_remove(text) - assert len(text_to_remove) == 6 + assert len(text_to_remove) == 5 (start0, end0), repl0 = text_to_remove[0] - assert text[start0:end0] == "(hello other stuff) " + assert text[start0:end0] == "(hello other stuff) [" assert repl0 == ' ' +def test_mapping(): + text = """ test""" + normalized = """ test""" + nsc = NormalizerComposer(['html', 'double-space']) + assert nsc.normalize(text) == normalized + mapping = nsc.get_mapping_after_normalization(text) + test_word = "test" + start_norm_ind = normalized.index(test_word) + norm_inds = (start_norm_ind, start_norm_ind+len(test_word)) + unnorm_inds = nsc.convert_normalized_indices_to_unnormalized_indices([norm_inds], mapping)[0] + # actual test + assert text[slice(*unnorm_inds)] == normalized[slice(*norm_inds)] + + def test_html_normalizer_for_empty_prefix(): text = """It is written241K. 17:1. Elijah the Tisbite""" normalizer = NormalizerComposer(['html']) @@ -102,6 +117,7 @@ def test_nested_itag(): assert text[s:e] == """bullnestedThe.""" +@pytest.mark.xfail(reason="not clear we want to support char_indices_from_word_indices as it's unused") def test_word_to_char(): test_string = 'some words go here\n\nhello world' words = ['go', 'here', 'hello'] From 9d8dc2a6f535197d00d57dc32e648cbceac52a61 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:22:27 +0300 Subject: [PATCH 014/210] refactor(linker): break up map_new_indices so it distinguishes between mapping main span and part spans. --- sefaria/model/linker/ref_part.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index f43b712ebc..10d81e87f2 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -316,6 +316,13 @@ def __init__(self, span: SpanOrToken, type: NamedEntityType, **cloneable_kwargs) self.span = span self.type = type + def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int]) -> None: + """ + Remap self.span to new indices + """ + self.span = new_doc.char_span(*new_indices) + if self.span is None: raise InputError(f"${new_indices} don't match token boundaries. 
Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") + @property def text(self): """ @@ -447,13 +454,13 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R new_parts_to_match = self.parts_to_match return self.clone(raw_ref_parts=new_parts, parts_to_match=new_parts_to_match), apart, bpart - def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int], new_part_indices: List[Tuple[int, int]]) -> None: + def map_new_part_indices(self, new_part_indices: List[Tuple[int, int]]) -> None: """ Remap self.span and all spans of parts to new indices """ - self.span = new_doc.char_span(*new_indices) - if self.span is None: raise InputError(f"${new_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") + start_char, _ = self.char_indices doc_span = self.span.as_doc() for part, temp_part_indices in zip(self.raw_ref_parts, new_part_indices): - part.span = doc_span.char_span(*[i-new_indices[0] for i in temp_part_indices]) - if part.span is None: raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}. Using 'expand' alignment mode text is '{new_doc.char_span(*temp_part_indices, alignment_mode='expand')}'") + part.span = doc_span.char_span(*[i-start_char for i in temp_part_indices]) + if part.span is None: + raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}.") From 873c49652722f6a9254a2c271e0018a137089fa0 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:22:49 +0300 Subject: [PATCH 015/210] feat(linker): update tokenizer to include more punctuation. --- sefaria/spacy_function_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/spacy_function_registry.py b/sefaria/spacy_function_registry.py index 34a8366744..a1a91bb342 100644 --- a/sefaria/spacy_function_registry.py +++ b/sefaria/spacy_function_registry.py @@ -5,7 +5,7 @@ def inner_punct_tokenizer_factory(): def inner_punct_tokenizer(nlp): # infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes) - infix_re = re.compile(r'''[\.\,\?\:\;…\‘\’\`\“\”\"\'~\–\-/\(\)]''') + infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''') prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) From 2ec609b50b63bb94287f9c839f1e1274f88a5c19 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 10:24:01 +0300 Subject: [PATCH 016/210] fix(linker): fix bugs with outputting named entities. 
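As a rough usage sketch of the path this fix touches (the entry points come from earlier patches in this series; the sample string is made up and the results depend on which spaCy models are loaded), named entity resolution is driven through the library object:

    from sefaria.model import library

    ne_resolver = library.get_named_entity_resolver('he')
    resolved = ne_resolver.bulk_resolve_named_entities(["אמר רבי עקיבא לתלמידיו"], with_failures=True)
    for doc_matches in resolved:
        for match in doc_matches:
            # after this fix, char_indices refer to the original, unnormalized input string
            print(match.raw_named_entity.text, match.raw_named_entity.char_indices, match.topic)

Note that at this point in the series each match exposes a single topic (possibly None when with_failures is passed); a later patch in this series changes this to a list of topics so that ambiguous matches can be represented.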
--- sefaria/model/linker/named_entity_resolver.py | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 2aaba5f9cc..1638831808 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -3,7 +3,7 @@ from collections import defaultdict from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType from sefaria.helper.normalization import NormalizerComposer -from sefaria.model.topic import Topic, TopicSet +from sefaria.model.topic import Topic, TopicSet, RefTopicLink try: import spacy @@ -78,7 +78,7 @@ def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_lis for temp_input, raw_ref_list in zip(input, raw_ref_list_list): self.map_normal_output_to_original_input(temp_input, raw_ref_list) - def map_normal_output_to_original_input(self, input: str, raw_ref_list: List[RawRef]) -> None: + def map_normal_output_to_original_input(self, input: str, named_entities: List[RawNamedEntity]) -> None: """ Ref resolution ran on normalized input. Remap raw refs to original (non-normalized) input """ @@ -86,14 +86,17 @@ def map_normal_output_to_original_input(self, input: str, raw_ref_list: List[Raw mapping = self._normalizer.get_mapping_after_normalization(input) # this function name is waaay too long conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices - norm_inds = [raw_ref.char_indices for raw_ref in raw_ref_list] + norm_inds = [named_entity.char_indices for named_entity in named_entities] unnorm_inds = conv(norm_inds, mapping) unnorm_part_inds = [] - for (raw_ref, (norm_raw_ref_start, _)) in zip(raw_ref_list, norm_inds): + for (named_entity, (norm_raw_ref_start, _)) in zip(named_entities, norm_inds): + raw_ref_parts = named_entity.raw_ref_parts if isinstance(named_entity, RawRef) else [] unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] - for part in raw_ref.raw_ref_parts], mapping)] - for raw_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(raw_ref_list, unnorm_inds, unnorm_part_inds): - raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) + for part in raw_ref_parts], mapping)] + for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): + named_entity.map_new_indices(unnorm_doc, temp_unnorm_inds) + if isinstance(named_entity, RawRef): + named_entity.map_new_part_indices(temp_unnorm_part_inds) @property def raw_ref_model(self): @@ -185,6 +188,18 @@ def __init__(self, raw_named_entity: RawNamedEntity, topic: Topic): self.raw_named_entity = raw_named_entity self.topic = topic + def to_ref_topic_link(self) -> RefTopicLink: + start_char, end_char = self.raw_named_entity.char_indices + return RefTopicLink({ + "ref": "", + "toTopic": self.topic.slug if self.topic else "N/A", + "charLevelData": { + "startChar": start_char, + "endChar": end_char, + "text": self.raw_named_entity.text, + } + }) + class TopicMatcher: @@ -208,16 +223,18 @@ def __init__(self, named_entity_recognizer: NamedEntityRecognizer, topic_matcher self._named_entity_recognizer = named_entity_recognizer self._topic_matcher = topic_matcher - def bulk_resolve_named_entities(self, inputs: List[str]) -> List[List[ResolvedNamedEntity]]: + def bulk_resolve_named_entities(self, inputs: List[str], with_failures=False) -> 
List[List[ResolvedNamedEntity]]: all_named_entities = self._named_entity_recognizer.bulk_get_raw_named_entities(inputs) resolved = [] for named_entities in all_named_entities: temp_resolved = [] for named_entity in named_entities: matched_topic = self._topic_matcher.match(named_entity.text) - if matched_topic: + if matched_topic or with_failures: temp_resolved += [ResolvedNamedEntity(named_entity, matched_topic)] resolved += [temp_resolved] + named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] + self._named_entity_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) return resolved From 48186fdd80a3861a0849234f10699333787b8743 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 26 Oct 2023 11:04:13 +0300 Subject: [PATCH 017/210] fix(linker): add missing parameter --- reader/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reader/views.py b/reader/views.py index 86f6843254..72f5b7fe81 100644 --- a/reader/views.py +++ b/reader/views.py @@ -114,7 +114,7 @@ if ENABLE_LINKER: logger.info("Initializing Linker") - library.build_ref_resolver() + library.build_ref_resolver('he') if server_coordinator: server_coordinator.connect() From 899e59e76cc7c8b508201ce68c25f47ea0a62b31 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 30 Oct 2023 00:38:52 +0200 Subject: [PATCH 018/210] feat(linker): allow for multiple topics to match --- sefaria/model/linker/named_entity_resolver.py | 53 +++++++++---------- sefaria/model/text.py | 12 ++++- 2 files changed, 36 insertions(+), 29 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 1638831808..751d6702ba 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,4 +1,4 @@ -from typing import List, Generator, Optional +from typing import List, Generator, Optional, Dict from functools import reduce from collections import defaultdict from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType @@ -184,37 +184,36 @@ def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEn class ResolvedNamedEntity: - def __init__(self, raw_named_entity: RawNamedEntity, topic: Topic): + def __init__(self, raw_named_entity: RawNamedEntity, topics: List[Topic]): self.raw_named_entity = raw_named_entity - self.topic = topic + self.topics = topics - def to_ref_topic_link(self) -> RefTopicLink: - start_char, end_char = self.raw_named_entity.char_indices - return RefTopicLink({ - "ref": "", - "toTopic": self.topic.slug if self.topic else "N/A", - "charLevelData": { - "startChar": start_char, - "endChar": end_char, - "text": self.raw_named_entity.text, - } - }) + @property + def is_ambiguous(self) -> bool: + return len(self.topics) > 1 class TopicMatcher: - def __init__(self, lang: str, topics=None): - topics = topics or TopicSet() - self._slug_topic_map = {t.slug: t for t in topics} - self._title_slug_map = {} + def __init__(self, lang: str, topics_by_type: Dict[str, List[Topic]]): + self._lang = lang + all_topics = reduce(lambda a, b: a + b, topics_by_type.values()) + self._slug_topic_map = {t.slug: t for t in all_topics} + self._title_slug_map_by_type = { + named_entity_type: self.__get_title_map_for_topics(topics) + for named_entity_type, topics in topics_by_type.items() + } + + def __get_title_map_for_topics(self, topics) -> Dict[str, List[str]]: + 
title_slug_map = defaultdict(list) for topic in topics: - for title in topic.get_titles(lang=lang, with_disambiguation=False): - self._title_slug_map[title] = topic.slug + for title in topic.get_titles(lang=self._lang, with_disambiguation=False): + title_slug_map[title] += [topic.slug] + return title_slug_map - def match(self, text) -> Optional[Topic]: - slug = self._title_slug_map.get(text) - if slug: - return self._slug_topic_map[slug] + def match(self, named_entity: RawNamedEntity) -> List[Topic]: + slugs = self._title_slug_map_by_type.get(named_entity.type.name, {}).get(named_entity.text, []) + return [self._slug_topic_map[slug] for slug in slugs] class NamedEntityResolver: @@ -229,9 +228,9 @@ def bulk_resolve_named_entities(self, inputs: List[str], with_failures=False) -> for named_entities in all_named_entities: temp_resolved = [] for named_entity in named_entities: - matched_topic = self._topic_matcher.match(named_entity.text) - if matched_topic or with_failures: - temp_resolved += [ResolvedNamedEntity(named_entity, matched_topic)] + matched_topics = self._topic_matcher.match(named_entity) + if len(matched_topics) > 0 or with_failures: + temp_resolved += [ResolvedNamedEntity(named_entity, matched_topics)] resolved += [temp_resolved] named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] self._named_entity_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index af3c3c91b8..03be0f7886 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5611,9 +5611,17 @@ def build_named_entity_resolver(self, lang: str): from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver from .topic import Topic - ontology_topics = Topic.init('entity').topics_by_link_type_recursively() + ontology_roots_to_named_entity_types = { + "people": "PERSON", + "group-of-people": "GROUP", + } + yo = Topic.init('people').topics_by_link_type_recursively() self._named_entity_resolver_by_lang[lang] = NamedEntityResolver( - self._build_named_entity_recognizer(lang), TopicMatcher(lang, topics=ontology_topics) + self._build_named_entity_recognizer(lang), + TopicMatcher(lang, { + named_entity_type: Topic.init(ontology_root).topics_by_link_type_recursively() + for ontology_root, named_entity_type in ontology_roots_to_named_entity_types.items() + }) ) return self._named_entity_resolver_by_lang[lang] From 6f2e424c5c3b1b5883c0b982e9b017d4d79169b3 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 13:10:07 +0200 Subject: [PATCH 019/210] test(linker): add tests for PersonTitleGenerator --- .../linker/tests/named_entity_resolver_tests.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) create mode 100644 sefaria/model/linker/tests/named_entity_resolver_tests.py diff --git a/sefaria/model/linker/tests/named_entity_resolver_tests.py b/sefaria/model/linker/tests/named_entity_resolver_tests.py new file mode 100644 index 0000000000..c207c7f8f4 --- /dev/null +++ b/sefaria/model/linker/tests/named_entity_resolver_tests.py @@ -0,0 +1,14 @@ +import pytest +from sefaria.model.linker.named_entity_resolver import NamedEntityTitleGenerator, PersonTitleGenerator + + +@pytest.mark.parametrize(('title', 'expected_output'), [ + ['Rabbi b. Ben', ['Rabbi b. Ben', 'Rabbi ben Ben', 'Rabbi bar Ben', 'Rabbi, son of Ben', 'Rabbi, the son of Ben', + 'Rabbi son of Ben', 'Rabbi the son of Ben', 'Rabbi Bar Ben', 'Rabbi Ben Ben', 'R. b. Ben']], + ['Rabbi ben Ben', ['R. 
ben Ben', 'Rabbi ben Ben']], + ['Bar Kochba', ['Bar Kochba', 'bar Kochba']], +]) +def test_person_title_generator(title, expected_output): + expected_output = sorted(expected_output) + actual_output = sorted(PersonTitleGenerator.generate(title)) + assert actual_output == expected_output From 6d9e83458da0b77247793b57710b5e8a8fcfec66 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 13:10:33 +0200 Subject: [PATCH 020/210] fix(linker): be more lenient with span alignment --- sefaria/model/linker/ref_part.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index 10d81e87f2..aa07265df0 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -320,7 +320,7 @@ def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int]) -> None: """ Remap self.span to new indices """ - self.span = new_doc.char_span(*new_indices) + self.span = new_doc.char_span(*new_indices, alignment_mode='expand') if self.span is None: raise InputError(f"${new_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") @property From bd7c331eadce8e41ef31feacad001ce84d81c3fa Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 13:11:13 +0200 Subject: [PATCH 021/210] fix(linker): improve ability of ner resolver to resolve topics --- sefaria/model/linker/named_entity_resolver.py | 104 ++++++++++++++++-- sefaria/model/text.py | 13 +-- 2 files changed, 99 insertions(+), 18 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 751d6702ba..6356345caf 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,9 +1,16 @@ -from typing import List, Generator, Optional, Dict +import dataclasses +from typing import List, Generator, Optional, Dict, Type, Set +try: + import re2 as re + re.set_fallback_notification(re.FALLBACK_WARNING) +except ImportError: + import re from functools import reduce from collections import defaultdict from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType from sefaria.helper.normalization import NormalizerComposer from sefaria.model.topic import Topic, TopicSet, RefTopicLink +from sefaria.utils.hebrew import strip_cantillation try: import spacy @@ -193,24 +200,103 @@ def is_ambiguous(self) -> bool: return len(self.topics) > 1 +class TitleGenerator: + + expansions = {} + + @classmethod + def generate(cls, title: str) -> List[str]: + expansions = [title] + for reg, reg_expansions in cls.expansions.items(): + for reg_expansion in reg_expansions: + potential_expansion = re.sub(reg, reg_expansion, title) + if potential_expansion == title: continue + expansions += [potential_expansion] + expansions = [strip_cantillation(t, strip_vowels=True) for t in expansions] + return expansions + + +class PersonTitleGenerator(TitleGenerator): + + expansions = { + r' b\. ': [' ben ', ' bar ', ', son of ', ', the son of ', ' son of ', ' the son of ', ' Bar ', ' Ben '], + r'^Ben ': ['ben '], + r'^Bar ': ['bar '], + r'^Rabbi ': ['R. '], + r'^Rebbi ': ['R. 
'], + } + + +class FallbackTitleGenerator(TitleGenerator): + + expansions = { + '^The ': ['the '], + } + + +@dataclasses.dataclass +class NamedEntityTitleExpanderRoute: + type_slug: str + generator: Type[TitleGenerator] + + +class NamedEntityTitleExpander: + type_generator_router = [ + NamedEntityTitleExpanderRoute('people', PersonTitleGenerator), + NamedEntityTitleExpanderRoute('entity', FallbackTitleGenerator), + ] + + def __init__(self, lang: str): + self._lang = lang + + def expand(self, topic: Topic) -> List[str]: + for route in self.type_generator_router: + if topic.has_types({route.type_slug}): + return self._expand_titles_with_generator(topic, route.generator) + return self._get_topic_titles(topic) + + def _get_topic_titles(self, topic: Topic) -> List[str]: + return topic.get_titles(lang=self._lang, with_disambiguation=False) + + def _expand_titles_with_generator(self, topic: Topic, generator: Type[TitleGenerator]) -> List[str]: + expansions = [] + for title in self._get_topic_titles(topic): + expansions += generator.generate(title) + return expansions + + class TopicMatcher: - def __init__(self, lang: str, topics_by_type: Dict[str, List[Topic]]): + def __init__(self, lang: str, named_entity_types_to_topics: Dict[str, Dict[str, List[str]]]): self._lang = lang + self._title_expander = NamedEntityTitleExpander(lang) + topics_by_type = { + named_entity_type: self.__generate_topic_list_from_spec(topic_spec) + for named_entity_type, topic_spec in named_entity_types_to_topics.items() + } all_topics = reduce(lambda a, b: a + b, topics_by_type.values()) self._slug_topic_map = {t.slug: t for t in all_topics} self._title_slug_map_by_type = { - named_entity_type: self.__get_title_map_for_topics(topics) - for named_entity_type, topics in topics_by_type.items() + named_entity_type: self.__get_title_map_for_topics(topics_by_type[named_entity_type]) + for named_entity_type, topic_spec in named_entity_types_to_topics.items() } - def __get_title_map_for_topics(self, topics) -> Dict[str, List[str]]: - title_slug_map = defaultdict(list) - for topic in topics: - for title in topic.get_titles(lang=self._lang, with_disambiguation=False): - title_slug_map[title] += [topic.slug] + def __get_title_map_for_topics(self, topics: List[Topic]) -> Dict[str, Set[str]]: + title_slug_map = defaultdict(set) + unique_topics = {t.slug: t for t in topics}.values() + for topic in unique_topics: + for title in self._title_expander.expand(topic): + title_slug_map[title].add(topic.slug) return title_slug_map + @staticmethod + def __generate_topic_list_from_spec(topic_spec: Dict[str, List[str]]) -> List[Topic]: + topics = [] + for root in topic_spec.get('ontology_roots', []): + topics += Topic.init(root).topics_by_link_type_recursively() + topics += [Topic.init(slug) for slug in topic_spec.get('single_slugs', [])] + return topics + def match(self, named_entity: RawNamedEntity) -> List[Topic]: slugs = self._title_slug_map_by_type.get(named_entity.type.name, {}).get(named_entity.text, []) return [self._slug_topic_map[slug] for slug in slugs] diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 03be0f7886..bffe8cf7c5 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5609,19 +5609,14 @@ def get_named_entity_resolver(self, lang: str, rebuild=False): def build_named_entity_resolver(self, lang: str): from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver - from .topic import Topic - ontology_roots_to_named_entity_types = { - "people": "PERSON", - "group-of-people": "GROUP", + 
named_entity_types_to_topics = { + "PERSON": {"ontology_roots": ['people'], "single_slugs": ['god', 'the-tetragrammaton']}, + "GROUP": {'ontology_roots': ["group-of-people"]}, } - yo = Topic.init('people').topics_by_link_type_recursively() self._named_entity_resolver_by_lang[lang] = NamedEntityResolver( self._build_named_entity_recognizer(lang), - TopicMatcher(lang, { - named_entity_type: Topic.init(ontology_root).topics_by_link_type_recursively() - for ontology_root, named_entity_type in ontology_roots_to_named_entity_types.items() - }) + TopicMatcher(lang, named_entity_types_to_topics) ) return self._named_entity_resolver_by_lang[lang] From c230512e82744f58fa35f31e66911575b4a74823 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 14:23:08 +0200 Subject: [PATCH 022/210] fix(linker): make span alignment more flexible --- sefaria/model/linker/ref_part.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index aa07265df0..cd5340152f 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -461,6 +461,6 @@ def map_new_part_indices(self, new_part_indices: List[Tuple[int, int]]) -> None: start_char, _ = self.char_indices doc_span = self.span.as_doc() for part, temp_part_indices in zip(self.raw_ref_parts, new_part_indices): - part.span = doc_span.char_span(*[i-start_char for i in temp_part_indices]) + part.span = doc_span.char_span(*[i-start_char for i in temp_part_indices], alignment_mode='expand') if part.span is None: raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}.") From 29359d1cf1155174968513cac5bb12baa3237343 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 14:23:28 +0200 Subject: [PATCH 023/210] refactor(linker): make book_context_refs optional --- scripts/catch_refs_yerushalmi_translation.py | 2 +- scripts/dicta_library_linker.py | 2 +- scripts/parse_rambi_webpages.py | 2 +- scripts/parse_refs_in_gilyon_hashas.py | 4 ++-- sefaria/helper/linker.py | 4 ++-- sefaria/model/linker/ref_resolver.py | 5 ++++- sefaria/model/linker/tests/linker_test.py | 2 +- 7 files changed, 12 insertions(+), 9 deletions(-) diff --git a/scripts/catch_refs_yerushalmi_translation.py b/scripts/catch_refs_yerushalmi_translation.py index 842981784c..f420a9c815 100644 --- a/scripts/catch_refs_yerushalmi_translation.py +++ b/scripts/catch_refs_yerushalmi_translation.py @@ -150,7 +150,7 @@ def catch_refs_in_title(self, title: str): version = Version().load({"title": title, "language": self.lang, "versionTitle": self.vtitle}) version.walk_thru_contents(self.collect_resolver_input) context_refs, input_text = zip(*self.resolver_input) - all_resolved_refs = self.resolver.bulk_resolve_refs(self.lang, context_refs, input_text, with_failures=True, verbose=True) + all_resolved_refs = self.resolver.bulk_resolve(self.lang, context_refs, input_text, with_failures=True, verbose=True) self.resolved_refs_by_context = {} for context_ref, resolved_refs in zip(context_refs, all_resolved_refs): self.resolved_refs_by_context[context_ref.normal()] = resolved_refs diff --git a/scripts/dicta_library_linker.py b/scripts/dicta_library_linker.py index e4470d4e83..0304c65c5b 100644 --- a/scripts/dicta_library_linker.py +++ b/scripts/dicta_library_linker.py @@ -17,7 +17,7 @@ def run_on_page(path, tref): text = get_text(jin) #text = """וכן כתב הרמב"ם ז"ל בהלכות טוען ונטען פ"ב""" ref_resolver = library.get_ref_resolver() - resolved = ref_resolver.bulk_resolve_refs("he", 
[None], [text], with_failures=True) + resolved = ref_resolver.bulk_resolve("he", [None], [text], with_failures=True) make_html([resolved], [[text]], f"../data/private/linker_results/{tref}.html") diff --git a/scripts/parse_rambi_webpages.py b/scripts/parse_rambi_webpages.py index 63f9c5dab0..9014b53a34 100644 --- a/scripts/parse_rambi_webpages.py +++ b/scripts/parse_rambi_webpages.py @@ -62,7 +62,7 @@ def get_refs_from_string(string): string = translliterate_russian_to_latin(string) ref_resolver = library.get_ref_resolver() if lang == 'he': # remove this line when linker v3 is availabe in English - refs = ref_resolver.bulk_resolve_refs(lang, [None], [string]) + refs = ref_resolver.bulk_resolve(lang, [None], [string]) refs = {y.ref for x in refs for y in x if type(y) != AmbiguousResolvedRef} else: # remove else statement (with its content) when linker v3 is availabe in English refs = set() diff --git a/scripts/parse_refs_in_gilyon_hashas.py b/scripts/parse_refs_in_gilyon_hashas.py index 11af3d0da2..8113a89d82 100644 --- a/scripts/parse_refs_in_gilyon_hashas.py +++ b/scripts/parse_refs_in_gilyon_hashas.py @@ -22,7 +22,7 @@ def collect_input(s: str, en_tref: str, he_tref: str, v: Version) -> None: version = VersionSet({"title": title, "language": "he"}).array()[0] version.walk_thru_contents(collect_input) - resolved = resolver.bulk_resolve_refs('he', input_context_refs, input_text, with_failures=True, verbose=True) + resolved = resolver.bulk_resolve('he', input_context_refs, input_text, with_failures=True, verbose=True) return resolved, input_text, input_context_refs @@ -64,7 +64,7 @@ def parse_string(resolver): זהו ג"כ הוי כמו חזקה). וכן מצאתי להדיא שכ"כ בתורת השלמים ר"ס קפ"ה ע"ש. וראיתי להנו"ב (מ"ק ביו"ד סי' נ"ז) שהקשה להרשב"א דס"ל דבס"ס נמי היכא דאיכא לברורי מבררינן דא"כ אמאי סמכינן על סתם כלים של נכרים אינן ב"י דהוא מטעם ס"ס אף דאפשר לברר ע"י קפילא וכמ"ש הרא"ש בפ"ב דע"ז דמשום ס"ס לא הטריחוהו להטעימו לקפילא. והרשב"א גופי' פסק כן דסומכין ע"ז דסתם כלים של נכרים אינן ב"י, ותירץ הנו"ב דכיון דאין אנו דנים על הכלי דאפי' ודאי אינו ב"י אסור לבשל בו לכתחלה רק אנו דנים על התבשיל והתבשיל יש לו חזקת היתר ולכן בצירוף חזקת היתר עם הס"ס א"צ לעמוד על הבירור בזה עכ"ד הנו"ב. ואף שיש לפקפק על דבריו במה דפשיטא ליה שהתבשיל יש לו חזקת היתר. די"ל דאיתרע חזקתו כשבישלו בכלי שהוא ספק ב"י. 
ובכעין זה נחלקו הט"ז והנקה"כ ביו"ד ר"ס ק"ה לענין ספק כבוש, דהט"ז ס"ל שם דיש לאוקמי ההיתר בחזקת כשרות, והנה"כ ושאר """ context_refs = [Ref("Job 1")] - resolved = resolver.bulk_resolve_refs('he', context_refs, [s], with_failures=True) + resolved = resolver.bulk_resolve('he', context_refs, [s], with_failures=True) make_html(resolved, "../data/toratchesed-018__output.html", 'he') save_resolved_refs(zip(context_refs, resolved), 'toratchesed-018__output.csv') diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 3b454102f0..2083210bca 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -120,9 +120,9 @@ def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _F def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: resolver = library.get_ref_resolver(request_text.lang) - resolved_title = resolver.bulk_resolve_refs([None], [request_text.title]) + resolved_title = resolver.bulk_resolve([request_text.title], [None]) context_ref = resolved_title[0][0].ref if (len(resolved_title[0]) == 1 and not resolved_title[0][0].is_ambiguous) else None - resolved_body = resolver.bulk_resolve_refs([context_ref], [request_text.body], with_failures=True) + resolved_body = resolver.bulk_resolve([request_text.body], [context_ref], with_failures=True) response = { "title": _make_find_refs_response_inner(resolved_title, options), diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 6e418f5245..59822a7aaf 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -254,7 +254,9 @@ def __init__(self, lang: str, named_entity_recognizer: NamedEntityRecognizer, def reset_ibid_history(self): self._ibid_history = IbidHistory() - def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: List[str], with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: + def bulk_resolve(self, input: List[str], book_context_refs: Optional[List[Optional[text.Ref]]] = None, with_failures=False, + verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) \ + -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: """ Main function for resolving refs in text. 
Given a list of texts, returns ResolvedRefs for each @param book_context_refs: @@ -269,6 +271,7 @@ def bulk_resolve_refs(self, book_context_refs: List[Optional[text.Ref]], input: self.reset_ibid_history() all_raw_refs = self._named_entity_recognizer.bulk_get_raw_refs(input) resolved = [] + book_context_refs = book_context_refs or [None]*len(all_raw_refs) iter = zip(book_context_refs, all_raw_refs) if verbose: iter = tqdm(iter, total=len(book_context_refs)) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 04f0205aea..ca8d649625 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -265,7 +265,7 @@ class TestResolveRawRef: def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_trefs, expected_pretty_texts): context_oref = context_tref and Ref(context_tref) ref_resolver = library.get_ref_resolver(lang) - resolved = ref_resolver.bulk_resolve_refs([context_oref], [input_str])[0] + resolved = ref_resolver.bulk_resolve([input_str], [context_oref])[0] assert len(resolved) == len(expected_trefs) resolved_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in resolved], []), key=lambda x: x.normal()) if len(expected_trefs) != len(resolved_orefs): From 99ed52a87719b637cc1982dbfd25b35a896d524b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 2 Nov 2023 14:39:15 +0200 Subject: [PATCH 024/210] refactor(linker): break down complex function --- sefaria/model/linker/ref_resolver.py | 36 ++++++++++++++++++---------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 59822a7aaf..0529268fbc 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -172,6 +172,9 @@ def pretty_text(self): return self.resolved_raw_refs[0].pretty_text +PossiblyAmbigResolvedRef = Union[ResolvedRef, AmbiguousResolvedRef] + + class TermMatcher: """ Used to match raw ref parts to non-unique terms naively. @@ -256,7 +259,7 @@ def reset_ibid_history(self): def bulk_resolve(self, input: List[str], book_context_refs: Optional[List[Optional[text.Ref]]] = None, with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) \ - -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: + -> List[List[PossiblyAmbigResolvedRef]]: """ Main function for resolving refs in text. 
Given a list of texts, returns ResolvedRefs for each @param book_context_refs: @@ -280,23 +283,30 @@ def bulk_resolve(self, input: List[str], book_context_refs: Optional[List[Option self.reset_ibid_history() inner_resolved = [] for raw_ref in raw_refs: - temp_resolved = self.resolve_raw_ref(book_context_ref, raw_ref) - if len(temp_resolved) == 0: - self.reset_ibid_history() - if with_failures: - inner_resolved += [ResolvedRef(raw_ref, [], None, None, context_ref=book_context_ref)] - elif any(r.is_ambiguous for r in temp_resolved): - # can't be sure about future ibid inferences - # TODO can probably salvage parts of history if matches are ambiguous within one book - self.reset_ibid_history() - else: - self._ibid_history.last_refs = temp_resolved[-1].ref + temp_resolved = self._resolve_raw_ref_and_update_ibid_history(raw_ref, book_context_ref, with_failures) inner_resolved += temp_resolved resolved += [inner_resolved] raw_ref_list_list = [[rr.raw_ref for rr in inner_resolved] for inner_resolved in resolved] self._named_entity_recognizer.bulk_map_normal_output_to_original_input(input, raw_ref_list_list) return resolved + def _resolve_raw_ref_and_update_ibid_history(self, raw_ref: RawRef, book_context_ref: text.Ref, with_failures=False) -> List[PossiblyAmbigResolvedRef]: + temp_resolved = self.resolve_raw_ref(book_context_ref, raw_ref) + self._update_ibid_history(temp_resolved) + if len(temp_resolved) == 0 and with_failures: + return [ResolvedRef(raw_ref, [], None, None, context_ref=book_context_ref)] + return temp_resolved + + def _update_ibid_history(self, temp_resolved: List[PossiblyAmbigResolvedRef]): + if len(temp_resolved) == 0: + self.reset_ibid_history() + elif any(r.is_ambiguous for r in temp_resolved): + # can't be sure about future ibid inferences + # TODO can probably salvage parts of history if matches are ambiguous within one book + self.reset_ibid_history() + else: + self._ibid_history.last_refs = temp_resolved[-1].ref + def get_ner(self) -> NamedEntityRecognizer: return self._named_entity_recognizer @@ -331,7 +341,7 @@ def split_non_cts_parts(self, raw_ref: RawRef) -> List[RawRef]: def set_thoroughness(self, thoroughness: ResolutionThoroughness) -> None: self._thoroughness = thoroughness - def resolve_raw_ref(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[Union[ResolvedRef, AmbiguousResolvedRef]]: + def resolve_raw_ref(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[PossiblyAmbigResolvedRef]: split_raw_refs = self.split_non_cts_parts(raw_ref) resolved_list = [] for i, temp_raw_ref in enumerate(split_raw_refs): From 27be2c09442ce9bd2aa5c4966840c80a72df905b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 5 Nov 2023 15:10:14 +0200 Subject: [PATCH 025/210] test(linker): fix test_map_new_indices test --- sefaria/model/linker/tests/linker_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index ca8d649625..4514a3fe3f 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -411,7 +411,8 @@ def test_map_new_indices(crrd_params): # test assert norm_raw_ref.text == norm_text.strip() - norm_raw_ref.map_new_indices(doc, indices, part_indices) + norm_raw_ref.map_new_indices(doc, indices) + norm_raw_ref.map_new_part_indices(part_indices) assert norm_raw_ref.text == raw_ref.text for norm_part, part in zip(norm_raw_ref.raw_ref_parts, raw_ref.raw_ref_parts): assert norm_part.text == 
part.text From 1976dde4c8219ab65cf362bc7063fd9fa44f702d Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 5 Nov 2023 15:11:14 +0200 Subject: [PATCH 026/210] fix(linker): add alignment_mode to allow for differentiation in how ranges are dealt with in NormalizerComposer --- sefaria/helper/normalization.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index b1b450d97f..2a10f25eed 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -115,18 +115,21 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False return removal_map @staticmethod - def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, reverse=False): + def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, reverse=False, alignment_mode='contract'): """ normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 removal_map - return value of get_mapping_after_normalization() reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() + alignment_mode - How to deal with cases where the end of a range touches a removal. Use "expand" if the removal should be included in the range. "contract" if it should be excluded. """ removal_keys = sorted(removal_map.keys()) unnormalized_indices = [] sign = -1 if reverse else 1 for start, end in normalized_indices: unnorm_start_index = bisect_right(removal_keys, start) - 1 - unnorm_end_index = bisect_right(removal_keys, end) - 1 + + bisect_end_index = end if (start == end or alignment_mode == 'expand') else end - 1 + unnorm_end_index = bisect_right(removal_keys, bisect_end_index) - 1 unnorm_start = start if unnorm_start_index < 0 else start + (sign * removal_map[removal_keys[unnorm_start_index]]) unnorm_end = end if unnorm_end_index < 0 else end + (sign * removal_map[removal_keys[unnorm_end_index]]) @@ -266,7 +269,7 @@ def find_text_to_remove(self, s, **kwargs): else: text_to_remove_inds, text_to_remove_repls = zip(*curr_text_to_remove) for mapping in reversed(mappings): - text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping) + text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping, alignment_mode='expand') curr_text_to_remove = list(zip(text_to_remove_inds, text_to_remove_repls)) # merge any overlapping ranges From f022c239512e4ee76a0f7efe58b02b6d2d47d4c4 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 8 Nov 2023 09:15:56 +0200 Subject: [PATCH 027/210] feat: scaffolding for linker being central class to handle refs and named entities --- sefaria/model/linker/linker.py | 27 ++++++++++ sefaria/model/linker/named_entity_resolver.py | 53 +++++++++++++++---- 2 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 sefaria/model/linker/linker.py diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py new file mode 100644 index 0000000000..c8160ced2c --- /dev/null +++ b/sefaria/model/linker/linker.py @@ -0,0 +1,27 @@ +from typing import List, Optional, Union +from sefaria.model.text import Ref +from sefaria.model.linker.ref_part import RawRef +from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness +from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, 
NamedEntityRecognizer + + +class Linker: + + def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, ne_recognizer: NamedEntityRecognizer): + self._ref_resolver = ref_resolver + self._ne_resolver = ne_resolver + self._ne_recognizer = ne_recognizer + + def bulk_link(self, input: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, + verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL): + all_named_entities = self._ne_recognizer.bulk_get_raw_named_entities(input) + resolved = [] + for inner_named_entities in all_named_entities: + for named_entity in inner_named_entities: + if isinstance(named_entity, RawRef): + # resolve with ref resolver + pass + else: + # resolve with ne resolver + pass + return resolved diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 6356345caf..12acbce5d4 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,5 +1,5 @@ import dataclasses -from typing import List, Generator, Optional, Dict, Type, Set +from typing import List, Generator, Optional, Dict, Type, Set, Tuple try: import re2 as re re.set_fallback_notification(re.FALLBACK_WARNING) @@ -44,6 +44,26 @@ def __init_normalizer(self) -> NormalizerComposer: return NormalizerComposer(normalizer_steps) def bulk_get_raw_named_entities(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Return all RawNamedEntity's in `inputs`. If the entity is a citation, parse out the inner RawRefParts and create + RawRefs. + @param inputs: List of strings to search for named entities in. + @return: 2D list of RawNamedEntity's. Includes RawRefs which are a subtype of RawNamedEntity + """ + all_raw_named_entities = self._bulk_get_raw_named_entities_wo_raw_refs(inputs) + all_citations, all_non_citations = self._partition_named_entities_by_citation_type(all_raw_named_entities) + all_raw_refs = self._bulk_parse_raw_refs(all_citations) + merged_entities = [] + for inner_non_citations, inner_citations in zip(all_non_citations, all_raw_refs): + merged_entities += [inner_non_citations + inner_citations] + return merged_entities + + def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Finds RawNamedEntities in `inputs` but doesn't parse citations into RawRefs with RawRefParts + @param inputs: + @return: + """ normalized_inputs = self._normalize_input(inputs) all_raw_ref_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) all_raw_named_entities = [] @@ -55,12 +75,26 @@ def bulk_get_raw_named_entities(self, inputs: List[str]) -> List[List[RawNamedEn all_raw_named_entities += [temp_raw_named_entities] return all_raw_named_entities - def bulk_get_raw_named_entities_by_type(self, inputs: List[str], type_filter: NamedEntityType): - all_raw_named_entities = self.bulk_get_raw_named_entities(inputs) - return [[named_entity for named_entity in sublist if named_entity.type == type_filter] - for sublist in all_raw_named_entities] - - def bulk_get_raw_refs(self, inputs: List[str]) -> List[List[RawRef]]: + @staticmethod + def _partition_named_entities_by_citation_type( + all_raw_named_entities: List[List[RawNamedEntity]] + ) -> Tuple[List[List[RawNamedEntity]], List[List[RawNamedEntity]]]: + """ + Given named entities, partition them into two lists; list of entities that are citations and those that aren't. 
+ @param all_raw_named_entities: + @return: + """ + citations, non_citations = [], [] + for sublist in all_raw_named_entities: + inner_citations, inner_non_citations = [], [] + for named_entity in sublist: + inner_list = inner_citations if named_entity.type == NamedEntityType.CITATION else inner_non_citations + inner_list += [named_entity] + citations += [inner_citations] + non_citations += [inner_non_citations] + return citations, non_citations + + def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]]) -> List[List[RawRef]]: """ Runs models on inputs to locate all refs and ref parts Note: takes advantage of bulk spaCy operations. It is more efficient to pass multiple strings in input than to @@ -68,15 +102,14 @@ def bulk_get_raw_refs(self, inputs: List[str]) -> List[List[RawRef]]: @param inputs: List of strings to search for refs in. @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. """ - all_ref_entities = self.bulk_get_raw_named_entities_by_type(inputs, NamedEntityType.CITATION) - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_ref_entities), []) + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_citation_entities), []) all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) all_raw_ref_part_span_map = defaultdict(list) for ref_part_span, input_idx in all_raw_ref_part_spans: all_raw_ref_part_span_map[input_idx] += [ref_part_span] all_raw_refs = [] - for input_idx, named_entities in enumerate(all_ref_entities): + for input_idx, named_entities in enumerate(all_citation_entities): raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] return all_raw_refs From 17978882baf1ada500218faaf75a72cbf94039a8 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 8 Nov 2023 10:04:46 +0200 Subject: [PATCH 028/210] feat: finalize bulk_link function --- sefaria/model/linker/linker.py | 45 +++++++++++++------ sefaria/model/linker/named_entity_resolver.py | 16 +++---- sefaria/model/linker/ref_resolver.py | 33 ++++---------- 3 files changed, 46 insertions(+), 48 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index c8160ced2c..4c8437517f 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -1,6 +1,7 @@ -from typing import List, Optional, Union +from typing import List, Optional, Union, Iterable, Tuple +from tqdm import tqdm from sefaria.model.text import Ref -from sefaria.model.linker.ref_part import RawRef +from sefaria.model.linker.ref_part import RawRef, RawNamedEntity from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, NamedEntityRecognizer @@ -12,16 +13,34 @@ def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, self._ne_resolver = ne_resolver self._ne_recognizer = ne_recognizer - def bulk_link(self, input: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, - verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL): - all_named_entities = self._ne_recognizer.bulk_get_raw_named_entities(input) + def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, + verbose=False, 
thoroughness=ResolutionThoroughness.NORMAL): + all_named_entities = self._ne_recognizer.bulk_get_raw_named_entities(inputs) resolved = [] - for inner_named_entities in all_named_entities: - for named_entity in inner_named_entities: - if isinstance(named_entity, RawRef): - # resolve with ref resolver - pass - else: - # resolve with ne resolver - pass + iterable = self._get_bulk_link_iterable(all_named_entities, book_context_refs, verbose) + for book_context_ref, inner_named_entities in iterable: + raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + resolved += [resolved_refs + resolved_named_entities] + + named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] + self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) return resolved + + @staticmethod + def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[RawNamedEntity]) \ + -> Tuple[List[RawRef], List[RawNamedEntity]]: + raw_refs = [ne for ne in raw_refs_and_named_entities if isinstance(ne, RawRef)] + named_entities = [ne for ne in raw_refs_and_named_entities if not isinstance(ne, RawRef)] + return raw_refs, named_entities + + @staticmethod + def _get_bulk_link_iterable(all_named_entities: List[List[RawNamedEntity]], + book_context_refs: Optional[List[Optional[Ref]]] = None, verbose=False + ) -> Iterable[Tuple[Ref, List[RawNamedEntity]]]: + iterable = zip(book_context_refs, all_named_entities) + if verbose: + iterable = tqdm(iterable, total=len(book_context_refs)) + return iterable + diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 12acbce5d4..158fcde611 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -341,18 +341,12 @@ def __init__(self, named_entity_recognizer: NamedEntityRecognizer, topic_matcher self._named_entity_recognizer = named_entity_recognizer self._topic_matcher = topic_matcher - def bulk_resolve_named_entities(self, inputs: List[str], with_failures=False) -> List[List[ResolvedNamedEntity]]: - all_named_entities = self._named_entity_recognizer.bulk_get_raw_named_entities(inputs) + def bulk_resolve(self, raw_named_entities: List[RawNamedEntity], with_failures=False) -> List[ResolvedNamedEntity]: resolved = [] - for named_entities in all_named_entities: - temp_resolved = [] - for named_entity in named_entities: - matched_topics = self._topic_matcher.match(named_entity) - if len(matched_topics) > 0 or with_failures: - temp_resolved += [ResolvedNamedEntity(named_entity, matched_topics)] - resolved += [temp_resolved] - named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] - self._named_entity_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) + for named_entity in raw_named_entities: + matched_topics = self._topic_matcher.match(named_entity) + if len(matched_topics) > 0 or with_failures: + resolved += [ResolvedNamedEntity(named_entity, matched_topics)] return resolved diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 0529268fbc..9538166287 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ 
-7,7 +7,7 @@ from sefaria.model import text from sefaria.model import schema from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer -from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext, RawNamedEntity from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY from sefaria.model.linker.resolved_ref_refiner_factory import resolved_ref_refiner_factory @@ -257,37 +257,22 @@ def __init__(self, lang: str, named_entity_recognizer: NamedEntityRecognizer, def reset_ibid_history(self): self._ibid_history = IbidHistory() - def bulk_resolve(self, input: List[str], book_context_refs: Optional[List[Optional[text.Ref]]] = None, with_failures=False, - verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) \ - -> List[List[PossiblyAmbigResolvedRef]]: + def bulk_resolve(self, raw_refs: List[RawRef], book_context_ref: Optional[text.Ref] = None, + with_failures=False, thoroughness=ResolutionThoroughness.NORMAL) -> List[PossiblyAmbigResolvedRef]: """ - Main function for resolving refs in text. Given a list of texts, returns ResolvedRefs for each - @param book_context_refs: - @param input: + Main function for resolving refs in text. Given a list of RawRefs, returns ResolvedRefs for each + @param raw_refs: + @param book_context_ref: @param with_failures: - @param verbose: - @param reset_ibids_every_context_ref: @param thoroughness: how thorough should the search be. More thorough == slower. 
Currently "normal" will avoid searching for DH matches at book level and avoid filtering empty refs @return: """ self._thoroughness = thoroughness self.reset_ibid_history() - all_raw_refs = self._named_entity_recognizer.bulk_get_raw_refs(input) resolved = [] - book_context_refs = book_context_refs or [None]*len(all_raw_refs) - iter = zip(book_context_refs, all_raw_refs) - if verbose: - iter = tqdm(iter, total=len(book_context_refs)) - for book_context_ref, raw_refs in iter: - if reset_ibids_every_context_ref: - self.reset_ibid_history() - inner_resolved = [] - for raw_ref in raw_refs: - temp_resolved = self._resolve_raw_ref_and_update_ibid_history(raw_ref, book_context_ref, with_failures) - inner_resolved += temp_resolved - resolved += [inner_resolved] - raw_ref_list_list = [[rr.raw_ref for rr in inner_resolved] for inner_resolved in resolved] - self._named_entity_recognizer.bulk_map_normal_output_to_original_input(input, raw_ref_list_list) + for raw_ref in raw_refs: + temp_resolved = self._resolve_raw_ref_and_update_ibid_history(raw_ref, book_context_ref, with_failures) + resolved += temp_resolved return resolved def _resolve_raw_ref_and_update_ibid_history(self, raw_ref: RawRef, book_context_ref: text.Ref, with_failures=False) -> List[PossiblyAmbigResolvedRef]: From 766a0952a1e6dbb0d5f71a3ebe8d99a73da6efbc Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 8 Nov 2023 14:57:36 +0200 Subject: [PATCH 029/210] refactor: make ResolveRef a child of ResolvedNamedEntity --- sefaria/model/linker/named_entity_resolver.py | 24 +++++++++++++----- sefaria/model/linker/ref_resolver.py | 25 +++++++------------ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 158fcde611..d8102f3e86 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -11,6 +11,7 @@ from sefaria.helper.normalization import NormalizerComposer from sefaria.model.topic import Topic, TopicSet, RefTopicLink from sefaria.utils.hebrew import strip_cantillation +from sefaria.system.exceptions import InputError try: import spacy @@ -223,14 +224,25 @@ def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEn class ResolvedNamedEntity: + is_ambiguous = False - def __init__(self, raw_named_entity: RawNamedEntity, topics: List[Topic]): - self.raw_named_entity = raw_named_entity - self.topics = topics + def __init__(self, raw_named_entity: RawNamedEntity, topic: Topic): + self.raw_entity = raw_named_entity + self.topic = topic - @property - def is_ambiguous(self) -> bool: - return len(self.topics) > 1 + +class AmbiguousNamedEntity: + """ + Container for multiple ambiguous ResolvedNamedEntity's + """ + is_ambiguous = True + + def __init__(self, resolved_named_entities: List[ResolvedNamedEntity]): + if len(resolved_named_entities) == 0: + raise InputError("Length of `resolved_named_entities` must be at least 1") + self.resolved_named_entities = resolved_named_entities + # assumption is all resolved_refs share same raw_entity. 
expose at top level + self.raw_entity = resolved_named_entities[0].raw_entity class TitleGenerator: diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 9538166287..f2742f3f41 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -6,7 +6,7 @@ from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema -from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer +from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer, ResolvedNamedEntity, AmbiguousNamedEntity from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext, RawNamedEntity from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY @@ -42,14 +42,14 @@ class ResolutionThoroughness(IntEnum): HIGH = 2 -class ResolvedRef(abst.Cloneable): +class ResolvedRef(abst.Cloneable, ResolvedNamedEntity): """ Partial or complete resolution of a RawRef """ - is_ambiguous = False def __init__(self, raw_ref: RawRef, resolved_parts: List[RawRefPart], node, ref: text.Ref, context_ref: text.Ref = None, context_type: ContextType = None, context_parts: List[ContextPart] = None, _thoroughness=ResolutionThoroughness.NORMAL, _matched_dh_map=None) -> None: - self.raw_ref = raw_ref + super().__init__(raw_ref) + self.raw_entity = raw_ref self.resolved_parts = resolved_parts self.node: ReferenceableBookNode = node self.ref = ref @@ -71,7 +71,7 @@ def pretty_text(self) -> str: - adds extra DH words that were matched but aren't in span @return: """ - new_raw_ref_span = self._get_pretty_dh_span(self.raw_ref.span) + new_raw_ref_span = self._get_pretty_dh_span(self.raw_entity.span) new_raw_ref_span = self._get_pretty_end_paren_span(new_raw_ref_span) return new_raw_ref_span.text @@ -154,17 +154,10 @@ def order_key(self): return len(explicit_matched), num_context_parts_matched -class AmbiguousResolvedRef: +class AmbiguousResolvedRef(AmbiguousNamedEntity): """ Container for multiple ambiguous ResolvedRefs """ - is_ambiguous = True - - def __init__(self, resolved_refs: List[ResolvedRef]): - if len(resolved_refs) == 0: - raise InputError("Length of `resolved_refs` must be at least 1") - self.resolved_raw_refs = resolved_refs - self.raw_ref = resolved_refs[0].raw_ref # assumption is all resolved_refs share same raw_ref. 
expose at top level @property def pretty_text(self): @@ -453,7 +446,7 @@ def refine_ref_part_matches(self, book_context_ref: Optional[text.Ref], matches: temp_matches = [] refs_matched = {match.ref.normal() for match in matches} for unrefined_match in matches: - unused_parts = list(set(unrefined_match.raw_ref.parts_to_match) - set(unrefined_match.resolved_parts)) + unused_parts = list(set(unrefined_match.raw_entity.parts_to_match) - set(unrefined_match.resolved_parts)) context_free_matches = self._get_refined_ref_part_matches_recursive(unrefined_match, unused_parts) # context @@ -601,7 +594,7 @@ def do_explicit_sections_match_before_context_sections(match: ResolvedRef) -> bo @staticmethod def matched_all_explicit_sections(match: ResolvedRef) -> bool: resolved_explicit = set(match.get_resolved_parts(exclude={ContextPart})) - to_match_explicit = {part for part in match.raw_ref.parts_to_match if not part.is_context} + to_match_explicit = {part for part in match.raw_entity.parts_to_match if not part.is_context} if match.context_type in CONTEXT_TO_REF_PART_TYPE.keys(): # remove an equivalent number of context parts that were resolved from to_match_explicit to approximate @@ -672,7 +665,7 @@ def remove_incorrect_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedR @staticmethod def get_context_free_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedRef]: def match_is_context_free(match: ResolvedRef) -> bool: - return match.context_ref is None and set(match.get_resolved_parts()) == set(match.raw_ref.parts_to_match) + return match.context_ref is None and set(match.get_resolved_parts()) == set(match.raw_entity.parts_to_match) return list(filter(match_is_context_free, resolved_refs)) @staticmethod From 00e116bb533e085814bf93758c9757e719527c9e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 8 Nov 2023 15:01:51 +0200 Subject: [PATCH 030/210] refactor: more refactoring of raw_ref to raw_entity --- sefaria/helper/linker.py | 12 ++++++------ sefaria/model/linker/resolved_ref_refiner.py | 2 +- sefaria/model/linker/tests/linker_test.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 2083210bca..b6ab7c5e5e 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -184,7 +184,7 @@ def _make_find_refs_response_inner(resolved: List[List[Union[AmbiguousResolvedRe resolved_ref_list = [resolved_ref for inner_resolved in resolved for resolved_ref in inner_resolved] for resolved_ref in resolved_ref_list: resolved_refs = resolved_ref.resolved_raw_refs if resolved_ref.is_ambiguous else [resolved_ref] - start_char, end_char = resolved_ref.raw_ref.char_indices + start_char, end_char = resolved_ref.raw_entity.char_indices text = resolved_ref.pretty_text link_failed = resolved_refs[0].ref is None if not link_failed and resolved_refs[0].ref.is_book_level(): continue @@ -251,10 +251,10 @@ def _get_ref_text_by_lang_for_linker(oref: text.Ref, lang: str, options: _FindRe def _make_debug_response_for_linker(resolved_ref: ResolvedRef) -> dict: debug_data = { - "orig_part_strs": [p.text for p in resolved_ref.raw_ref.raw_ref_parts], - "orig_part_types": [p.type.name for p in resolved_ref.raw_ref.raw_ref_parts], - "final_part_strs": [p.text for p in resolved_ref.raw_ref.parts_to_match], - "final_part_types": [p.type.name for p in resolved_ref.raw_ref.parts_to_match], + "orig_part_strs": [p.text for p in resolved_ref.raw_entity.raw_ref_parts], + "orig_part_types": [p.type.name for p in 
resolved_ref.raw_entity.raw_ref_parts], + "final_part_strs": [p.text for p in resolved_ref.raw_entity.parts_to_match], + "final_part_types": [p.type.name for p in resolved_ref.raw_entity.parts_to_match], "resolved_part_strs": [p.term.slug if isinstance(p, TermContext) else p.text for p in resolved_ref.resolved_parts], "resolved_part_types": [p.type.name for p in resolved_ref.resolved_parts], "resolved_part_classes": [p.__class__.__name__ for p in resolved_ref.resolved_parts], @@ -262,7 +262,7 @@ def _make_debug_response_for_linker(resolved_ref: ResolvedRef) -> dict: "context_type": resolved_ref.context_type.name if resolved_ref.context_type else None, } if RefPartType.RANGE.name in debug_data['final_part_types']: - range_part = next((p for p in resolved_ref.raw_ref.parts_to_match if p.type == RefPartType.RANGE), None) + range_part = next((p for p in resolved_ref.raw_entity.parts_to_match if p.type == RefPartType.RANGE), None) debug_data.update({ 'input_range_sections': [p.text for p in range_part.sections], 'input_range_to_sections': [p.text for p in range_part.toSections] diff --git a/sefaria/model/linker/resolved_ref_refiner.py b/sefaria/model/linker/resolved_ref_refiner.py index 5219d168ce..88f364019b 100644 --- a/sefaria/model/linker/resolved_ref_refiner.py +++ b/sefaria/model/linker/resolved_ref_refiner.py @@ -36,7 +36,7 @@ def _has_prev_unused_numbered_ref_part(self) -> bool: Helper function to avoid matching AddressInteger sections out of order Returns True if there is a RawRefPart which immediately precedes `raw_ref_part` and is not yet included in this match """ - prev_part = self.resolved_ref.raw_ref.prev_num_parts_map.get(self.part_to_match, None) + prev_part = self.resolved_ref.raw_entity.prev_num_parts_map.get(self.part_to_match, None) if prev_part is None: return False return prev_part not in set(self.resolved_ref.resolved_parts) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 4514a3fe3f..1ab03cb330 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -275,7 +275,7 @@ def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_tref for expected_tref, matched_oref in zip(sorted(expected_trefs, key=lambda x: x), resolved_orefs): assert matched_oref == Ref(expected_tref) for match, expected_pretty_text in zip(resolved, expected_pretty_texts): - assert input_str[slice(*match.raw_ref.char_indices)] == match.raw_ref.text + assert input_str[slice(*match.raw_entity.char_indices)] == match.raw_entity.text assert match.pretty_text == expected_pretty_text From fdb9e80d0a0caf369aad7033871312af820844b8 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 09:46:50 +0200 Subject: [PATCH 031/210] fix: remove AmbiguousNamedEntity which isnt currently needed --- scripts/parse_rambi_webpages.py | 2 +- sefaria/model/linker/named_entity_resolver.py | 26 ++++++++----------- sefaria/model/linker/ref_resolver.py | 19 +++++++++----- 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/scripts/parse_rambi_webpages.py b/scripts/parse_rambi_webpages.py index 9014b53a34..b9f663dc35 100644 --- a/scripts/parse_rambi_webpages.py +++ b/scripts/parse_rambi_webpages.py @@ -63,7 +63,7 @@ def get_refs_from_string(string): ref_resolver = library.get_ref_resolver() if lang == 'he': # remove this line when linker v3 is availabe in English refs = ref_resolver.bulk_resolve(lang, [None], [string]) - refs = {y.ref for x in refs for y in x if type(y) != AmbiguousResolvedRef} 
+ refs = {y.ref for x in refs for y in x if not y.is_ambiguous} else: # remove else statement (with its content) when linker v3 is availabe in English refs = set() library.apply_action_for_all_refs_in_string(re.sub('[\(\)]', '', string), lambda x, y: refs.add(x), 'en', citing_only=True) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index d8102f3e86..5b71e5c9c8 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -224,25 +224,21 @@ def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEn class ResolvedNamedEntity: - is_ambiguous = False - def __init__(self, raw_named_entity: RawNamedEntity, topic: Topic): + def __init__(self, raw_named_entity: RawNamedEntity, topics: List[Topic]): self.raw_entity = raw_named_entity - self.topic = topic + self.topics = topics + @property + def topic(self): + if len(self.topics) != 1: + raise InputError(f"ResolvedNamedEntity is ambiguous and has {len(self.topics)} topics so you can't access " + ".topic.") + return self.topics[0] -class AmbiguousNamedEntity: - """ - Container for multiple ambiguous ResolvedNamedEntity's - """ - is_ambiguous = True - - def __init__(self, resolved_named_entities: List[ResolvedNamedEntity]): - if len(resolved_named_entities) == 0: - raise InputError("Length of `resolved_named_entities` must be at least 1") - self.resolved_named_entities = resolved_named_entities - # assumption is all resolved_refs share same raw_entity. expose at top level - self.raw_entity = resolved_named_entities[0].raw_entity + @property + def is_ambiguous(self): + return len(self.topics) != 1 class TitleGenerator: diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index f2742f3f41..0d3f24ea63 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -6,7 +6,7 @@ from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema -from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer, ResolvedNamedEntity, AmbiguousNamedEntity +from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer, ResolvedNamedEntity from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext, RawNamedEntity from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY @@ -42,14 +42,14 @@ class ResolutionThoroughness(IntEnum): HIGH = 2 -class ResolvedRef(abst.Cloneable, ResolvedNamedEntity): +class ResolvedRef(abst.Cloneable): """ Partial or complete resolution of a RawRef """ + is_ambiguous = False - def __init__(self, raw_ref: RawRef, resolved_parts: List[RawRefPart], node, ref: text.Ref, context_ref: text.Ref = None, context_type: ContextType = None, context_parts: List[ContextPart] = None, _thoroughness=ResolutionThoroughness.NORMAL, _matched_dh_map=None) -> None: - super().__init__(raw_ref) - self.raw_entity = raw_ref + def __init__(self, raw_entity: RawRef, resolved_parts: List[RawRefPart], node, ref: text.Ref, context_ref: text.Ref = None, context_type: ContextType = None, context_parts: List[ContextPart] = None, _thoroughness=ResolutionThoroughness.NORMAL, _matched_dh_map=None) -> None: + self.raw_entity = raw_entity self.resolved_parts = resolved_parts self.node: 
ReferenceableBookNode = node self.ref = ref @@ -154,10 +154,17 @@ def order_key(self): return len(explicit_matched), num_context_parts_matched -class AmbiguousResolvedRef(AmbiguousNamedEntity): +class AmbiguousResolvedRef: """ Container for multiple ambiguous ResolvedRefs """ + is_ambiguous = True + + def __init__(self, resolved_refs: List[ResolvedRef]): + if len(resolved_refs) == 0: + raise InputError("Length of `resolved_refs` must be at least 1") + self.resolved_raw_refs = resolved_refs + self.raw_ref = resolved_refs[0].raw_entity # assumption is all resolved_refs share same raw_ref. expose at top level @property def pretty_text(self): From 10300f35e8cafd6cdaff35d3b17587fafff50e76 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 10:18:14 +0200 Subject: [PATCH 032/210] feat: add many non-bulk functions to complement bulk functions --- sefaria/model/linker/linker.py | 23 ++++++-- sefaria/model/linker/named_entity_resolver.py | 55 +++++++++++++++---- 2 files changed, 61 insertions(+), 17 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 4c8437517f..4d18682803 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -1,9 +1,16 @@ +import dataclasses from typing import List, Optional, Union, Iterable, Tuple from tqdm import tqdm from sefaria.model.text import Ref from sefaria.model.linker.ref_part import RawRef, RawNamedEntity -from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness -from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, NamedEntityRecognizer +from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness, PossiblyAmbigResolvedRef +from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, NamedEntityRecognizer, ResolvedNamedEntity + + +@dataclasses.dataclass +class LinkedDoc: + resolved_refs: List[PossiblyAmbigResolvedRef] + resolved_named_entities: List[ResolvedNamedEntity] class Linker: @@ -14,15 +21,15 @@ def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, self._ne_recognizer = ne_recognizer def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, - verbose=False, thoroughness=ResolutionThoroughness.NORMAL): - all_named_entities = self._ne_recognizer.bulk_get_raw_named_entities(inputs) + verbose=False, thoroughness=ResolutionThoroughness.NORMAL) -> List[LinkedDoc]: + all_named_entities = self._ne_recognizer.bulk_recognize(inputs) resolved = [] iterable = self._get_bulk_link_iterable(all_named_entities, book_context_refs, verbose) for book_context_ref, inner_named_entities in iterable: raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - resolved += [resolved_refs + resolved_named_entities] + resolved += [LinkedDoc(resolved_refs, resolved_named_entities)] named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) @@ -44,3 +51,9 @@ def _get_bulk_link_iterable(all_named_entities: List[List[RawNamedEntity]], iterable = tqdm(iterable, total=len(book_context_refs)) return iterable + def link(self, input_str: str, book_context_ref: Optional[Ref] = None, 
with_failures=False, + thoroughness=ResolutionThoroughness.NORMAL) -> LinkedDoc: + raw_refs, named_entities = self._ne_recognizer.recognize(input_str) + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + return LinkedDoc(resolved_refs, resolved_named_entities) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 5b71e5c9c8..ff37411fd8 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -44,7 +44,7 @@ def __init_normalizer(self) -> NormalizerComposer: normalizer_steps += ['maqaf', 'cantillation'] return NormalizerComposer(normalizer_steps) - def bulk_get_raw_named_entities(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + def bulk_recognize(self, inputs: List[str]) -> List[List[RawNamedEntity]]: """ Return all RawNamedEntity's in `inputs`. If the entity is a citation, parse out the inner RawRefParts and create RawRefs. @@ -52,13 +52,19 @@ def bulk_get_raw_named_entities(self, inputs: List[str]) -> List[List[RawNamedEn @return: 2D list of RawNamedEntity's. Includes RawRefs which are a subtype of RawNamedEntity """ all_raw_named_entities = self._bulk_get_raw_named_entities_wo_raw_refs(inputs) - all_citations, all_non_citations = self._partition_named_entities_by_citation_type(all_raw_named_entities) + all_citations, all_non_citations = self._bulk_partition_named_entities_by_citation_type(all_raw_named_entities) all_raw_refs = self._bulk_parse_raw_refs(all_citations) merged_entities = [] for inner_non_citations, inner_citations in zip(all_non_citations, all_raw_refs): merged_entities += [inner_non_citations + inner_citations] return merged_entities + def recognize(self, input_str: str) -> Tuple[List[RawRef], List[RawNamedEntity]]: + raw_named_entities = self._get_raw_named_entities_wo_raw_refs(input_str) + citations, non_citations = self._partition_named_entities_by_citation_type(raw_named_entities) + raw_refs = self._parse_raw_refs(citations) + return raw_refs, non_citations + def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[List[RawNamedEntity]]: """ Finds RawNamedEntities in `inputs` but doesn't parse citations into RawRefs with RawRefParts @@ -66,18 +72,32 @@ def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[Li @return: """ normalized_inputs = self._normalize_input(inputs) - all_raw_ref_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) + all_raw_named_entity_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) all_raw_named_entities = [] - for raw_ref_spans in all_raw_ref_spans: + for raw_named_entity_spans in all_raw_named_entity_spans: temp_raw_named_entities = [] - for span in raw_ref_spans: - type = NamedEntityType.span_label_to_enum(span.label_) - temp_raw_named_entities += [RawNamedEntity(span, type)] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + temp_raw_named_entities += [RawNamedEntity(span, ne_type)] all_raw_named_entities += [temp_raw_named_entities] return all_raw_named_entities + def _get_raw_named_entities_wo_raw_refs(self, input_str: str) -> List[RawNamedEntity]: + """ + Finds RawNamedEntities in `input_str` but doesn't parse citations into RawRefs with RawRefParts + @param input_str: + @return: + """ + normalized_input = 
self._normalize_input([input_str])[0] + raw_named_entity_spans = self._get_raw_named_entity_spans(normalized_input) + raw_named_entities = [] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + raw_named_entities += [RawNamedEntity(span, ne_type)] + return raw_named_entities + @staticmethod - def _partition_named_entities_by_citation_type( + def _bulk_partition_named_entities_by_citation_type( all_raw_named_entities: List[List[RawNamedEntity]] ) -> Tuple[List[List[RawNamedEntity]], List[List[RawNamedEntity]]]: """ @@ -87,14 +107,21 @@ def _partition_named_entities_by_citation_type( """ citations, non_citations = [], [] for sublist in all_raw_named_entities: - inner_citations, inner_non_citations = [], [] - for named_entity in sublist: - inner_list = inner_citations if named_entity.type == NamedEntityType.CITATION else inner_non_citations - inner_list += [named_entity] + inner_citations, inner_non_citations = NamedEntityRecognizer._partition_named_entities_by_citation_type(sublist) citations += [inner_citations] non_citations += [inner_non_citations] return citations, non_citations + @staticmethod + def _partition_named_entities_by_citation_type( + raw_named_entities: List[RawNamedEntity] + ) -> Tuple[List[RawNamedEntity], List[RawNamedEntity]]: + citations, non_citations = [], [] + for named_entity in raw_named_entities: + curr_list = citations if named_entity.type == NamedEntityType.CITATION else non_citations + curr_list += [named_entity] + return citations, non_citations + def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]]) -> List[List[RawRef]]: """ Runs models on inputs to locate all refs and ref parts @@ -115,6 +142,10 @@ def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]] all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] return all_raw_refs + def _parse_raw_refs(self, citation_entities: List[RawNamedEntity]) -> List[RawRef]: + raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans([e.text for e in citation_entities])) + return self._bulk_make_raw_refs(citation_entities, raw_ref_part_spans) + def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): for temp_input, raw_ref_list in zip(input, raw_ref_list_list): self.map_normal_output_to_original_input(temp_input, raw_ref_list) From facd588fb47444b65fd83cade8e774c730f5f1a2 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 10:27:47 +0200 Subject: [PATCH 033/210] feat: expose linker from library class --- sefaria/model/linker/named_entity_resolver.py | 3 +- sefaria/model/linker/ref_resolver.py | 7 +-- sefaria/model/text.py | 50 +++++++++---------- 3 files changed, 25 insertions(+), 35 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index ff37411fd8..358511c5cc 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -376,8 +376,7 @@ def match(self, named_entity: RawNamedEntity) -> List[Topic]: class NamedEntityResolver: - def __init__(self, named_entity_recognizer: NamedEntityRecognizer, topic_matcher: TopicMatcher): - self._named_entity_recognizer = named_entity_recognizer + def __init__(self, topic_matcher: TopicMatcher): self._topic_matcher = topic_matcher def bulk_resolve(self, raw_named_entities: List[RawNamedEntity], with_failures=False) -> List[ResolvedNamedEntity]: diff --git 
a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 0d3f24ea63..a6f625cf90 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -244,11 +244,9 @@ def get_ref_by_title(self, title: str) -> Optional[text.Ref]: class RefResolver: - def __init__(self, lang: str, named_entity_recognizer: NamedEntityRecognizer, - ref_part_title_trie: MatchTemplateTrie, term_matcher: TermMatcher) -> None: + def __init__(self, lang: str, ref_part_title_trie: MatchTemplateTrie, term_matcher: TermMatcher) -> None: self._lang = lang - self._named_entity_recognizer = named_entity_recognizer self._ref_part_title_trie = ref_part_title_trie self._term_matcher = term_matcher self._ibid_history = IbidHistory() @@ -292,9 +290,6 @@ def _update_ibid_history(self, temp_resolved: List[PossiblyAmbigResolvedRef]): else: self._ibid_history.last_refs = temp_resolved[-1].ref - def get_ner(self) -> NamedEntityRecognizer: - return self._named_entity_recognizer - def get_ref_part_title_trie(self) -> MatchTemplateTrie: return self._ref_part_title_trie diff --git a/sefaria/model/text.py b/sefaria/model/text.py index bffe8cf7c5..a88a93b267 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -4865,8 +4865,7 @@ def __init__(self): self._simple_term_mapping = {} self._full_term_mapping = {} self._simple_term_mapping_json = None - self._ref_resolver_by_lang = {} - self._named_entity_resolver_by_lang = {} + self._linker_by_lang = {} # Topics self._topic_mapping = {} @@ -5601,24 +5600,32 @@ def _build_topic_mapping(self): self._topic_mapping = {t.slug: {"en": t.get_primary_title("en"), "he": t.get_primary_title("he")} for t in TopicSet()} return self._topic_mapping - def get_named_entity_resolver(self, lang: str, rebuild=False): - resolver = self._named_entity_resolver_by_lang.get(lang) - if not resolver or rebuild: - resolver = self.build_named_entity_resolver(lang) - return resolver + def get_linker(self, lang: str, rebuild=False): + linker = self._linker_by_lang.get(lang) + if not linker or rebuild: + linker = self.build_linker(lang) + return linker - def build_named_entity_resolver(self, lang: str): + def build_linker(self, lang: str): + from sefaria.model.linker.linker import Linker + + logger.info("Loading Spacy Model") + + named_entity_resolver = self._build_named_entity_resolver(lang) + ref_resolver = self._build_ref_resolver(lang) + named_entity_recognizer = self._build_named_entity_recognizer(lang) + self._linker_by_lang[lang] = Linker(ref_resolver, named_entity_resolver, named_entity_recognizer) + return self._linker_by_lang[lang] + + @staticmethod + def _build_named_entity_resolver(self, lang: str): from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver named_entity_types_to_topics = { "PERSON": {"ontology_roots": ['people'], "single_slugs": ['god', 'the-tetragrammaton']}, "GROUP": {'ontology_roots': ["group-of-people"]}, } - self._named_entity_resolver_by_lang[lang] = NamedEntityResolver( - self._build_named_entity_recognizer(lang), - TopicMatcher(lang, named_entity_types_to_topics) - ) - return self._named_entity_resolver_by_lang[lang] + return NamedEntityResolver(TopicMatcher(lang, named_entity_types_to_topics)) @staticmethod def _build_named_entity_recognizer(lang: str): @@ -5631,30 +5638,19 @@ def _build_named_entity_recognizer(lang: str): load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]) ) - def get_ref_resolver(self, lang: str, rebuild=False): - resolver = self._ref_resolver_by_lang.get(lang) - if not 
resolver or rebuild: - resolver = self.build_ref_resolver(lang) - return resolver - - def build_ref_resolver(self, lang: str): + def _build_ref_resolver(self, lang: str): from .linker.match_template import MatchTemplateTrie from .linker.ref_resolver import RefResolver, TermMatcher from sefaria.model.schema import NonUniqueTermSet - logger.info("Loading Spacy Model") - root_nodes = list(filter(lambda n: getattr(n, 'match_templates', None) is not None, self.get_index_forest())) alone_nodes = reduce(lambda a, b: a + b.index.get_referenceable_alone_nodes(), root_nodes, []) non_unique_terms = NonUniqueTermSet() - ner = self._build_named_entity_recognizer(lang) - self._ref_resolver_by_lang[lang] = RefResolver( - lang, ner, - MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), + return RefResolver( + lang, MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), TermMatcher(lang, non_unique_terms), ) - return self._ref_resolver_by_lang[lang] def get_index_forest(self): """ From bc3e919b1e30f8a94d6c35cdfd0dfef9debc32b7 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 10:43:10 +0200 Subject: [PATCH 034/210] refactor: move link() function to top of Linker --- sefaria/model/linker/linker.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 4d18682803..5278cf83eb 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -35,6 +35,13 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) return resolved + def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, + thoroughness=ResolutionThoroughness.NORMAL) -> LinkedDoc: + raw_refs, named_entities = self._ne_recognizer.recognize(input_str) + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + return LinkedDoc(resolved_refs, resolved_named_entities) + @staticmethod def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[RawNamedEntity]) \ -> Tuple[List[RawRef], List[RawNamedEntity]]: @@ -50,10 +57,3 @@ def _get_bulk_link_iterable(all_named_entities: List[List[RawNamedEntity]], if verbose: iterable = tqdm(iterable, total=len(book_context_refs)) return iterable - - def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, - thoroughness=ResolutionThoroughness.NORMAL) -> LinkedDoc: - raw_refs, named_entities = self._ne_recognizer.recognize(input_str) - resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) - resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - return LinkedDoc(resolved_refs, resolved_named_entities) From 619f1f862675bfdab69d076d8aece296845fa246 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 10:53:14 +0200 Subject: [PATCH 035/210] docs(linker): add documentation to linker functions --- sefaria/model/linker/linker.py | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 5278cf83eb..437df8d032 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -9,6 +9,7 @@ 
@dataclasses.dataclass class LinkedDoc: + text: str resolved_refs: List[PossiblyAmbigResolvedRef] resolved_named_entities: List[ResolvedNamedEntity] @@ -22,14 +23,26 @@ def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, verbose=False, thoroughness=ResolutionThoroughness.NORMAL) -> List[LinkedDoc]: + """ + Bulk operation to link every string in `inputs` with citations and named entities + `bulk_link()` is faster than running `link()` in a loop because it can pass all strings to the relevant models + at once. + @param inputs: String inputs. Each input is processed independently. + @param book_context_refs: Additional context references that represents the source book that the input came from. + @param with_failures: True to return all recognized entities, even if they weren't linked. + @param verbose: True to print progress to the console + @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. + @return: list of LinkedDocs + """ all_named_entities = self._ne_recognizer.bulk_recognize(inputs) resolved = [] - iterable = self._get_bulk_link_iterable(all_named_entities, book_context_refs, verbose) - for book_context_ref, inner_named_entities in iterable: + book_context_refs = book_context_refs or [None]*len(all_named_entities) + iterable = self._get_bulk_link_iterable(inputs, all_named_entities, book_context_refs, verbose) + for input_str, book_context_ref, inner_named_entities in iterable: raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - resolved += [LinkedDoc(resolved_refs, resolved_named_entities)] + resolved += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) @@ -37,10 +50,18 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, thoroughness=ResolutionThoroughness.NORMAL) -> LinkedDoc: + """ + Link `input_str` with citations and named entities + @param input_str: + @param book_context_ref: Additional context reference that represents the source book that the input came from. + @param with_failures: True to return all recognized entities, even if they weren't linked. + @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. 
+ @return: + """ raw_refs, named_entities = self._ne_recognizer.recognize(input_str) resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - return LinkedDoc(resolved_refs, resolved_named_entities) + return LinkedDoc(input_str, resolved_refs, resolved_named_entities) @staticmethod def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[RawNamedEntity]) \ @@ -50,10 +71,10 @@ def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[Raw return raw_refs, named_entities @staticmethod - def _get_bulk_link_iterable(all_named_entities: List[List[RawNamedEntity]], + def _get_bulk_link_iterable(inputs: List[str], all_named_entities: List[List[RawNamedEntity]], book_context_refs: Optional[List[Optional[Ref]]] = None, verbose=False ) -> Iterable[Tuple[Ref, List[RawNamedEntity]]]: - iterable = zip(book_context_refs, all_named_entities) + iterable = zip(inputs, book_context_refs, all_named_entities) if verbose: iterable = tqdm(iterable, total=len(book_context_refs)) return iterable From 27d5db73c1267aae137530a28bdf015ccd108a1e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 11:03:36 +0200 Subject: [PATCH 036/210] feat(linker): all control over what type of entities are returned from linker --- sefaria/model/linker/linker.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 437df8d032..ef6a7e19ee 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -22,7 +22,7 @@ def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, self._ne_recognizer = ne_recognizer def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, - verbose=False, thoroughness=ResolutionThoroughness.NORMAL) -> List[LinkedDoc]: + verbose=False, thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> List[LinkedDoc]: """ Bulk operation to link every string in `inputs` with citations and named entities `bulk_link()` is faster than running `link()` in a loop because it can pass all strings to the relevant models @@ -32,6 +32,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional @param with_failures: True to return all recognized entities, even if they weren't linked. @param verbose: True to print progress to the console @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. 
+ @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' @return: list of LinkedDocs """ all_named_entities = self._ne_recognizer.bulk_recognize(inputs) @@ -40,8 +41,11 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional iterable = self._get_bulk_link_iterable(inputs, all_named_entities, book_context_refs, verbose) for input_str, book_context_ref, inner_named_entities in iterable: raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) - resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) - resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + resolved_refs, resolved_named_entities = [], [] + if type_filter in {'all', 'citation'}: + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + if type_filter in {'all', 'named entity'}: + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) resolved += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] @@ -49,18 +53,22 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional return resolved def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, - thoroughness=ResolutionThoroughness.NORMAL) -> LinkedDoc: + thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> LinkedDoc: """ Link `input_str` with citations and named entities @param input_str: @param book_context_ref: Additional context reference that represents the source book that the input came from. @param with_failures: True to return all recognized entities, even if they weren't linked. @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. 
+ @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' @return: """ raw_refs, named_entities = self._ne_recognizer.recognize(input_str) - resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) - resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + resolved_refs, resolved_named_entities = [], [] + if type_filter in {'all', 'citation'}: + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + if type_filter in {'all', 'named entity'}: + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) return LinkedDoc(input_str, resolved_refs, resolved_named_entities) @staticmethod From c15ee6444439f85ab65754cde9943b565aac7aa3 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 13:23:11 +0200 Subject: [PATCH 037/210] fix(linker): remove extra param 'self' --- sefaria/model/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index a88a93b267..2f4d271691 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5618,7 +5618,7 @@ def build_linker(self, lang: str): return self._linker_by_lang[lang] @staticmethod - def _build_named_entity_resolver(self, lang: str): + def _build_named_entity_resolver(lang: str): from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver named_entity_types_to_topics = { From 8655a6aebfa43d3c24180264f2ed59a3bd108263 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 13:23:36 +0200 Subject: [PATCH 038/210] fix(linker): change field name from raw_ref to raw_entity to match ResolvedRef's field name --- sefaria/model/linker/ref_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index a6f625cf90..7c758dd2ab 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -164,7 +164,7 @@ def __init__(self, resolved_refs: List[ResolvedRef]): if len(resolved_refs) == 0: raise InputError("Length of `resolved_refs` must be at least 1") self.resolved_raw_refs = resolved_refs - self.raw_ref = resolved_refs[0].raw_entity # assumption is all resolved_refs share same raw_ref. expose at top level + self.raw_entity = resolved_refs[0].raw_entity # assumption is all resolved_refs share same raw_ref. 
expose at top level @property def pretty_text(self): From 10178b541076f5c86ea220f45ecdd3ba29b4ed1c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 13:24:01 +0200 Subject: [PATCH 039/210] fix(linker): properly collect all named entities from linkedDocs --- sefaria/model/linker/linker.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index ef6a7e19ee..0d2d81785d 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -13,6 +13,10 @@ class LinkedDoc: resolved_refs: List[PossiblyAmbigResolvedRef] resolved_named_entities: List[ResolvedNamedEntity] + @property + def all_resolved(self) -> List[Union[PossiblyAmbigResolvedRef, ResolvedNamedEntity]]: + return self.resolved_refs + self.resolved_named_entities + class Linker: @@ -36,7 +40,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional @return: list of LinkedDocs """ all_named_entities = self._ne_recognizer.bulk_recognize(inputs) - resolved = [] + docs = [] book_context_refs = book_context_refs or [None]*len(all_named_entities) iterable = self._get_bulk_link_iterable(inputs, all_named_entities, book_context_refs, verbose) for input_str, book_context_ref, inner_named_entities in iterable: @@ -46,11 +50,11 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) if type_filter in {'all', 'named entity'}: resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - resolved += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] + docs += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] - named_entity_list_list = [[rr.raw_named_entity for rr in inner_resolved] for inner_resolved in resolved] + named_entity_list_list = [[rr.raw_entity for rr in doc.all_resolved] for doc in docs] self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) - return resolved + return docs def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> LinkedDoc: From a99a7e26f260d9708419d240791603b0268f95cc Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 13:39:08 +0200 Subject: [PATCH 040/210] fix(linker): change build_ref_resolver() to build_linker() --- reader/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reader/views.py b/reader/views.py index 72f5b7fe81..2d76756d44 100644 --- a/reader/views.py +++ b/reader/views.py @@ -114,7 +114,7 @@ if ENABLE_LINKER: logger.info("Initializing Linker") - library.build_ref_resolver('he') + library.build_linker('he') if server_coordinator: server_coordinator.connect() From b35a18ce00a70814e11b30f8a1f27496fdbe556b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 13:53:57 +0200 Subject: [PATCH 041/210] refactor(linker): refactor linker api's usage of linker to match now syntax. 
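A rough sketch of the call pattern this commit moves the find-refs helper to (for illustration only; `lang`, `title` and `body` stand in for the request fields used in the diff below):

    linker = library.get_linker(lang)
    # resolve the title first; reuse it as context for the body when it is a
    # single, unambiguous citation
    title_doc = linker.link(title, type_filter='citation')
    context_ref = None
    if len(title_doc.resolved_refs) == 1 and not title_doc.resolved_refs[0].is_ambiguous:
        context_ref = title_doc.resolved_refs[0].ref
    body_doc = linker.link(body, context_ref, with_failures=True, type_filter='citation')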
--- sefaria/helper/linker.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index b6ab7c5e5e..9a438ad770 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -119,14 +119,16 @@ def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _F def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: - resolver = library.get_ref_resolver(request_text.lang) - resolved_title = resolver.bulk_resolve([request_text.title], [None]) - context_ref = resolved_title[0][0].ref if (len(resolved_title[0]) == 1 and not resolved_title[0][0].is_ambiguous) else None - resolved_body = resolver.bulk_resolve([request_text.body], [context_ref], with_failures=True) + linker = library.get_linker(request_text.lang) + title_doc = linker.link(request_text.title, type_filter='citation') + context_ref = None + if len(title_doc.resolved_refs) == 1 and not title_doc.resolved_refs[0].is_ambiguous: + context_ref = title_doc.resolved_refs[0].ref + body_doc = linker.link(request_text.body, context_ref, with_failures=True, type_filter='citation') response = { - "title": _make_find_refs_response_inner(resolved_title, options), - "body": _make_find_refs_response_inner(resolved_body, options), + "title": _make_find_refs_response_inner(title_doc.resolved_refs, options), + "body": _make_find_refs_response_inner(body_doc.resolved_refs, options), } return response From f2c9a96bbc52beb5b46794b6c210561b5932fe80 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 14:11:17 +0200 Subject: [PATCH 042/210] refactor(linker): refactor all usages of get_ref_resolver() --- scripts/parse_rambi_webpages.py | 6 +++--- sefaria/model/linker/linker.py | 20 ++++++++++++++----- sefaria/model/linker/tests/linker_test.py | 14 +++++++------ .../model/linker/tests/linker_test_utils.py | 4 ++-- sefaria/model/text.py | 2 +- 5 files changed, 29 insertions(+), 17 deletions(-) diff --git a/scripts/parse_rambi_webpages.py b/scripts/parse_rambi_webpages.py index b9f663dc35..d01ca88203 100644 --- a/scripts/parse_rambi_webpages.py +++ b/scripts/parse_rambi_webpages.py @@ -60,10 +60,10 @@ def get_refs_from_string(string): lang = 'he' if len(re.findall('[א-ת]', string)) > len(string) / 2 else 'en' if lang == 'en': string = translliterate_russian_to_latin(string) - ref_resolver = library.get_ref_resolver() + linker = library.get_linker('he') if lang == 'he': # remove this line when linker v3 is availabe in English - refs = ref_resolver.bulk_resolve(lang, [None], [string]) - refs = {y.ref for x in refs for y in x if not y.is_ambiguous} + doc = linker.link(string, type_filter='citation') + refs = {y.ref for y in doc.resolved_refs if not y.is_ambiguous} else: # remove else statement (with its content) when linker v3 is availabe in English refs = set() library.apply_action_for_all_refs_in_string(re.sub('[\(\)]', '', string), lambda x, y: refs.add(x), 'en', citing_only=True) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 0d2d81785d..2a3409d38d 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -20,10 +20,10 @@ def all_resolved(self) -> List[Union[PossiblyAmbigResolvedRef, ResolvedNamedEnti class Linker: - def __init__(self, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, ne_recognizer: NamedEntityRecognizer): + def __init__(self, ner: NamedEntityRecognizer, ref_resolver: RefResolver, ne_resolver: 
NamedEntityResolver, ): + self._ner = ner self._ref_resolver = ref_resolver self._ne_resolver = ne_resolver - self._ne_recognizer = ne_recognizer def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, verbose=False, thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> List[LinkedDoc]: @@ -39,7 +39,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' @return: list of LinkedDocs """ - all_named_entities = self._ne_recognizer.bulk_recognize(inputs) + all_named_entities = self._ner.bulk_recognize(inputs) docs = [] book_context_refs = book_context_refs or [None]*len(all_named_entities) iterable = self._get_bulk_link_iterable(inputs, all_named_entities, book_context_refs, verbose) @@ -53,7 +53,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional docs += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] named_entity_list_list = [[rr.raw_entity for rr in doc.all_resolved] for doc in docs] - self._ne_recognizer.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) + self._ner.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) return docs def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, @@ -67,7 +67,7 @@ def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_fail @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' @return: """ - raw_refs, named_entities = self._ne_recognizer.recognize(input_str) + raw_refs, named_entities = self._ner.recognize(input_str) resolved_refs, resolved_named_entities = [], [] if type_filter in {'all', 'citation'}: resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) @@ -75,6 +75,16 @@ def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_fail resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) return LinkedDoc(input_str, resolved_refs, resolved_named_entities) + def get_ner(self) -> NamedEntityRecognizer: + return self._ner + + def reset_ibid_history(self) -> None: + """ + Reflecting this function out + @return: + """ + self._ref_resolver.reset_ibid_history() + @staticmethod def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[RawNamedEntity]) \ -> Tuple[List[RawRef], List[RawNamedEntity]]: diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 1ab03cb330..bd27a89020 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -232,7 +232,8 @@ def test_resolved_raw_ref_clone(): ]) def test_resolve_raw_ref(resolver_data, expected_trefs): raw_ref, context_ref, lang, prev_trefs = resolver_data - ref_resolver = library.get_ref_resolver(lang) + linker = library.get_linker(lang) + ref_resolver = linker._ref_resolver ref_resolver.reset_ibid_history() # reset from previous test runs if prev_trefs: for prev_tref in prev_trefs: @@ -264,8 +265,9 @@ class TestResolveRawRef: ]) def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_trefs, expected_pretty_texts): context_oref = context_tref and Ref(context_tref) - ref_resolver = library.get_ref_resolver(lang) - resolved = ref_resolver.bulk_resolve([input_str], [context_oref])[0] + linker = 
library.get_linker(lang) + doc = linker.link(input_str, context_oref, type_filter='citation') + resolved = doc.resolved_refs assert len(resolved) == len(expected_trefs) resolved_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in resolved], []), key=lambda x: x.normal()) if len(expected_trefs) != len(resolved_orefs): @@ -384,15 +386,15 @@ def test_map_new_indices(crrd_params): # unnorm data raw_ref, _, lang, _ = crrd(*crrd_params) text = raw_ref.text - ref_resolver = library.get_ref_resolver(lang) - nlp = ref_resolver.get_ner().raw_ref_model + linker = library.get_linker(lang) + nlp = linker.get_ner().raw_ref_model doc = nlp.make_doc(text) indices = raw_ref.char_indices part_indices = [p.char_indices for p in raw_ref.raw_ref_parts] print_spans(raw_ref) # norm data - n = ref_resolver.get_ner()._normalizer + n = linker.get_ner()._normalizer norm_text = n.normalize(text) norm_doc = nlp.make_doc(norm_text) mapping = n.get_mapping_after_normalization(text, reverse=True) diff --git a/sefaria/model/linker/tests/linker_test_utils.py b/sefaria/model/linker/tests/linker_test_utils.py index c6d9b0213f..fa511dc25a 100644 --- a/sefaria/model/linker/tests/linker_test_utils.py +++ b/sefaria/model/linker/tests/linker_test_utils.py @@ -56,7 +56,7 @@ def get_symbol_by_part_type(part_type): @staticmethod def convert_to_raw_encoded_part_list(lang, text, span_inds, part_types): - nlp = library.get_ref_resolver(lang).get_ner().raw_ref_part_model + nlp = library.get_linker(lang).get_ner().raw_ref_part_model doc = nlp.make_doc(text) span = doc[0:] raw_encoded_part_list = [] @@ -102,7 +102,7 @@ def part_types(self): @property def span(self): if not self._span: - nlp = library.get_ref_resolver(self.lang).get_ner().raw_ref_part_model + nlp = library.get_linker(self.lang).get_ner().raw_ref_part_model doc = nlp.make_doc(self.input_str) self._span = doc[0:] return self._span diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 2f4d271691..9c089e2362 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5614,7 +5614,7 @@ def build_linker(self, lang: str): named_entity_resolver = self._build_named_entity_resolver(lang) ref_resolver = self._build_ref_resolver(lang) named_entity_recognizer = self._build_named_entity_recognizer(lang) - self._linker_by_lang[lang] = Linker(ref_resolver, named_entity_resolver, named_entity_recognizer) + self._linker_by_lang[lang] = Linker(named_entity_recognizer, ref_resolver, named_entity_resolver) return self._linker_by_lang[lang] @staticmethod From 3914bc59ef8aa21d05f37eadfed7b29866881fea Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 9 Nov 2023 14:29:40 +0200 Subject: [PATCH 043/210] refactor(linker): move NamedEntityRecognizer to its own file. 
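With this move the recognizer is imported from its own module rather than from named_entity_resolver; a minimal sketch of the updated imports (mirroring the changes to linker.py and text.py in the diff below):

    from sefaria.model.linker.named_entity_recognizer import NamedEntityRecognizer
    from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, ResolvedNamedEntity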
--- sefaria/model/linker/linker.py | 3 +- .../model/linker/named_entity_recognizer.py | 244 +++++++++++++++++ sefaria/model/linker/named_entity_resolver.py | 247 +----------------- sefaria/model/linker/ref_resolver.py | 6 +- sefaria/model/text.py | 2 +- 5 files changed, 252 insertions(+), 250 deletions(-) create mode 100644 sefaria/model/linker/named_entity_recognizer.py diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 2a3409d38d..db0d66be6a 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -4,7 +4,8 @@ from sefaria.model.text import Ref from sefaria.model.linker.ref_part import RawRef, RawNamedEntity from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness, PossiblyAmbigResolvedRef -from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, NamedEntityRecognizer, ResolvedNamedEntity +from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, ResolvedNamedEntity +from sefaria.model.linker.named_entity_recognizer import NamedEntityRecognizer @dataclasses.dataclass diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py new file mode 100644 index 0000000000..ba27e2f708 --- /dev/null +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -0,0 +1,244 @@ +from typing import List, Generator, Optional, Tuple +from functools import reduce +from collections import defaultdict +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType +from sefaria.helper.normalization import NormalizerComposer +try: + import spacy + from spacy.tokens import Span + from spacy.language import Language +except ImportError: + spacy = Doc = Span = Token = Language = None + + +class NamedEntityRecognizer: + """ + Given models, runs them and returns named entity results + Currently, named entities include: + - refs + - people + - groups of people + """ + + def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language): + self._lang = lang + self._raw_ref_model = raw_ref_model + self._raw_ref_part_model = raw_ref_part_model + self._normalizer = self.__init_normalizer() + + def __init_normalizer(self) -> NormalizerComposer: + # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization + # important that normalization is equivalent to normalization done at training time + normalizer_steps = ['unidecode', 'html', 'double-space'] + if self._lang == 'he': + normalizer_steps += ['maqaf', 'cantillation'] + return NormalizerComposer(normalizer_steps) + + def bulk_recognize(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Return all RawNamedEntity's in `inputs`. If the entity is a citation, parse out the inner RawRefParts and create + RawRefs. + @param inputs: List of strings to search for named entities in. + @return: 2D list of RawNamedEntity's. 
Includes RawRefs which are a subtype of RawNamedEntity + """ + all_raw_named_entities = self._bulk_get_raw_named_entities_wo_raw_refs(inputs) + all_citations, all_non_citations = self._bulk_partition_named_entities_by_citation_type(all_raw_named_entities) + all_raw_refs = self._bulk_parse_raw_refs(all_citations) + merged_entities = [] + for inner_non_citations, inner_citations in zip(all_non_citations, all_raw_refs): + merged_entities += [inner_non_citations + inner_citations] + return merged_entities + + def recognize(self, input_str: str) -> Tuple[List[RawRef], List[RawNamedEntity]]: + raw_named_entities = self._get_raw_named_entities_wo_raw_refs(input_str) + citations, non_citations = self._partition_named_entities_by_citation_type(raw_named_entities) + raw_refs = self._parse_raw_refs(citations) + return raw_refs, non_citations + + def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Finds RawNamedEntities in `inputs` but doesn't parse citations into RawRefs with RawRefParts + @param inputs: + @return: + """ + normalized_inputs = self._normalize_input(inputs) + all_raw_named_entity_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) + all_raw_named_entities = [] + for raw_named_entity_spans in all_raw_named_entity_spans: + temp_raw_named_entities = [] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + temp_raw_named_entities += [RawNamedEntity(span, ne_type)] + all_raw_named_entities += [temp_raw_named_entities] + return all_raw_named_entities + + def _get_raw_named_entities_wo_raw_refs(self, input_str: str) -> List[RawNamedEntity]: + """ + Finds RawNamedEntities in `input_str` but doesn't parse citations into RawRefs with RawRefParts + @param input_str: + @return: + """ + normalized_input = self._normalize_input([input_str])[0] + raw_named_entity_spans = self._get_raw_named_entity_spans(normalized_input) + raw_named_entities = [] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + raw_named_entities += [RawNamedEntity(span, ne_type)] + return raw_named_entities + + @staticmethod + def _bulk_partition_named_entities_by_citation_type( + all_raw_named_entities: List[List[RawNamedEntity]] + ) -> Tuple[List[List[RawNamedEntity]], List[List[RawNamedEntity]]]: + """ + Given named entities, partition them into two lists; list of entities that are citations and those that aren't. + @param all_raw_named_entities: + @return: + """ + citations, non_citations = [], [] + for sublist in all_raw_named_entities: + inner_citations, inner_non_citations = NamedEntityRecognizer._partition_named_entities_by_citation_type(sublist) + citations += [inner_citations] + non_citations += [inner_non_citations] + return citations, non_citations + + @staticmethod + def _partition_named_entities_by_citation_type( + raw_named_entities: List[RawNamedEntity] + ) -> Tuple[List[RawNamedEntity], List[RawNamedEntity]]: + citations, non_citations = [], [] + for named_entity in raw_named_entities: + curr_list = citations if named_entity.type == NamedEntityType.CITATION else non_citations + curr_list += [named_entity] + return citations, non_citations + + def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]]) -> List[List[RawRef]]: + """ + Runs models on inputs to locate all refs and ref parts + Note: takes advantage of bulk spaCy operations. 
It is more efficient to pass multiple strings in input than to + run this function multiple times + @param inputs: List of strings to search for refs in. + @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. + """ + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_citation_entities), []) + all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) + all_raw_ref_part_span_map = defaultdict(list) + for ref_part_span, input_idx in all_raw_ref_part_spans: + all_raw_ref_part_span_map[input_idx] += [ref_part_span] + + all_raw_refs = [] + for input_idx, named_entities in enumerate(all_citation_entities): + raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] + all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] + return all_raw_refs + + def _parse_raw_refs(self, citation_entities: List[RawNamedEntity]) -> List[RawRef]: + raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans([e.text for e in citation_entities])) + return self._bulk_make_raw_refs(citation_entities, raw_ref_part_spans) + + def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): + for temp_input, raw_ref_list in zip(input, raw_ref_list_list): + self.map_normal_output_to_original_input(temp_input, raw_ref_list) + + def map_normal_output_to_original_input(self, input: str, named_entities: List[RawNamedEntity]) -> None: + """ + Ref resolution ran on normalized input. Remap raw refs to original (non-normalized) input + """ + unnorm_doc = self._raw_ref_model.make_doc(input) + mapping = self._normalizer.get_mapping_after_normalization(input) + # this function name is waaay too long + conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices + norm_inds = [named_entity.char_indices for named_entity in named_entities] + unnorm_inds = conv(norm_inds, mapping) + unnorm_part_inds = [] + for (named_entity, (norm_raw_ref_start, _)) in zip(named_entities, norm_inds): + raw_ref_parts = named_entity.raw_ref_parts if isinstance(named_entity, RawRef) else [] + unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] + for part in raw_ref_parts], mapping)] + for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): + named_entity.map_new_indices(unnorm_doc, temp_unnorm_inds) + if isinstance(named_entity, RawRef): + named_entity.map_new_part_indices(temp_unnorm_part_inds) + + @property + def raw_ref_model(self): + return self._raw_ref_model + + @property + def raw_ref_part_model(self): + return self._raw_ref_part_model + + def _normalize_input(self, input: List[str]): + """ + Normalize input text to match normalization that happened at training time + """ + return [self._normalizer.normalize(s) for s in input] + + def _get_raw_named_entity_spans(self, st: str) -> List[SpanOrToken]: + doc = self._raw_ref_model(st) + return doc.ents + + def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: + doc = self._raw_ref_part_model(st) + return doc.ents + + def _bulk_get_raw_named_entity_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: + for doc in self._raw_ref_model.pipe(input, batch_size=batch_size, **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, 
**kwargs) -> Generator[List[Span], None, None]: + for doc in self._raw_ref_part_model.pipe(input, batch_size=batch_size or len(input), **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def _bulk_make_raw_refs(self, named_entities: List[RawNamedEntity], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: + raw_refs = [] + dh_continuations = self._bulk_make_dh_continuations(named_entities, raw_ref_part_spans) + for named_entity, part_span_list, temp_dh_continuations in zip(named_entities, raw_ref_part_spans, dh_continuations): + raw_refs += [self._make_raw_ref(named_entity.span, part_span_list, temp_dh_continuations)] + return raw_refs + + def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh_continuations: List[SpanOrToken]) -> RawRef: + raw_ref_parts = [] + for part_span, dh_continuation in zip(part_span_list, dh_continuations): + part_type = RefPartType.span_label_to_enum(part_span.label_) + raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] + return RawRef(span, self._lang, raw_ref_parts) + + def _bulk_make_dh_continuations(self, named_entities: List[RawNamedEntity], raw_ref_part_spans) -> List[List[SpanOrToken]]: + dh_continuations = [] + for ispan, (named_entity, part_span_list) in enumerate(zip(named_entities, raw_ref_part_spans)): + temp_dh_continuations = [] + for ipart, part_span in enumerate(part_span_list): + part_type = RefPartType.span_label_to_enum(part_span.label_) + dh_continuation = None + if part_type == RefPartType.DH: + dh_continuation = self._get_dh_continuation(ispan, ipart, named_entities, part_span_list, + named_entity.span, part_span) + temp_dh_continuations += [dh_continuation] + dh_continuations += [temp_dh_continuations] + return dh_continuations + + @staticmethod + def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEntity], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: + if ipart == len(part_span_list) - 1: + curr_doc = span.doc + _, span_end = span_inds(span) + if ispan == len(named_entities) - 1: + dh_cont = curr_doc[span_end:] + else: + next_span_start, _ = span_inds(named_entities[ispan + 1].span) + dh_cont = curr_doc[span_end:next_span_start] + else: + _, part_span_end = span_inds(part_span) + next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) + dh_cont = part_span.doc[part_span_end:next_part_span_start] + + return dh_cont diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 358511c5cc..b8384d94fd 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,5 +1,5 @@ import dataclasses -from typing import List, Generator, Optional, Dict, Type, Set, Tuple +from typing import List, Dict, Type, Set try: import re2 as re re.set_fallback_notification(re.FALLBACK_WARNING) @@ -7,252 +7,11 @@ import re from functools import reduce from collections import defaultdict -from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType -from sefaria.helper.normalization import NormalizerComposer -from sefaria.model.topic import Topic, TopicSet, RefTopicLink +from sefaria.model.linker.ref_part import RawNamedEntity +from sefaria.model.topic import Topic from sefaria.utils.hebrew import strip_cantillation from sefaria.system.exceptions import InputError -try: - import 
spacy - from spacy.tokens import Span - from spacy.language import Language -except ImportError: - spacy = Doc = Span = Token = Language = None - - -class NamedEntityRecognizer: - """ - Given models, runs them and returns named entity results - Currently, named entities include: - - refs - - people - - groups of people - """ - - def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language): - self._lang = lang - self._raw_ref_model = raw_ref_model - self._raw_ref_part_model = raw_ref_part_model - self._normalizer = self.__init_normalizer() - - def __init_normalizer(self) -> NormalizerComposer: - # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization - # important that normalization is equivalent to normalization done at training time - normalizer_steps = ['unidecode', 'html', 'double-space'] - if self._lang == 'he': - normalizer_steps += ['maqaf', 'cantillation'] - return NormalizerComposer(normalizer_steps) - - def bulk_recognize(self, inputs: List[str]) -> List[List[RawNamedEntity]]: - """ - Return all RawNamedEntity's in `inputs`. If the entity is a citation, parse out the inner RawRefParts and create - RawRefs. - @param inputs: List of strings to search for named entities in. - @return: 2D list of RawNamedEntity's. Includes RawRefs which are a subtype of RawNamedEntity - """ - all_raw_named_entities = self._bulk_get_raw_named_entities_wo_raw_refs(inputs) - all_citations, all_non_citations = self._bulk_partition_named_entities_by_citation_type(all_raw_named_entities) - all_raw_refs = self._bulk_parse_raw_refs(all_citations) - merged_entities = [] - for inner_non_citations, inner_citations in zip(all_non_citations, all_raw_refs): - merged_entities += [inner_non_citations + inner_citations] - return merged_entities - - def recognize(self, input_str: str) -> Tuple[List[RawRef], List[RawNamedEntity]]: - raw_named_entities = self._get_raw_named_entities_wo_raw_refs(input_str) - citations, non_citations = self._partition_named_entities_by_citation_type(raw_named_entities) - raw_refs = self._parse_raw_refs(citations) - return raw_refs, non_citations - - def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[List[RawNamedEntity]]: - """ - Finds RawNamedEntities in `inputs` but doesn't parse citations into RawRefs with RawRefParts - @param inputs: - @return: - """ - normalized_inputs = self._normalize_input(inputs) - all_raw_named_entity_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) - all_raw_named_entities = [] - for raw_named_entity_spans in all_raw_named_entity_spans: - temp_raw_named_entities = [] - for span in raw_named_entity_spans: - ne_type = NamedEntityType.span_label_to_enum(span.label_) - temp_raw_named_entities += [RawNamedEntity(span, ne_type)] - all_raw_named_entities += [temp_raw_named_entities] - return all_raw_named_entities - - def _get_raw_named_entities_wo_raw_refs(self, input_str: str) -> List[RawNamedEntity]: - """ - Finds RawNamedEntities in `input_str` but doesn't parse citations into RawRefs with RawRefParts - @param input_str: - @return: - """ - normalized_input = self._normalize_input([input_str])[0] - raw_named_entity_spans = self._get_raw_named_entity_spans(normalized_input) - raw_named_entities = [] - for span in raw_named_entity_spans: - ne_type = NamedEntityType.span_label_to_enum(span.label_) - raw_named_entities += [RawNamedEntity(span, ne_type)] - return raw_named_entities - - @staticmethod - def _bulk_partition_named_entities_by_citation_type( - 
all_raw_named_entities: List[List[RawNamedEntity]] - ) -> Tuple[List[List[RawNamedEntity]], List[List[RawNamedEntity]]]: - """ - Given named entities, partition them into two lists; list of entities that are citations and those that aren't. - @param all_raw_named_entities: - @return: - """ - citations, non_citations = [], [] - for sublist in all_raw_named_entities: - inner_citations, inner_non_citations = NamedEntityRecognizer._partition_named_entities_by_citation_type(sublist) - citations += [inner_citations] - non_citations += [inner_non_citations] - return citations, non_citations - - @staticmethod - def _partition_named_entities_by_citation_type( - raw_named_entities: List[RawNamedEntity] - ) -> Tuple[List[RawNamedEntity], List[RawNamedEntity]]: - citations, non_citations = [], [] - for named_entity in raw_named_entities: - curr_list = citations if named_entity.type == NamedEntityType.CITATION else non_citations - curr_list += [named_entity] - return citations, non_citations - - def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]]) -> List[List[RawRef]]: - """ - Runs models on inputs to locate all refs and ref parts - Note: takes advantage of bulk spaCy operations. It is more efficient to pass multiple strings in input than to - run this function multiple times - @param inputs: List of strings to search for refs in. - @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. - """ - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_citation_entities), []) - all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) - all_raw_ref_part_span_map = defaultdict(list) - for ref_part_span, input_idx in all_raw_ref_part_spans: - all_raw_ref_part_span_map[input_idx] += [ref_part_span] - - all_raw_refs = [] - for input_idx, named_entities in enumerate(all_citation_entities): - raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] - all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] - return all_raw_refs - - def _parse_raw_refs(self, citation_entities: List[RawNamedEntity]) -> List[RawRef]: - raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans([e.text for e in citation_entities])) - return self._bulk_make_raw_refs(citation_entities, raw_ref_part_spans) - - def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): - for temp_input, raw_ref_list in zip(input, raw_ref_list_list): - self.map_normal_output_to_original_input(temp_input, raw_ref_list) - - def map_normal_output_to_original_input(self, input: str, named_entities: List[RawNamedEntity]) -> None: - """ - Ref resolution ran on normalized input. 
Remap raw refs to original (non-normalized) input - """ - unnorm_doc = self._raw_ref_model.make_doc(input) - mapping = self._normalizer.get_mapping_after_normalization(input) - # this function name is waaay too long - conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices - norm_inds = [named_entity.char_indices for named_entity in named_entities] - unnorm_inds = conv(norm_inds, mapping) - unnorm_part_inds = [] - for (named_entity, (norm_raw_ref_start, _)) in zip(named_entities, norm_inds): - raw_ref_parts = named_entity.raw_ref_parts if isinstance(named_entity, RawRef) else [] - unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] - for part in raw_ref_parts], mapping)] - for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): - named_entity.map_new_indices(unnorm_doc, temp_unnorm_inds) - if isinstance(named_entity, RawRef): - named_entity.map_new_part_indices(temp_unnorm_part_inds) - - @property - def raw_ref_model(self): - return self._raw_ref_model - - @property - def raw_ref_part_model(self): - return self._raw_ref_part_model - - def _normalize_input(self, input: List[str]): - """ - Normalize input text to match normalization that happened at training time - """ - return [self._normalizer.normalize(s) for s in input] - - def _get_raw_named_entity_spans(self, st: str) -> List[SpanOrToken]: - doc = self._raw_ref_model(st) - return doc.ents - - def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: - doc = self._raw_ref_part_model(st) - return doc.ents - - def _bulk_get_raw_named_entity_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: - for doc in self._raw_ref_model.pipe(input, batch_size=batch_size, **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents - - def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: - for doc in self._raw_ref_part_model.pipe(input, batch_size=batch_size or len(input), **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents - - def _bulk_make_raw_refs(self, named_entities: List[RawNamedEntity], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: - raw_refs = [] - dh_continuations = self._bulk_make_dh_continuations(named_entities, raw_ref_part_spans) - for named_entity, part_span_list, temp_dh_continuations in zip(named_entities, raw_ref_part_spans, dh_continuations): - raw_refs += [self._make_raw_ref(named_entity.span, part_span_list, temp_dh_continuations)] - return raw_refs - - def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh_continuations: List[SpanOrToken]) -> RawRef: - raw_ref_parts = [] - for part_span, dh_continuation in zip(part_span_list, dh_continuations): - part_type = RefPartType.span_label_to_enum(part_span.label_) - raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - return RawRef(span, self._lang, raw_ref_parts) - - def _bulk_make_dh_continuations(self, named_entities: List[RawNamedEntity], raw_ref_part_spans) -> List[List[SpanOrToken]]: - dh_continuations = [] - for ispan, (named_entity, part_span_list) in enumerate(zip(named_entities, raw_ref_part_spans)): - temp_dh_continuations = [] - for ipart, part_span in enumerate(part_span_list): - part_type = RefPartType.span_label_to_enum(part_span.label_) - dh_continuation = None - if 
part_type == RefPartType.DH: - dh_continuation = self._get_dh_continuation(ispan, ipart, named_entities, part_span_list, - named_entity.span, part_span) - temp_dh_continuations += [dh_continuation] - dh_continuations += [temp_dh_continuations] - return dh_continuations - - @staticmethod - def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEntity], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: - if ipart == len(part_span_list) - 1: - curr_doc = span.doc - _, span_end = span_inds(span) - if ispan == len(named_entities) - 1: - dh_cont = curr_doc[span_end:] - else: - next_span_start, _ = span_inds(named_entities[ispan + 1].span) - dh_cont = curr_doc[span_end:next_span_start] - else: - _, part_span_end = span_inds(part_span) - next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) - dh_cont = part_span.doc[part_span_end:next_part_span_start] - - return dh_cont - class ResolvedNamedEntity: diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 7c758dd2ab..8619d9108a 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -1,14 +1,12 @@ from collections import defaultdict from typing import List, Union, Dict, Optional, Tuple, Iterable, Set from enum import IntEnum, Enum -from tqdm import tqdm from sefaria.system.exceptions import InputError from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema -from sefaria.model.linker.named_entity_resolver import NamedEntityRecognizer, ResolvedNamedEntity -from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext, RawNamedEntity -from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext +from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY from sefaria.model.linker.resolved_ref_refiner_factory import resolved_ref_refiner_factory import structlog diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 9c089e2362..1bf0cf2b87 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -5629,7 +5629,7 @@ def _build_named_entity_resolver(lang: str): @staticmethod def _build_named_entity_recognizer(lang: str): - from .linker.named_entity_resolver import NamedEntityRecognizer + from .linker.named_entity_recognizer import NamedEntityRecognizer from sefaria.helper.linker import load_spacy_model return NamedEntityRecognizer( From 88b80728bfc3e4378f1431cba0e15a7d2f7b52aa Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 09:58:02 +0200 Subject: [PATCH 044/210] chore(linker): add Linker class to __init__.py --- sefaria/model/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/__init__.py b/sefaria/model/__init__.py index 83661bc1cb..e9b9f308a1 100644 --- a/sefaria/model/__init__.py +++ b/sefaria/model/__init__.py @@ -44,7 +44,7 @@ from .portal import Portal from .manuscript import Manuscript, ManuscriptSet, ManuscriptPage, ManuscriptPageSet from .linker.ref_part import RawRef -from .linker.ref_resolver import RefResolver +from .linker.linker import Linker from . 
import dependencies library._build_index_maps() From 86a15c9c2aaed1bed54821c3fb9688c9a634e2ce Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 09:58:59 +0200 Subject: [PATCH 045/210] feat(linker): linker API now works in both English and Hebrew. --- sefaria/helper/linker.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 9a438ad770..abaef75b4c 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -3,7 +3,7 @@ import spacy import structlog from sefaria.model.linker.ref_part import TermContext, RefPartType -from sefaria.model.linker.ref_resolver import ResolvedRef, AmbiguousResolvedRef +from sefaria.model.linker.ref_resolver import PossiblyAmbigResolvedRef from sefaria.model import text, library from sefaria.model.webpage import WebPage from sefaria.system.cache import django_cache @@ -101,10 +101,7 @@ def _add_webpage_hit_for_url(url): @django_cache(cache_type="persistent") def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _FindRefsTextOptions, meta_data: dict) -> dict: - if request_text.lang == 'he': - response = _make_find_refs_response_linker_v3(request_text, options) - else: - response = _make_find_refs_response_linker_v2(request_text, options) + response = _make_find_refs_response_linker_v3(request_text, options) if meta_data: _, webpage = WebPage.add_or_update_from_linker({ @@ -179,11 +176,10 @@ def _get_trefs_from_response(response): return trefs -def _make_find_refs_response_inner(resolved: List[List[Union[AmbiguousResolvedRef, ResolvedRef]]], options: _FindRefsTextOptions): +def _make_find_refs_response_inner(resolved_ref_list: List[PossiblyAmbigResolvedRef], options: _FindRefsTextOptions): ref_results = [] ref_data = {} debug_data = [] - resolved_ref_list = [resolved_ref for inner_resolved in resolved for resolved_ref in inner_resolved] for resolved_ref in resolved_ref_list: resolved_refs = resolved_ref.resolved_raw_refs if resolved_ref.is_ambiguous else [resolved_ref] start_char, end_char = resolved_ref.raw_entity.char_indices @@ -251,7 +247,7 @@ def _get_ref_text_by_lang_for_linker(oref: text.Ref, lang: str, options: _FindRe return as_array[:options.max_segments or None], was_truncated -def _make_debug_response_for_linker(resolved_ref: ResolvedRef) -> dict: +def _make_debug_response_for_linker(resolved_ref: PossiblyAmbigResolvedRef) -> dict: debug_data = { "orig_part_strs": [p.text for p in resolved_ref.raw_entity.raw_ref_parts], "orig_part_types": [p.type.name for p in resolved_ref.raw_entity.raw_ref_parts], From a92d8e5eb34830670b1ce5c29432f8c28c27c65d Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 10:06:23 +0200 Subject: [PATCH 046/210] refactor(linker): simplify get_mapping_after_normalization function --- sefaria/helper/normalization.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index 2a10f25eed..0b119303fc 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -97,16 +97,10 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False meaning by the 2nd index, 5 chars have been removed then if you have a range (0,3) in the normalized string "abc" you will know that maps to (0, 8) in the original string """ - if removal_list is None: - removal_list = self.find_text_to_remove(text, **kwargs) + removal_list = removal_list or self.find_text_to_remove(text, 
**kwargs) total_removed = 0 removal_map = {} - for removal, subst in removal_list: - try: - start, end = removal - except TypeError: - # must be match object - start, end = removal.start(), removal.end() + for (start, end), subst in removal_list: normalized_text_index = start if reverse else (start + min(len(subst), end-start) - total_removed) curr_removed = end - start - len(subst) if curr_removed != 0: From 5cef34bd0397ec5812525c9259c1f1bee1872248 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 17:45:24 +0200 Subject: [PATCH 047/210] fix(linker): map normalized indices to unnormalized as last step (forgot to do that). --- sefaria/model/linker/linker.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index db0d66be6a..522a4cb204 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -74,7 +74,9 @@ def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_fail resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) if type_filter in {'all', 'named entity'}: resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) - return LinkedDoc(input_str, resolved_refs, resolved_named_entities) + doc = LinkedDoc(input_str, resolved_refs, resolved_named_entities) + self._ner.map_normal_output_to_original_input(input_str, [x.raw_entity for x in doc.all_resolved]) + return doc def get_ner(self) -> NamedEntityRecognizer: return self._ner From fb5ad5f83f1d2df1639aa44b3548ad2add595edc Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 17:47:01 +0200 Subject: [PATCH 048/210] test(linker): revert normalization tests so that they rely on new normalization logic --- sefaria/helper/tests/normalization_tests.py | 32 ++++++++++----------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/sefaria/helper/tests/normalization_tests.py b/sefaria/helper/tests/normalization_tests.py index 76d59b8f07..f870d56135 100644 --- a/sefaria/helper/tests/normalization_tests.py +++ b/sefaria/helper/tests/normalization_tests.py @@ -56,9 +56,9 @@ def test_simpler_normalizer_composer(): nsc = NormalizerComposer(['brackets', 'double-space']) assert nsc.normalize(text) == normalized text_to_remove = nsc.find_text_to_remove(text) - assert len(text_to_remove) == 1 + assert len(text_to_remove) == 2 (start0, end0), repl0 = text_to_remove[0] - assert text[start0:end0] == " [" + assert text[start0:end0] == " " assert repl0 == ' ' @@ -68,24 +68,23 @@ def test_complicated_normalizer_composer(): nsc = NormalizerComposer(['html', "parens-plus-contents", 'brackets', 'double-space']) assert nsc.normalize(text) == normalized text_to_remove = nsc.find_text_to_remove(text) - assert len(text_to_remove) == 5 + assert len(text_to_remove) == 6 (start0, end0), repl0 = text_to_remove[0] - assert text[start0:end0] == "(hello other stuff) [" + assert text[start0:end0] == "(hello other stuff) " assert repl0 == ' ' -def test_mapping(): - text = """ test""" - normalized = """ test""" - nsc = NormalizerComposer(['html', 'double-space']) - assert nsc.normalize(text) == normalized - mapping = nsc.get_mapping_after_normalization(text) - test_word = "test" - start_norm_ind = normalized.index(test_word) +@pytest.mark.parametrize(('unnorm', 'norm', 'normalizer_steps', 'test_word'), [ + [" test", " test", ['html', 'double-space'], 'test'], + ["\n\n\nThe rest of Chapter 1.\n \n", " The rest of Chapter 1. 
", ['unidecode', 'html', 'double-space'], 'Chapter 1'], +]) +def test_mapping(unnorm, norm, normalizer_steps, test_word): + nsc = NormalizerComposer(normalizer_steps) + assert nsc.normalize(unnorm) == norm + start_norm_ind = norm.index(test_word) norm_inds = (start_norm_ind, start_norm_ind+len(test_word)) - unnorm_inds = nsc.convert_normalized_indices_to_unnormalized_indices([norm_inds], mapping)[0] - # actual test - assert text[slice(*unnorm_inds)] == normalized[slice(*norm_inds)] + unnorm_inds = nsc.norm_to_unnorm_indices(unnorm, [norm_inds])[0] + assert unnorm[slice(*unnorm_inds)] == norm[slice(*norm_inds)] def test_html_normalizer_for_empty_prefix(): @@ -97,8 +96,7 @@ def test_html_normalizer_for_empty_prefix(): ne_start = norm_text.index(ne) ne_norm_prefix_inds = (ne_start, ne_start) assert norm_text[ne_norm_prefix_inds[0]:ne_norm_prefix_inds[0]+len(ne)] == ne - mapping = normalizer.get_mapping_after_normalization(text) - ne_inds = normalizer.convert_normalized_indices_to_unnormalized_indices([ne_norm_prefix_inds], mapping)[0] + ne_inds = normalizer.norm_to_unnorm_indices(text, [ne_norm_prefix_inds])[0] # actual test assert ne_inds[0] == ne_inds[1] assert text[ne_inds[0]:ne_inds[0]+len(ne)] == ne From 5c97e57a2f52faeacf1a26687cdf667de81cf696 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 12 Nov 2023 17:47:45 +0200 Subject: [PATCH 049/210] fix(linker): keep track of substitution end indices which seem to be critical to get normalization mapping to work in all cases --- sefaria/helper/normalization.py | 25 +++++++++++-------- .../model/linker/named_entity_recognizer.py | 6 ++--- sefaria/model/linker/tests/linker_test.py | 3 +-- 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index 0b119303fc..93fcdcace3 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -100,21 +100,28 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False removal_list = removal_list or self.find_text_to_remove(text, **kwargs) total_removed = 0 removal_map = {} + subst_end_indexes = set() for (start, end), subst in removal_list: normalized_text_index = start if reverse else (start + min(len(subst), end-start) - total_removed) curr_removed = end - start - len(subst) if curr_removed != 0: total_removed += curr_removed removal_map[normalized_text_index] = total_removed - return removal_map + if len(subst) > 0: + subst_end_indexes.add(normalized_text_index + 1) + return removal_map, subst_end_indexes + + def norm_to_unnorm_indices(self, text, normalized_indices, removal_list=None, reverse=False, **kwargs): + removal_map, subst_end_indices = self.get_mapping_after_normalization(text, removal_list, reverse, **kwargs) + return self.convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, subst_end_indices, reverse) @staticmethod - def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, reverse=False, alignment_mode='contract'): + def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, subst_end_indices, reverse=False): """ normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 removal_map - return value of get_mapping_after_normalization() + subst_end_indices - reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() - alignment_mode - How 
to deal with cases where the end of a range touches a removal. Use "expand" if the removal should be included in the range. "contract" if it should be excluded. """ removal_keys = sorted(removal_map.keys()) unnormalized_indices = [] @@ -122,7 +129,7 @@ def convert_normalized_indices_to_unnormalized_indices(normalized_indices, remov for start, end in normalized_indices: unnorm_start_index = bisect_right(removal_keys, start) - 1 - bisect_end_index = end if (start == end or alignment_mode == 'expand') else end - 1 + bisect_end_index = end if (start == end or end in subst_end_indices) else end - 1 unnorm_end_index = bisect_right(removal_keys, bisect_end_index) - 1 unnorm_start = start if unnorm_start_index < 0 else start + (sign * removal_map[removal_keys[unnorm_start_index]]) @@ -262,8 +269,8 @@ def find_text_to_remove(self, s, **kwargs): text_to_remove_inds, text_to_remove_repls = [], [] else: text_to_remove_inds, text_to_remove_repls = zip(*curr_text_to_remove) - for mapping in reversed(mappings): - text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping, alignment_mode='expand') + for mapping, subst_end_indices in reversed(mappings): + text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping, subst_end_indices) curr_text_to_remove = list(zip(text_to_remove_inds, text_to_remove_repls)) # merge any overlapping ranges @@ -433,7 +440,6 @@ def char_indices_from_word_indices(input_string, word_ranges, split_regex=None): count += len(word) end = count word_indices.append((start, end)) - removal_map = regex_normalizer.get_mapping_after_normalization(input_string) normalized_char_indices = [] for i, words in enumerate(word_ranges): first_word, last_word = [w if w < len(word_indices) else -1 for w in words] @@ -443,7 +449,7 @@ def char_indices_from_word_indices(input_string, word_ranges, split_regex=None): word_indices[last_word][1] if last_word >= 0 else -1 ) ) - return regex_normalizer.convert_normalized_indices_to_unnormalized_indices(normalized_char_indices, removal_map) + return regex_normalizer.norm_to_unnorm_indices(input_string, normalized_char_indices) @lru_cache(maxsize=32) @@ -461,9 +467,8 @@ def word_index_from_char_index(full_string, char_index, split_regex=r'\s+'): def sanitized_words_to_unsanitized_words(input_string, sanitized_string, sanitization_method, sanitized_word_ranges): normalizer = FunctionNormalizer(sanitization_method) - removal_map = normalizer.get_mapping_after_normalization(input_string) sanitized_char_ranges = char_indices_from_word_indices(sanitized_string, sanitized_word_ranges) - unsanitzied_char_ranges = normalizer.convert_normalized_indices_to_unnormalized_indices(sanitized_char_ranges, removal_map) + unsanitzied_char_ranges = normalizer.norm_to_unnorm_indices(input_string, sanitized_char_ranges) # for char_range in unsanitied_char_ranges: # word_range = tuple(word_index_from_char_index(input_string, i) for i in char_range) # stuff.append(word_range) diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py index ba27e2f708..4a7076f2ee 100644 --- a/sefaria/model/linker/named_entity_recognizer.py +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -145,16 +145,16 @@ def map_normal_output_to_original_input(self, input: str, named_entities: List[R Ref resolution ran on normalized input. 
Remap raw refs to original (non-normalized) input """ unnorm_doc = self._raw_ref_model.make_doc(input) - mapping = self._normalizer.get_mapping_after_normalization(input) + mapping, subst_end_indices = self._normalizer.get_mapping_after_normalization(input) # this function name is waaay too long conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices norm_inds = [named_entity.char_indices for named_entity in named_entities] - unnorm_inds = conv(norm_inds, mapping) + unnorm_inds = conv(norm_inds, mapping, subst_end_indices) unnorm_part_inds = [] for (named_entity, (norm_raw_ref_start, _)) in zip(named_entities, norm_inds): raw_ref_parts = named_entity.raw_ref_parts if isinstance(named_entity, RawRef) else [] unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] - for part in raw_ref_parts], mapping)] + for part in raw_ref_parts], mapping, subst_end_indices)] for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): named_entity.map_new_indices(unnorm_doc, temp_unnorm_inds) if isinstance(named_entity, RawRef): diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index bd27a89020..85eb27ba37 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -397,8 +397,7 @@ def test_map_new_indices(crrd_params): n = linker.get_ner()._normalizer norm_text = n.normalize(text) norm_doc = nlp.make_doc(norm_text) - mapping = n.get_mapping_after_normalization(text, reverse=True) - norm_part_indices = n.convert_normalized_indices_to_unnormalized_indices(part_indices, mapping, reverse=True) + norm_part_indices = n.norm_to_unnorm_indices(text, part_indices, reverse=True) norm_part_spans = [norm_doc.char_span(s, e) for (s, e) in norm_part_indices] norm_part_token_inds = [] for span in norm_part_spans: From bdab709e4913aea4ae99a3b9c19c950619bc7a33 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 13 Nov 2023 13:22:42 +0200 Subject: [PATCH 050/210] test(linker): fix linker helper tests --- sefaria/helper/tests/linker_test.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/sefaria/helper/tests/linker_test.py b/sefaria/helper/tests/linker_test.py index 4a0b96c41a..a2bb736e2b 100644 --- a/sefaria/helper/tests/linker_test.py +++ b/sefaria/helper/tests/linker_test.py @@ -124,10 +124,8 @@ def mock_webpage() -> WebPage: class TestFindRefsHelperClasses: - @patch('sefaria.utils.hebrew.is_hebrew', return_value=False) - def test_find_refs_text(self, mock_is_hebrew: Mock): - find_refs_text = linker._FindRefsText('title', 'body') - mock_is_hebrew.assert_called_once_with('body') + def test_find_refs_text(self): + find_refs_text = linker._FindRefsText('title', 'body', 'en') assert find_refs_text.lang == 'en' def test_find_refs_text_options(self): @@ -194,16 +192,16 @@ def test_add_webpage_hit_for_url_no_url(self, mock_webpage: Mock): class TestFindRefsResponseLinkerV3: @pytest.fixture - def mock_get_ref_resolver(self, spacy_model: spacy.Language): + def mock_get_linker(self, spacy_model: spacy.Language): from sefaria.model.text import library - with patch.object(library, 'get_ref_resolver') as mock_get_ref_resolver: - mock_ref_resolver = Mock() - mock_ref_resolver._raw_ref_model_by_lang = {"en": spacy_model} - mock_get_ref_resolver.return_value = mock_ref_resolver - mock_ref_resolver.bulk_resolve_refs.return_value = [[]] - yield mock_get_ref_resolver - - def 
test_make_find_refs_response_linker_v3(self, mock_get_ref_resolver: WSGIRequest, + from sefaria.model.linker.linker import LinkedDoc + with patch.object(library, 'get_linker') as mock_get_linker: + mock_linker = Mock() + mock_get_linker.return_value = mock_linker + mock_linker.link.return_value = LinkedDoc('', [], []) + yield mock_get_linker + + def test_make_find_refs_response_linker_v3(self, mock_get_linker: WSGIRequest, mock_find_refs_text: linker._FindRefsText, mock_find_refs_options: linker._FindRefsTextOptions): response = linker._make_find_refs_response_linker_v3(mock_find_refs_text, mock_find_refs_options) @@ -214,7 +212,7 @@ def test_make_find_refs_response_linker_v3(self, mock_get_ref_resolver: WSGIRequ class TestFindRefsResponseInner: @pytest.fixture def mock_resolved(self): - return [[]] + return [] def test_make_find_refs_response_inner(self, mock_resolved: Mock, mock_find_refs_options: linker._FindRefsTextOptions): response = linker._make_find_refs_response_inner(mock_resolved, mock_find_refs_options) From dac2c1dc94f759803b7302afea27e724ada859f7 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 13 Nov 2023 13:23:43 +0200 Subject: [PATCH 051/210] refactor(linker): rename function to norm_to_unnorm_with_mapping --- scripts/catch_refs_yerushalmi_translation.py | 2 +- sefaria/helper/normalization.py | 22 ++++++++++++++----- .../model/linker/named_entity_recognizer.py | 3 +-- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/scripts/catch_refs_yerushalmi_translation.py b/scripts/catch_refs_yerushalmi_translation.py index f420a9c815..a182aaaeac 100644 --- a/scripts/catch_refs_yerushalmi_translation.py +++ b/scripts/catch_refs_yerushalmi_translation.py @@ -168,7 +168,7 @@ def catch_refs_in_ref(self, st: str, en_tref: str, he_tref: str, version: Versio resolved_refs = self.post_process_resolved_refs(resolved_refs, context_ref) norm_indices = [r.raw_ref.char_indices for r in resolved_refs] mapping = self.normalizer.get_mapping_after_normalization(st) - orig_indices = self.normalizer.convert_normalized_indices_to_unnormalized_indices(norm_indices, mapping) + orig_indices = self.normalizer.norm_to_unnorm_indices_with_mapping(norm_indices, mapping, ) for resolved_ref, (start_char, end_char) in zip(resolved_refs, orig_indices): before_context, after_context = get_window_around_match(start_char, end_char, st) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index 93fcdcace3..c69bc8487a 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -85,6 +85,9 @@ def remove_subsets_reducer(curr_text_to_remove: list, next: tuple) -> list: def get_mapping_after_normalization(self, text, removal_list=None, reverse=False, **kwargs): """ + Prefer norm_to_unnorm_indices() over this function since the former is simpler. + Use this function when you need more control over the mapping outputs. + It also can be useful to store the mapping and reuse it as an optimization. text - unnormalized text removal_list - instead of passing `find_text_to_remove`, you can pass an already calculated list of tuples. should be in same format as return value of find_text_to_remove reverse - bool. 
If True, then will return mapping from unnormalized string to normalized string @@ -112,15 +115,23 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False return removal_map, subst_end_indexes def norm_to_unnorm_indices(self, text, normalized_indices, removal_list=None, reverse=False, **kwargs): + """ + text - unnormalized text + normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 + reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() + """ removal_map, subst_end_indices = self.get_mapping_after_normalization(text, removal_list, reverse, **kwargs) - return self.convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, subst_end_indices, reverse) + return self.norm_to_unnorm_indices_with_mapping(normalized_indices, removal_map, subst_end_indices, reverse) @staticmethod - def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, subst_end_indices, reverse=False): + def norm_to_unnorm_indices_with_mapping(normalized_indices, removal_map, subst_end_indices, reverse=False): """ + Prefer norm_to_unnorm_indices() over this function since the former is simpler. + Use this function when you need more control over the mapping inputs. + It also can be useful to store the mapping and reuse it as an optimization. normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 - removal_map - return value of get_mapping_after_normalization() - subst_end_indices - + removal_map - first return value of get_mapping_after_normalization() + subst_end_indices - second return value from get_mapping_after_normalization() reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() """ removal_keys = sorted(removal_map.keys()) @@ -270,7 +281,8 @@ def find_text_to_remove(self, s, **kwargs): else: text_to_remove_inds, text_to_remove_repls = zip(*curr_text_to_remove) for mapping, subst_end_indices in reversed(mappings): - text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping, subst_end_indices) + text_to_remove_inds = step.norm_to_unnorm_indices_with_mapping(text_to_remove_inds, mapping, + subst_end_indices) curr_text_to_remove = list(zip(text_to_remove_inds, text_to_remove_repls)) # merge any overlapping ranges diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py index 4a7076f2ee..1d6743ae42 100644 --- a/sefaria/model/linker/named_entity_recognizer.py +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -146,8 +146,7 @@ def map_normal_output_to_original_input(self, input: str, named_entities: List[R """ unnorm_doc = self._raw_ref_model.make_doc(input) mapping, subst_end_indices = self._normalizer.get_mapping_after_normalization(input) - # this function name is waaay too long - conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices + conv = self._normalizer.norm_to_unnorm_indices_with_mapping norm_inds = [named_entity.char_indices for named_entity in named_entities] unnorm_inds = conv(norm_inds, mapping, subst_end_indices) unnorm_part_inds = [] From a10d8ca7ca973af19579ba6282ff0546d83989cc Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 13 Nov 2023 21:42:50 +0200 Subject: [PATCH 052/210] fix(linker): early 
return if context is a complex text --- sefaria/model/linker/ref_resolver.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 8619d9108a..a0aa20821d 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -488,6 +488,9 @@ def get_section_set(index: text.Index) -> Set[Tuple[str, str, bool]]: return [] context_node = context_ref.index_node + if not hasattr(context_node, 'addressType'): + # complex text + return [] referenceable_sections = getattr(context_node, 'referenceableSections', [True]*len(context_node.addressTypes)) context_sec_list = list(zip(context_node.addressTypes, context_node.sectionNames, referenceable_sections)) match_sec_set = get_section_set(match_index) From 10df8a72e2d0607f0b8b460d8b5ef9314ae74fb3 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 15 Nov 2023 12:51:15 +0200 Subject: [PATCH 053/210] fix(linker): fix early return if context is a complex text --- sefaria/model/linker/ref_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index a0aa20821d..478a59b44c 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -488,7 +488,7 @@ def get_section_set(index: text.Index) -> Set[Tuple[str, str, bool]]: return [] context_node = context_ref.index_node - if not hasattr(context_node, 'addressType'): + if not hasattr(context_node, 'addressTypes'): # complex text return [] referenceable_sections = getattr(context_node, 'referenceableSections', [True]*len(context_node.addressTypes)) From ce61d4295c2ff34eb1126cdfe9d2bfdc434bd7df Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 15 Nov 2023 12:54:12 +0200 Subject: [PATCH 054/210] test(linker): add test for segment level ibid context --- sefaria/model/linker/tests/linker_test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 85eb27ba37..7ee3adc4ec 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -90,6 +90,7 @@ def test_resolved_raw_ref_clone(): # Ibid [crrd(['&שם', '#ז'], prev_trefs=["Genesis 1"]), ["Genesis 7", "Genesis 1:7"]], # ambiguous ibid + [crrd(['&Ibid', '#12'], prev_trefs=["Exodus 1:7"], lang='en'), ["Exodus 1:12", "Exodus 12"]], # ambiguous ibid when context is segment level (not clear if this is really ambiguous. 
maybe should only have segment level result) [crrd(['#ב'], prev_trefs=["Genesis 1"]), ["Genesis 1:2", "Genesis 2"]], # ambiguous ibid [crrd(['#ב', '#ז'], prev_trefs=["Genesis 1:3", "Exodus 1:3"]), ["Genesis 2:7", "Exodus 2:7"]], [crrd(['@בראשית', '&שם', '#ז'], prev_trefs=["Exodus 1:3", "Genesis 1:3"]), ["Genesis 1:7"]], From 872d36f7061432b74129c7d9ebdfaaa1a377e346 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 15 Nov 2023 14:43:54 +0200 Subject: [PATCH 055/210] feat(linker): add param to delete existing term --- sefaria/helper/linker_index_converter.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sefaria/helper/linker_index_converter.py b/sefaria/helper/linker_index_converter.py index 3a795b45c2..f1b9815eed 100644 --- a/sefaria/helper/linker_index_converter.py +++ b/sefaria/helper/linker_index_converter.py @@ -55,6 +55,12 @@ def create_term(self, **kwargs): term.title_group.add_title(kwargs.get(lang), lang, primary=True) for title in kwargs.get(f"alt_{lang}", []): term.title_group.add_title(title, lang) + + if kwargs.get('delete_if_existing'): + slug = NonUniqueTerm.normalize_slug(term.slug) + existing_term = NonUniqueTerm.init(slug) + if existing_term: + existing_term.delete() term.save() self.context_and_primary_title_to_term[(kwargs.get('context'), term.get_primary_title('en'))] = term return term From 588c92be0241cc1009776404f5112c8c851e5415 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 16 Nov 2023 08:45:21 +0200 Subject: [PATCH 056/210] feat(linker): fully support traversing ArrayMapNode children --- .../model/linker/referenceable_book_node.py | 136 +++++++++++++++--- sefaria/model/linker/resolved_ref_refiner.py | 31 +--- .../linker/resolved_ref_refiner_factory.py | 3 +- sefaria/model/linker/tests/linker_test.py | 4 + 4 files changed, 127 insertions(+), 47 deletions(-) diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index ad0e10bf6e..9b214fa732 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -1,8 +1,28 @@ import dataclasses -from typing import List, Union, Optional +from typing import List, Union, Optional, Tuple from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema +from sefaria.system.exceptions import InputError + + +def subref(ref: text.Ref, section: int): + if ref.index_node.addressTypes[len(ref.sections)-1] == "Talmud": + d = ref._core_dict() + d['sections'][-1] += (section-1) + d['toSections'] = d['sections'][:] + return text.Ref(_obj=d) + else: + return ref.subref(section) + + +def truncate_serialized_node_to_depth(serial_node: dict, depth: int) -> dict: + truncated_serial_node = serial_node.copy() + for list_attr in ('addressTypes', 'sectionNames', 'lengths', 'referenceableSections'): + if list_attr not in serial_node: + continue + truncated_serial_node[list_attr] = serial_node[list_attr][depth:] + return truncated_serial_node class ReferenceableBookNode: @@ -20,6 +40,10 @@ def get_children(self, *args, **kwargs) -> List['ReferenceableBookNode']: def is_default(self) -> bool: return False + @property + def referenceable(self) -> bool: + return True + class NamedReferenceableBookNode(ReferenceableBookNode): @@ -29,6 +53,10 @@ def __init__(self, titled_tree_node_or_index: Union[schema.TitledTreeNode, text. 
if isinstance(titled_tree_node_or_index, text.Index): self._titled_tree_node = titled_tree_node_or_index.nodes + @property + def referenceable(self): + return getattr(self._titled_tree_node, 'referenceable', True) + def is_default(self): return self._titled_tree_node.is_default() @@ -40,20 +68,44 @@ def ref(self) -> text.Ref: def _get_all_children(self) -> List[ReferenceableBookNode]: thingy = self._titled_tree_node_or_index - #the schema node for this referenceable node has a dibur hamatchil child + # the schema node for this referenceable node has a dibur hamatchil child if isinstance(thingy, schema.NumberedTitledTreeNode) and thingy.is_segment_level_dibur_hamatchil(): return [DiburHamatchilNodeSet({"container_refs": self.ref().normal()})] - #the schema node for this referenceable is a JAN. JANs act as both named and numbered nodes + # the schema node for this referenceable is a JAN. JANs act as both named and numbered nodes if isinstance(thingy, schema.JaggedArrayNode) and len(thingy.children) == 0: return [NumberedReferenceableBookNode(thingy)] if isinstance(thingy, text.Index): children = thingy.referenceable_children() + elif isinstance(thingy, schema.ArrayMapNode): + # TODO following two if's are very similar... + if getattr(thingy, 'refs', None): + address_types = thingy.addressTypes + section_names = thingy.sectionNames + children = [] + for ichild, tref in enumerate(thingy.refs): + oref = text.Ref(tref) + children += [MonoReferenceableBookNode(address_types, section_names, ichild+1, oref)] + return children + elif getattr(thingy, 'wholeRef', None): + whole_ref = text.Ref(thingy.wholeRef) + schema_node = whole_ref.index_node.serialize() + truncated_node = truncate_serialized_node_to_depth(schema_node, -2) + refs = whole_ref.split_spanning_ref() + children = [] + for oref in refs: + children += [MonoReferenceableBookNode(numeric_equivalent=oref.section_ref().sections[0], ref=oref, **truncated_node)] + return children + else: + children = self._titled_tree_node.children else: # Any other type of TitledTreeNode children = self._titled_tree_node.children children = [self._transform_schema_node_to_referenceable(x) for x in children] return children + def _get_children_from_array_map_node(self, node: schema.ArrayMapNode) -> List[ReferenceableBookNode]: + pass + @staticmethod def _transform_schema_node_to_referenceable(schema_node: schema.TitledTreeNode) -> ReferenceableBookNode: if isinstance(schema_node, schema.JaggedArrayNode) and (schema_node.is_default() or schema_node.parent is None): @@ -84,27 +136,44 @@ class NumberedReferenceableBookNode(ReferenceableBookNode): def __init__(self, ja_node: schema.NumberedTitledTreeNode): self._ja_node = ja_node + @property + def referenceable(self): + return getattr(self._ja_node, 'referenceable', True) + def is_default(self): return self._ja_node.is_default() and self._ja_node.parent is not None def ref(self): return self._ja_node.ref() + def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, fromSections=None) -> Tuple[List[text.Ref], List[bool]]: + try: + possible_sections, possible_to_sections, addr_classes = self._address_class.get_all_possible_sections_from_string(lang, section_str, fromSections, strip_prefixes=True) + except (IndexError, TypeError, KeyError): + return [], [] + possible_subrefs = [] + can_match_out_of_order_list = [] + for sec, toSec, addr_class in zip(possible_sections, possible_to_sections, addr_classes): + try: + refined_ref = subref(initial_ref, sec) + if toSec != sec: + to_ref = 
subref(initial_ref, toSec) + refined_ref = refined_ref.to(to_ref) + possible_subrefs += [refined_ref] + can_match_out_of_order_list += [addr_class.can_match_out_of_order(lang, section_str)] + except (InputError, IndexError, AssertionError, AttributeError): + continue + return possible_subrefs, can_match_out_of_order_list + + # TODO move these two properties to be private @property - def address_class(self) -> schema.AddressType: + def _address_class(self) -> schema.AddressType: return self._ja_node.address_class(0) @property - def section_name(self) -> str: + def _section_name(self) -> str: return self._ja_node.sectionNames[0] - def get_all_possible_sections_from_string(self, *args, **kwargs): - """ - wraps AddressType function with same name - @return: - """ - return self.address_class.get_all_possible_sections_from_string(*args, **kwargs) - def _get_next_referenceable_depth(self): if self.is_default(): return 0 @@ -126,7 +195,7 @@ def get_children(self, context_ref=None, **kwargs) -> [ReferenceableBookNode]: if serial['depth'] <= 1 and self._ja_node.is_segment_level_dibur_hamatchil(): return [DiburHamatchilNodeSet({"container_refs": context_ref.normal()})] if (self._ja_node.depth - next_referenceable_depth) == 0: - if isinstance(self.address_class, schema.AddressTalmud): + if isinstance(self._address_class, schema.AddressTalmud): serial['addressTypes'] = ["Amud"] serial['sectionNames'] = ["Amud"] serial['lengths'] = [1] @@ -134,10 +203,7 @@ def get_children(self, context_ref=None, **kwargs) -> [ReferenceableBookNode]: else: return [] else: - for list_attr in ('addressTypes', 'sectionNames', 'lengths', 'referenceableSections'): - # truncate every list attribute by `next_referenceable_depth` - if list_attr not in serial: continue - serial[list_attr] = serial[list_attr][next_referenceable_depth:] + serial = truncate_serialized_node_to_depth(serial, next_referenceable_depth) new_ja = schema.JaggedArrayNode(serial=serial, index=getattr(self, 'index', None), **kwargs) return [NumberedReferenceableBookNode(new_ja)] @@ -145,11 +211,43 @@ def matches_section_context(self, section_context: 'SectionContext') -> bool: """ Does the address in `self` match the address in `section_context`? """ - if self.address_class.__class__ != section_context.addr_type.__class__: return False - if self.section_name != section_context.section_name: return False + if self._address_class.__class__ != section_context.addr_type.__class__: return False + if self._section_name != section_context.section_name: return False return True +class MonoReferenceableBookNode(NumberedReferenceableBookNode): + """ + Node that can only be referenced by one ref + """ + + def __init__(self, addressTypes: List[str], sectionNames: List[str], numeric_equivalent: int, ref: text.Ref, **ja_node_attrs): + ja_node = schema.JaggedArrayNode(serial={ + "addressTypes": addressTypes, + "sectionNames": sectionNames, + **ja_node_attrs, + "depth": len(addressTypes), + }) + super().__init__(ja_node) + self._numeric_equivalent = numeric_equivalent + self._ref = ref + + def ref(self): + return self._ref + + def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, fromSections=None) -> Tuple[List[text.Ref], List[bool]]: + try: + possible_sections, possible_to_sections, addr_classes = self._address_class.\ + get_all_possible_sections_from_string(lang, section_str, fromSections, strip_prefixes=True) + except (IndexError, TypeError, KeyError): + return [], [] + # if any section matches numeric_equivalent, this node's ref is the subref. 
+ for sec, to_sec in zip(possible_sections, possible_to_sections): + if sec == self._numeric_equivalent and sec == to_sec: + return [self._ref], [True] + return [], [] + + @dataclasses.dataclass class DiburHamatchilMatch: score: float diff --git a/sefaria/model/linker/resolved_ref_refiner.py b/sefaria/model/linker/resolved_ref_refiner.py index 88f364019b..f71d40cb27 100644 --- a/sefaria/model/linker/resolved_ref_refiner.py +++ b/sefaria/model/linker/resolved_ref_refiner.py @@ -8,16 +8,6 @@ from sefaria.model.text import Ref -def subref(ref: Ref, section: int): - if ref.index_node.addressTypes[len(ref.sections)-1] == "Talmud": - d = ref._core_dict() - d['sections'][-1] += (section-1) - d['toSections'] = d['sections'][:] - return Ref(_obj=d) - else: - return ref.subref(section) - - class ResolvedRefRefiner(ABC): def __init__(self, part_to_match: RawRefPart, node: ReferenceableBookNode, resolved_ref: 'ResolvedRef'): @@ -88,14 +78,10 @@ def __refine_context_full(self) -> List['ResolvedRef']: def __refine_context_free(self, lang: str, fromSections=None) -> List['ResolvedRef']: if self.node is None: return [] - try: - possible_sections, possible_to_sections, addr_classes = self.node.get_all_possible_sections_from_string(lang, self.part_to_match.text, fromSections, strip_prefixes=True) - except (IndexError, TypeError, KeyError): - return [] + possible_subrefs, can_match_out_of_order_list = self.node.possible_subrefs(lang, self.resolved_ref.ref, self.part_to_match.text, fromSections) refined_refs = [] - addr_classes_used = [] - for sec, toSec, addr_class in zip(possible_sections, possible_to_sections, addr_classes): - if self._has_prev_unused_numbered_ref_part() and not addr_class.can_match_out_of_order(lang, self.part_to_match.text): + for refined_ref, can_match_out_of_order in zip(possible_subrefs, can_match_out_of_order_list): + if self._has_prev_unused_numbered_ref_part() and not can_match_out_of_order: """ If raw_ref has NUMBERED parts [a, b] and part b matches before part a @@ -103,16 +89,7 @@ def __refine_context_free(self, lang: str, fromSections=None) -> List['ResolvedR discard match because AddressInteger parts need to match in order """ continue - try: - refined_ref = subref(self.resolved_ref.ref, sec) - if toSec != sec: - to_ref = subref(self.resolved_ref.ref, toSec) - refined_ref = refined_ref.to(to_ref) - refined_refs += [refined_ref] - addr_classes_used += [addr_class] - except (InputError, IndexError, AssertionError, AttributeError): - continue - + refined_refs += [refined_ref] return [self._clone_resolved_ref(resolved_parts=self._get_resolved_parts(), node=self.node, ref=refined_ref) for refined_ref in refined_refs] diff --git a/sefaria/model/linker/resolved_ref_refiner_factory.py b/sefaria/model/linker/resolved_ref_refiner_factory.py index 864d1b01d8..690c5b0b21 100644 --- a/sefaria/model/linker/resolved_ref_refiner_factory.py +++ b/sefaria/model/linker/resolved_ref_refiner_factory.py @@ -1,5 +1,5 @@ from sefaria.model.linker.ref_part import RawRefPart, RefPartType -from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode +from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode, MonoReferenceableBookNode from sefaria.model.linker.resolved_ref_refiner import ResolvedRefRefinerForDefaultNode, ResolvedRefRefinerForNumberedPart, ResolvedRefRefinerForDiburHamatchilPart, ResolvedRefRefinerForRangedPart, 
ResolvedRefRefinerForNamedNode, ResolvedRefRefiner, ResolvedRefRefinerCatchAll @@ -48,6 +48,7 @@ def initialize_resolved_ref_refiner_factory() -> ResolvedRefRefinerFactory: refiners_to_register = [ (key(is_default=True), ResolvedRefRefinerForDefaultNode), (key(RefPartType.NUMBERED, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForNumberedPart), + (key(RefPartType.NUMBERED, node_class=MonoReferenceableBookNode), ResolvedRefRefinerForNumberedPart), (key(RefPartType.RANGE, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForRangedPart), (key(RefPartType.NAMED, node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), (key(RefPartType.NUMBERED, node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 7ee3adc4ec..1f7ea636b0 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -29,6 +29,10 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ + # Using addressTypes of alt structs + [crrd(["@JT", "@Berakhot", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:7-11",)], + [crrd(["@JT", "@Berakhot", "@Chapter 1", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:7-11",)], + # Numbered JAs [crrd(["@בבלי", "@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud From fa0045ab978c279891b77baba928558f4a7fce63 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 19 Nov 2023 10:30:11 +0200 Subject: [PATCH 057/210] refactor(linker): use Map instead of Mono for nodes. Many fewer objects. --- .../model/linker/referenceable_book_node.py | 35 +++++++++---------- .../linker/resolved_ref_refiner_factory.py | 4 +-- 2 files changed, 19 insertions(+), 20 deletions(-) diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index 9b214fa732..e1dcab4572 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -1,5 +1,5 @@ import dataclasses -from typing import List, Union, Optional, Tuple +from typing import List, Union, Optional, Tuple, Dict from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema @@ -81,20 +81,18 @@ def _get_all_children(self) -> List[ReferenceableBookNode]: if getattr(thingy, 'refs', None): address_types = thingy.addressTypes section_names = thingy.sectionNames - children = [] - for ichild, tref in enumerate(thingy.refs): - oref = text.Ref(tref) - children += [MonoReferenceableBookNode(address_types, section_names, ichild+1, oref)] - return children - elif getattr(thingy, 'wholeRef', None): + section_ref_map = {ichild+1: text.Ref(tref) for ichild, tref in enumerate(thingy.refs)} + return [MapReferenceableBookNode(address_types, section_names, section_ref_map)] + elif getattr(thingy, 'wholeRef', None) and getattr(thingy, 'includeSections', False): whole_ref = text.Ref(thingy.wholeRef) schema_node = whole_ref.index_node.serialize() truncated_node = truncate_serialized_node_to_depth(schema_node, -2) refs = whole_ref.split_spanning_ref() - children = [] + section_ref_map = {} for oref in refs: - children += [MonoReferenceableBookNode(numeric_equivalent=oref.section_ref().sections[0], ref=oref, **truncated_node)] - return children + section = oref.section_ref().sections[0] + section_ref_map[section] = oref + 
return [MapReferenceableBookNode(section_ref_map=section_ref_map, **truncated_node)] else: children = self._titled_tree_node.children else: @@ -216,12 +214,12 @@ def matches_section_context(self, section_context: 'SectionContext') -> bool: return True -class MonoReferenceableBookNode(NumberedReferenceableBookNode): +class MapReferenceableBookNode(NumberedReferenceableBookNode): """ Node that can only be referenced by one ref """ - def __init__(self, addressTypes: List[str], sectionNames: List[str], numeric_equivalent: int, ref: text.Ref, **ja_node_attrs): + def __init__(self, addressTypes: List[str], sectionNames: List[str], section_ref_map: Dict[int, text.Ref], **ja_node_attrs): ja_node = schema.JaggedArrayNode(serial={ "addressTypes": addressTypes, "sectionNames": sectionNames, @@ -229,8 +227,7 @@ def __init__(self, addressTypes: List[str], sectionNames: List[str], numeric_equ "depth": len(addressTypes), }) super().__init__(ja_node) - self._numeric_equivalent = numeric_equivalent - self._ref = ref + self._section_ref_map = section_ref_map def ref(self): return self._ref @@ -241,11 +238,13 @@ def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, f get_all_possible_sections_from_string(lang, section_str, fromSections, strip_prefixes=True) except (IndexError, TypeError, KeyError): return [], [] - # if any section matches numeric_equivalent, this node's ref is the subref. + # map sections to equivalent refs in section_ref_map + mapped_refs = [] for sec, to_sec in zip(possible_sections, possible_to_sections): - if sec == self._numeric_equivalent and sec == to_sec: - return [self._ref], [True] - return [], [] + mapped_ref = self._section_ref_map.get(sec) + if mapped_ref and sec == to_sec: + mapped_refs += [mapped_ref] + return mapped_refs, [True]*len(mapped_refs) @dataclasses.dataclass diff --git a/sefaria/model/linker/resolved_ref_refiner_factory.py b/sefaria/model/linker/resolved_ref_refiner_factory.py index 690c5b0b21..633b74705b 100644 --- a/sefaria/model/linker/resolved_ref_refiner_factory.py +++ b/sefaria/model/linker/resolved_ref_refiner_factory.py @@ -1,5 +1,5 @@ from sefaria.model.linker.ref_part import RawRefPart, RefPartType -from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode, MonoReferenceableBookNode +from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode, MapReferenceableBookNode from sefaria.model.linker.resolved_ref_refiner import ResolvedRefRefinerForDefaultNode, ResolvedRefRefinerForNumberedPart, ResolvedRefRefinerForDiburHamatchilPart, ResolvedRefRefinerForRangedPart, ResolvedRefRefinerForNamedNode, ResolvedRefRefiner, ResolvedRefRefinerCatchAll @@ -48,7 +48,7 @@ def initialize_resolved_ref_refiner_factory() -> ResolvedRefRefinerFactory: refiners_to_register = [ (key(is_default=True), ResolvedRefRefinerForDefaultNode), (key(RefPartType.NUMBERED, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForNumberedPart), - (key(RefPartType.NUMBERED, node_class=MonoReferenceableBookNode), ResolvedRefRefinerForNumberedPart), + (key(RefPartType.NUMBERED, node_class=MapReferenceableBookNode), ResolvedRefRefinerForNumberedPart), (key(RefPartType.RANGE, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForRangedPart), (key(RefPartType.NAMED, node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), (key(RefPartType.NUMBERED, 
node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), From 0eaab15d3b167815093fbb51d86e080f6833783f Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 22 Nov 2023 12:17:53 +0200 Subject: [PATCH 058/210] feat(linker): add functions to linker_index_converter.py that will aid adding new indexes to linker. --- sefaria/helper/linker_index_converter.py | 44 ++++++++++++++++++++---- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/sefaria/helper/linker_index_converter.py b/sefaria/helper/linker_index_converter.py index f1b9815eed..3eddae3232 100644 --- a/sefaria/helper/linker_index_converter.py +++ b/sefaria/helper/linker_index_converter.py @@ -65,6 +65,17 @@ def create_term(self, **kwargs): self.context_and_primary_title_to_term[(kwargs.get('context'), term.get_primary_title('en'))] = term return term + def get_or_create_term_for_titled_obj(self, obj, context=None, new_alt_titles=None, title_modifier=None, title_adder=None): + term = self.get_existing_term_for_titled_obj(obj, new_alt_titles, title_modifier, title_adder) + if not term: + return self.create_term_from_titled_obj(obj, context, new_alt_titles, title_modifier, title_adder) + return term + + def get_existing_term_for_titled_obj(self, obj, new_alt_titles=None, title_modifier=None, title_adder=None): + en_title, he_title, alt_en_titles, alt_he_titles = self._make_titles_for_term(obj, new_alt_titles, + title_modifier, title_adder) + return NonUniqueTerm().load({"titles.text": {"$all": [en_title, he_title] + alt_en_titles + alt_he_titles}}) + def create_term_from_titled_obj(self, obj, context=None, new_alt_titles=None, title_modifier=None, title_adder=None): """ Create a NonUniqueTerm from 'titled object' (see explanation of `obj` param) @@ -103,6 +114,15 @@ def title_adder(lang, title): ... 
""" + en_title, he_title, alt_en_titles, alt_he_titles = self._make_titles_for_term(obj, new_alt_titles, + title_modifier, title_adder) + term = self.create_term(en=en_title, he=he_title, context=context, alt_en=alt_en_titles, alt_he=alt_he_titles) + if isinstance(obj, Term): + self.old_term_map[obj.name] = term + return term + + @staticmethod + def _make_titles_for_term(obj, new_alt_titles=None, title_modifier=None, title_adder=None): new_alt_titles = new_alt_titles or [] title_group = obj if isinstance(obj, TitleGroup) else obj.title_group en_title = title_group.primary_title('en') @@ -128,10 +148,7 @@ def title_adder(lang, title): # make unique alt_en_titles = list(set(alt_en_titles)) alt_he_titles = list(set(alt_he_titles)) - term = self.create_term(en=en_title, he=he_title, context=context, alt_en=alt_en_titles, alt_he=alt_he_titles) - if isinstance(obj, Term): - self.old_term_map[obj.name] = term - return term + return en_title, he_title, alt_en_titles, alt_he_titles class LinkerCategoryConverter: @@ -375,6 +392,18 @@ def _update_lengths(self): outer_shape = base_outer_shape self.index.nodes.lengths = [outer_shape] + ac[1:] + @staticmethod + def get_all_alt_struct_nodes(index): + def alt_struct_nodes_helper(node, nodes): + nodes.append(node) + for child in node.children: + alt_struct_nodes_helper(child, nodes) + + nodes = [] + for node in index.get_alt_struct_roots(): + alt_struct_nodes_helper(node, nodes) + return nodes + def convert(self): if self.get_alt_structs: alt_struct_dict = self.get_alt_structs(self.index) @@ -382,7 +411,7 @@ def convert(self): for name, root in alt_struct_dict.items(): self.index.set_alt_structure(name, root) self._traverse_nodes(self.index.nodes, self.node_visitor, is_alt_node=False) - alt_nodes = self.index.get_alt_struct_leaves() + alt_nodes = self.get_all_alt_struct_nodes(self.index) for inode, node in enumerate(alt_nodes): self.node_visitor(node, 1, inode, len(alt_nodes), True) self._update_lengths() # update lengths for good measure @@ -425,4 +454,7 @@ def node_visitor(self, node, depth, isibling, num_siblings, is_alt_node): if other_fields_dict is not None: for key, val in other_fields_dict.items(): if val is None: continue - setattr(node, key, val) + if val == "DELETE!": + delattr(node, key) + else: + setattr(node, key, val) From 305bf99953d0fe00175977ff92f64f54f87eb890 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 22 Nov 2023 12:18:35 +0200 Subject: [PATCH 059/210] test(linker): many more tests for Yerushalmi and Zohar --- sefaria/model/linker/tests/linker_test.py | 25 ++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 1f7ea636b0..66757d6350 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -29,10 +29,6 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ - # Using addressTypes of alt structs - [crrd(["@JT", "@Berakhot", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:7-11",)], - [crrd(["@JT", "@Berakhot", "@Chapter 1", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:7-11",)], - # Numbered JAs [crrd(["@בבלי", "@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud @@ -61,7 +57,7 @@ def test_resolved_raw_ref_clone(): # Named alt structs [crrd(["@פרק אלו דברים", "@בפסחים"]), ("Pesachim 65b:10-73b:16",)], # talmud perek (that's ambiguous) - 
[crrd(["@פרק אלו דברים"]), ("Pesachim 65b:10-73b:16", "Berakhot 51b:11-53b:33")], # talmud perek without book that's ambiguous + [crrd(["@פרק אלו דברים"]), ("Pesachim 65b:10-73b:16", "Berakhot 51b:11-53b:33", "Jerusalem Talmud Berakhot 8:1:1-8:7", "Jerusalem Talmud Pesachim 6:1:1-6:4", "Jerusalem Talmud Demai 2:1:1-5:4")], # talmud perek without book that's ambiguous [crrd(["@רש\"י", "@פרק יום טוב", "@בביצה"]), ("Rashi on Beitzah 15b:1-23b:10",)], # rashi perek [crrd(["@רש\"י", "@פרק מאימתי"]), ("Rashi on Berakhot 2a:1-13a:15",)], # rashi perek [crrd(["@רש\"י", "@פרק כל כנויי נזירות", "@בנזיר", "*ד\"ה כל כינויי נזירות"]), ("Rashi on Nazir 2a:1:1",)], # rashi perek dibur hamatchil @@ -74,6 +70,16 @@ def test_resolved_raw_ref_clone(): [crrd(["#פרק בתרא", "@בפסחים"]), ("Mishnah Pesachim 10", "Pesachim 99b:1-121b:3")], # numbered talmud perek [crrd(['@מגמ\'', '#דרפ\"ו', '@דנדה']), ("Niddah 48a:11-54b:9",)], # prefixes in front of perek name + # Using addressTypes of alt structs + [crrd(["@JT", "@Bikkurim", "#Chapter 2"], lang="en"), ("Jerusalem Talmud Bikkurim 2",)], + [crrd(["@Tosafot Rabbi Akiva Eiger", "@Shabbat", "#Letter 87"], lang="en"), ("Tosafot Rabbi Akiva Eiger on Mishnah Shabbat 7.2.1",)], + [crrd(["@JT", "@Berakhot", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:2-4", "Jerusalem Talmud Berakhot 1:1:7-11",)], # ambig b/w Venice and Vilna + [crrd(["@JT", "@Berakhot", "#Chapter 1", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:2-4", "Jerusalem Talmud Berakhot 1:1:7-11",)], + [crrd(["@JT", "@Peah", "#10b"], lang="en"), ("Jerusalem Talmud Peah 2:1:1-4",)], # Venice not ambig + [crrd(["@JT", "@Peah", "#Chapter 3", "#15b"], lang="en"), ("Jerusalem Talmud Peah 3:2:4-4:3",)], # Venice not ambig because of chapter + [crrd(["@JT", "@Peah", "#15c"], lang="en"), ("Jerusalem Talmud Peah 1:1:20-30",)], # Folio address + [crrd(["@Chapter 1"], lang="en"), tuple()], # It used to be that Bavli perakim where Chapter N which causes problems for global scope + # Dibur hamatchils [crrd(["@רש\"י", "@יום טוב", "*ד\"ה שמא יפשע"]), ("Rashi on Beitzah 15b:8:1",)], [crrd(["@רש\"י", "@ביצה", "*ד\"ה שמא יפשע"]), ("Rashi on Beitzah 15b:8:1",)], @@ -163,8 +169,13 @@ def test_resolved_raw_ref_clone(): [crrd(['@טור יורה דעה', '#סימן א']), ['Tur, Yoreh Deah 1']], [crrd(['@תוספתא', '@ברכות', '#א', '#א']), ['Tosefta Berakhot 1:1', 'Tosefta Berakhot (Lieberman) 1:1']], # tosefta ambiguity [crrd(['@תוספתא', '@ברכות', '#א', '#טז']), ['Tosefta Berakhot 1:16']], # tosefta ambiguity - [crrd(['@זוה"ק', '#ח"א', '#דף פג:']), ['Zohar 1:83b']], - # pytest.param(crrd(None, 'he', 'זוהר שמות י.', [0, 1, slice(2, 4)], [RPT.NAMED, RPT.NAMED, RPT.NUMBERED]), ['Zohar 2:10a'], marks=pytest.mark.xfail(reason="Don't support Sefer HaTashbetz yet")), # infer Zohar volume from parasha + + # zohar + [crrd(['@זוה"ק', '#ח"א','@לך לך', '@סתרי תורה', '#דף פ.']), ['Zohar, Lech Lecha 10.78-84']], + [crrd(['@זוה"ק', '#ח"א','@לך לך', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זוה"ק', '#ח"א', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זוה"ק', '@לך לך', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זהר חדש', '@בראשית']), ['Zohar Chadash, Bereshit']], [crrd(['@מסכת', '@סופרים', '#ב', '#ג']), ['Tractate Soferim 2:3']], [crrd(['@אדר"נ', '#ב', '#ג']), ["Avot D'Rabbi Natan 2:3"]], From 7761eb2e7c0b0ea4365cd1a14c8306ada4d17940 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 22 Nov 2023 12:19:30 +0200 Subject: [PATCH 060/210] feat(linker): make struct node in alt structures be new 
AltStructNode. This allows us to add linker fields to structural nodes in alt structures. --- sefaria/model/schema.py | 7 +++++++ sefaria/model/text.py | 4 ++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index fe9537c7d3..6ad9e75112 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -1130,6 +1130,13 @@ def is_segment_level_dibur_hamatchil(self) -> bool: return getattr(self, 'isSegmentLevelDiburHamatchil', False) +class AltStructNode(TitledTreeNode): + optional_param_keys = ["match_templates", "numeric_equivalent", 'referenceable'] + + def ref(self): + return None + + class ArrayMapNode(NumberedTitledTreeNode): """ A :class:`TreeNode` that contains jagged arrays of references. diff --git a/sefaria/model/text.py b/sefaria/model/text.py index c913c1172e..2ec1f11d65 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -25,7 +25,7 @@ import re from . import abstract as abst -from .schema import deserialize_tree, SchemaNode, VirtualNode, DictionaryNode, JaggedArrayNode, TitledTreeNode, DictionaryEntryNode, SheetNode, AddressTalmud, Term, TermSet, TitleGroup, AddressType +from .schema import deserialize_tree, AltStructNode, VirtualNode, DictionaryNode, JaggedArrayNode, TitledTreeNode, DictionaryEntryNode, SheetNode, AddressTalmud, Term, TermSet, TitleGroup, AddressType from sefaria.system.database import db import sefaria.system.cache as scache @@ -240,7 +240,7 @@ def _set_struct_objs(self): self.struct_objs = {} if getattr(self, "alt_structs", None) and self.nodes: for name, struct in list(self.alt_structs.items()): - self.struct_objs[name] = deserialize_tree(struct, index=self, struct_class=TitledTreeNode) + self.struct_objs[name] = deserialize_tree(struct, index=self, struct_class=AltStructNode) self.struct_objs[name].title_group = self.nodes.title_group def is_complex(self): From befd778f2ca017c957635583bb514ad240217768 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 22 Nov 2023 12:20:51 +0200 Subject: [PATCH 061/210] fix(linker): fully support ref being None in ResolvedRef. In addition, better logic for removing duplicate refs. --- sefaria/model/linker/ref_resolver.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 478a59b44c..884e7b7177 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -650,6 +650,13 @@ def is_match_correct(match: ResolvedRef) -> bool: @staticmethod def remove_superfluous_matches(thoroughness: ResolutionThoroughness, resolved_refs: List[ResolvedRef]) -> List[ResolvedRef]: + # make matches with refs that are essentially equivalent (i.e. refs cover same span) actually equivalent + resolved_refs.sort(key=lambda x: x.ref and x.ref.order_id()) + for i, r in enumerate(resolved_refs[:-1]): + next_r = resolved_refs[i+1] + if r.ref.contains(next_r.ref) and next_r.ref.contains(r.ref): + next_r.ref = r.ref + # make unique resolved_refs = list({r.ref: r for r in resolved_refs}.values()) if thoroughness >= ResolutionThoroughness.HIGH or len(resolved_refs) > 1: @@ -706,15 +713,15 @@ def _merge_subset_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedRef] Merge matches where one ref is contained in another ref E.g. 
if matchA.ref == Ref("Genesis 1") and matchB.ref == Ref("Genesis 1:1"), matchA will be deleted and its parts will be appended to matchB's parts """ - resolved_refs.sort(key=lambda x: x.ref and x.ref.order_id()) + resolved_refs.sort(key=lambda x: "N/A" if x.ref is None else x.ref.order_id()) merged_resolved_refs = [] next_merged = False for imatch, match in enumerate(resolved_refs[:-1]): - if match.is_ambiguous or match.ref is None or next_merged: + next_match = resolved_refs[imatch+1] + if match.is_ambiguous or match.ref is None or next_match.ref is None or next_merged: merged_resolved_refs += [match] next_merged = False continue - next_match = resolved_refs[imatch+1] if match.ref.index.title != next_match.ref.index.title: # optimization, the easiest cases to check for merged_resolved_refs += [match] From 537ffed8fb63b76c6d222dc710da91ff77b492a7 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 22 Nov 2023 12:21:26 +0200 Subject: [PATCH 062/210] feat(linker): fully support all the attributes of ArrayMapNode that can alter how the mapping should be made. --- .../model/linker/referenceable_book_node.py | 87 +++++++++++++------ 1 file changed, 61 insertions(+), 26 deletions(-) diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index e1dcab4572..ad9f812533 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -4,6 +4,7 @@ from sefaria.model import text from sefaria.model import schema from sefaria.system.exceptions import InputError +from bisect import bisect_right def subref(ref: text.Ref, section: int): @@ -55,7 +56,7 @@ def __init__(self, titled_tree_node_or_index: Union[schema.TitledTreeNode, text. @property def referenceable(self): - return getattr(self._titled_tree_node, 'referenceable', True) + return getattr(self._titled_tree_node, 'referenceable', not self.is_default()) def is_default(self): return self._titled_tree_node.is_default() @@ -76,25 +77,8 @@ def _get_all_children(self) -> List[ReferenceableBookNode]: return [NumberedReferenceableBookNode(thingy)] if isinstance(thingy, text.Index): children = thingy.referenceable_children() - elif isinstance(thingy, schema.ArrayMapNode): - # TODO following two if's are very similar... 
- if getattr(thingy, 'refs', None): - address_types = thingy.addressTypes - section_names = thingy.sectionNames - section_ref_map = {ichild+1: text.Ref(tref) for ichild, tref in enumerate(thingy.refs)} - return [MapReferenceableBookNode(address_types, section_names, section_ref_map)] - elif getattr(thingy, 'wholeRef', None) and getattr(thingy, 'includeSections', False): - whole_ref = text.Ref(thingy.wholeRef) - schema_node = whole_ref.index_node.serialize() - truncated_node = truncate_serialized_node_to_depth(schema_node, -2) - refs = whole_ref.split_spanning_ref() - section_ref_map = {} - for oref in refs: - section = oref.section_ref().sections[0] - section_ref_map[section] = oref - return [MapReferenceableBookNode(section_ref_map=section_ref_map, **truncated_node)] - else: - children = self._titled_tree_node.children + elif isinstance(thingy, schema.ArrayMapNode) and (getattr(thingy, "refs", None) or (getattr(thingy, "wholeRef", None) and getattr(thingy, "includeSections", None))): + return [MapReferenceableBookNode(thingy)] else: # Any other type of TitledTreeNode children = self._titled_tree_node.children @@ -163,7 +147,6 @@ def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, f continue return possible_subrefs, can_match_out_of_order_list - # TODO move these two properties to be private @property def _address_class(self) -> schema.AddressType: return self._ja_node.address_class(0) @@ -216,18 +199,70 @@ def matches_section_context(self, section_context: 'SectionContext') -> bool: class MapReferenceableBookNode(NumberedReferenceableBookNode): """ - Node that can only be referenced by one ref + Node that can only be referenced by refs in a mapping """ - def __init__(self, addressTypes: List[str], sectionNames: List[str], section_ref_map: Dict[int, text.Ref], **ja_node_attrs): - ja_node = schema.JaggedArrayNode(serial={ + def __init__(self, node: schema.ArrayMapNode): + ja_node = self.__make_ja_from_array_map(node) + super().__init__(ja_node) + self._section_ref_map = self.__make_section_ref_map(node) + + @staticmethod + def __make_ja_from_array_map(node: schema.ArrayMapNode): + return MapReferenceableBookNode.__make_ja(**MapReferenceableBookNode.__get_ja_attributes_from_array_map(node)) + + @staticmethod + def __make_ja(addressTypes: List[str], sectionNames: List[str], **ja_node_attrs): + return schema.JaggedArrayNode(serial={ "addressTypes": addressTypes, "sectionNames": sectionNames, **ja_node_attrs, "depth": len(addressTypes), }) - super().__init__(ja_node) - self._section_ref_map = section_ref_map + + @staticmethod + def __get_ja_attributes_from_array_map(node: schema.ArrayMapNode) -> dict: + if getattr(node, 'refs', None): + address_types = node.addressTypes + section_names = node.sectionNames + return {"addressTypes": address_types, "sectionNames": section_names} + elif getattr(node, 'wholeRef', None) and getattr(node, 'includeSections', False): + whole_ref = text.Ref(node.wholeRef) + schema_node = whole_ref.index_node.serialize() + return truncate_serialized_node_to_depth(schema_node, -2) + else: + return {} + + def __make_section_ref_map(self, node: schema.ArrayMapNode) -> Dict[int, text.Ref]: + if getattr(node, 'refs', None): + section_ref_map = { + self.__get_section_with_offset(ichild, node): text.Ref(tref) + for ichild, tref in enumerate(node.refs) + } + elif getattr(node, 'wholeRef', None) and getattr(node, 'includeSections', False): + whole_ref = text.Ref(node.wholeRef) + refs = whole_ref.split_spanning_ref() + section_ref_map = {} + for oref in 
refs: + section = oref.section_ref().sections[0] + section_ref_map[section] = oref + else: + raise Exception("ArrayMapNode doesn't have expected attributes 'refs' or 'wholeRef'.") + return section_ref_map + + def __get_section_with_offset(self, i: int, node: schema.ArrayMapNode) -> int: + addresses = getattr(node, "addresses", None) + if addresses: + return addresses[i] + section = i + 1 + starting_address = getattr(node, "startingAddress", None) + if starting_address: + section = i + self._address_class.toNumber("en", starting_address) + skipped_addresses = getattr(node, "skipped_addresses", None) + if skipped_addresses: + skipped_addresses.sort() + section += bisect_right(skipped_addresses, section) + return section def ref(self): return self._ref From 0de2f2e1e90d9fc9d24eb8a192d07bcfbb6563b7 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 11:47:07 +0200 Subject: [PATCH 063/210] docs(linker): add explanation for new node. --- sefaria/model/schema.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index 6ad9e75112..7115bef730 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -1131,6 +1131,12 @@ def is_segment_level_dibur_hamatchil(self) -> bool: class AltStructNode(TitledTreeNode): + """ + Structural node for alt structs + Allows additional attributes for referencing these nodes with the linker + Note, these nodes can't be the end of a reference since they themselves don't map to a `Ref`. But they are helpful + being intermediate nodes in a longer reference. + """ optional_param_keys = ["match_templates", "numeric_equivalent", 'referenceable'] def ref(self): From 63232a76e6db037cb4a77034e7abd63a3c2202ae Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 12:45:57 +0200 Subject: [PATCH 064/210] refactor(linker): refactor raw_ref_model to named_entity_model which better reflects its purpose --- sefaria/model/linker/named_entity_recognizer.py | 14 +++++++------- sefaria/model/linker/tests/linker_test.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py index 1d6743ae42..47518edd87 100644 --- a/sefaria/model/linker/named_entity_recognizer.py +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -20,9 +20,9 @@ class NamedEntityRecognizer: - groups of people """ - def __init__(self, lang: str, raw_ref_model: Language, raw_ref_part_model: Language): + def __init__(self, lang: str, named_entity_model: Language, raw_ref_part_model: Language): self._lang = lang - self._raw_ref_model = raw_ref_model + self._named_entity_model = named_entity_model self._raw_ref_part_model = raw_ref_part_model self._normalizer = self.__init_normalizer() @@ -144,7 +144,7 @@ def map_normal_output_to_original_input(self, input: str, named_entities: List[R """ Ref resolution ran on normalized input. 
Remap raw refs to original (non-normalized) input """ - unnorm_doc = self._raw_ref_model.make_doc(input) + unnorm_doc = self._named_entity_model.make_doc(input) mapping, subst_end_indices = self._normalizer.get_mapping_after_normalization(input) conv = self._normalizer.norm_to_unnorm_indices_with_mapping norm_inds = [named_entity.char_indices for named_entity in named_entities] @@ -160,8 +160,8 @@ def map_normal_output_to_original_input(self, input: str, named_entities: List[R named_entity.map_new_part_indices(temp_unnorm_part_inds) @property - def raw_ref_model(self): - return self._raw_ref_model + def named_entity_model(self): + return self._named_entity_model @property def raw_ref_part_model(self): @@ -174,7 +174,7 @@ def _normalize_input(self, input: List[str]): return [self._normalizer.normalize(s) for s in input] def _get_raw_named_entity_spans(self, st: str) -> List[SpanOrToken]: - doc = self._raw_ref_model(st) + doc = self._named_entity_model(st) return doc.ents def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: @@ -182,7 +182,7 @@ def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: return doc.ents def _bulk_get_raw_named_entity_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: - for doc in self._raw_ref_model.pipe(input, batch_size=batch_size, **kwargs): + for doc in self._named_entity_model.pipe(input, batch_size=batch_size, **kwargs): if kwargs.get('as_tuples', False): doc, context = doc yield doc.ents, context diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 66757d6350..7bf4b729ac 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -403,7 +403,7 @@ def test_map_new_indices(crrd_params): raw_ref, _, lang, _ = crrd(*crrd_params) text = raw_ref.text linker = library.get_linker(lang) - nlp = linker.get_ner().raw_ref_model + nlp = linker.get_ner().named_entity_model doc = nlp.make_doc(text) indices = raw_ref.char_indices part_indices = [p.char_indices for p in raw_ref.raw_ref_parts] From 2382894cd83af2e4de614de59abafac04a7b2215 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 14:23:08 +0200 Subject: [PATCH 065/210] fix(linker): remove CORSDebugMiddleware that caused duplicate allow origin headers --- sefaria/settings.py | 1 - sefaria/system/middleware.py | 9 --------- 2 files changed, 10 deletions(-) diff --git a/sefaria/settings.py b/sefaria/settings.py index 7eea25ff70..3c0b948fed 100644 --- a/sefaria/settings.py +++ b/sefaria/settings.py @@ -117,7 +117,6 @@ 'sefaria.system.middleware.LanguageCookieMiddleware', 'sefaria.system.middleware.LanguageSettingsMiddleware', 'sefaria.system.middleware.ProfileMiddleware', - 'sefaria.system.middleware.CORSDebugMiddleware', 'sefaria.system.middleware.SharedCacheMiddleware', 'sefaria.system.multiserver.coordinator.MultiServerEventListenerMiddleware', 'django_structlog.middlewares.RequestMiddleware', diff --git a/sefaria/system/middleware.py b/sefaria/system/middleware.py index 8cdc458ac6..be294ab7fd 100644 --- a/sefaria/system/middleware.py +++ b/sefaria/system/middleware.py @@ -170,15 +170,6 @@ def current_domain_lang(request): return domain_lang -class CORSDebugMiddleware(MiddlewareMixin): - def process_response(self, request, response): - if DEBUG: - response["Access-Control-Allow-Origin"] = "*" - response["Access-Control-Allow-Methods"] = "POST, GET" - response["Access-Control-Allow-Headers"] = "*" - return response - - class 
ProfileMiddleware(MiddlewareMixin): """ Displays profiling for any view. From a118328d45ad214342b1a17f0b4d6f59fd9d5b66 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 14:34:01 +0200 Subject: [PATCH 066/210] docs(linker): add docs to NamedEntityRecognizer.__init__ --- sefaria/model/linker/named_entity_recognizer.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py index 47518edd87..7e67f693aa 100644 --- a/sefaria/model/linker/named_entity_recognizer.py +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -21,6 +21,12 @@ class NamedEntityRecognizer: """ def __init__(self, lang: str, named_entity_model: Language, raw_ref_part_model: Language): + """ + + @param lang: language that the Recognizer understands (based on how the models were trained) + @param named_entity_model: spaCy model which takes a string and recognizes where entities are + @param raw_ref_part_model: spaCy model which takes a string raw ref and recognizes the parts of the ref + """ self._lang = lang self._named_entity_model = named_entity_model self._raw_ref_part_model = raw_ref_part_model From 2a2d8923f82c853846beb6a1d83d9dd40b466f44 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 23:20:18 +0200 Subject: [PATCH 067/210] feat(linker): split input by newline which seems to improve models performance --- sefaria/helper/linker.py | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index abaef75b4c..8f3cb5581f 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -121,16 +121,46 @@ def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _Fi context_ref = None if len(title_doc.resolved_refs) == 1 and not title_doc.resolved_refs[0].is_ambiguous: context_ref = title_doc.resolved_refs[0].ref - body_doc = linker.link(request_text.body, context_ref, with_failures=True, type_filter='citation') + body_resolved = _split_and_link(linker, request_text.body, context_ref, with_failures=True, type_filter='citation') response = { "title": _make_find_refs_response_inner(title_doc.resolved_refs, options), - "body": _make_find_refs_response_inner(body_doc.resolved_refs, options), + "body": _make_find_refs_response_inner(body_resolved, options), } return response +def _split_and_link(linker, input_str: str, *linker_args, **linker_kwargs): + from sefaria.model.linker.ref_part import RawRef, span_inds + from sefaria.model.linker.ref_resolver import ResolvedRef + make_doc = linker._ner._named_entity_model.make_doc + full_spacy_doc = make_doc(input_str) + + inputs = input_str.split('\n') + resolved_list = [] + offset = 0 + for curr_input in inputs: + linked_doc = linker.link(curr_input, *linker_args, **linker_kwargs) + # add offset to resolved refs + for curr_resolved in linked_doc.resolved_refs: + assert isinstance(curr_resolved, ResolvedRef) + named_entity = curr_resolved.raw_entity + curr_start, curr_end = span_inds(named_entity.span) + new_start, new_end = curr_start+offset, curr_end+offset + named_entity.span = full_spacy_doc[new_start:new_end] + if isinstance(named_entity, RawRef): + for part in named_entity.raw_ref_parts: + curr_start, curr_end = span_inds(part.span) + new_start, new_end = curr_start+offset, curr_end+offset + part.span = full_spacy_doc[new_start:new_end] + # end add offset + resolved_list += linked_doc.resolved_refs + curr_spacy_doc = 
make_doc(curr_input) + offset += len(curr_spacy_doc) + return resolved_list + + def _make_find_refs_response_linker_v2(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: response = { "title": _make_find_refs_response_inner_linker_v2(request_text.lang, request_text.title, options), From 50c6d0c9b95cf508bbf8190d656cfa1b5b67839b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 23:20:33 +0200 Subject: [PATCH 068/210] Revert "fix(linker): remove CORSDebugMiddleware that caused duplicate allow origin headers" This reverts commit 2382894cd83af2e4de614de59abafac04a7b2215. --- sefaria/settings.py | 1 + sefaria/system/middleware.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/sefaria/settings.py b/sefaria/settings.py index 3c0b948fed..7eea25ff70 100644 --- a/sefaria/settings.py +++ b/sefaria/settings.py @@ -117,6 +117,7 @@ 'sefaria.system.middleware.LanguageCookieMiddleware', 'sefaria.system.middleware.LanguageSettingsMiddleware', 'sefaria.system.middleware.ProfileMiddleware', + 'sefaria.system.middleware.CORSDebugMiddleware', 'sefaria.system.middleware.SharedCacheMiddleware', 'sefaria.system.multiserver.coordinator.MultiServerEventListenerMiddleware', 'django_structlog.middlewares.RequestMiddleware', diff --git a/sefaria/system/middleware.py b/sefaria/system/middleware.py index be294ab7fd..8cdc458ac6 100644 --- a/sefaria/system/middleware.py +++ b/sefaria/system/middleware.py @@ -170,6 +170,15 @@ def current_domain_lang(request): return domain_lang +class CORSDebugMiddleware(MiddlewareMixin): + def process_response(self, request, response): + if DEBUG: + response["Access-Control-Allow-Origin"] = "*" + response["Access-Control-Allow-Methods"] = "POST, GET" + response["Access-Control-Allow-Headers"] = "*" + return response + + class ProfileMiddleware(MiddlewareMixin): """ Displays profiling for any view. From f3506989510a0d2166e188661880d68b4caccc2f Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 23:41:55 +0200 Subject: [PATCH 069/210] feat(linker): split input by newline which seems to improve models performance --- sefaria/helper/linker.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 8f3cb5581f..47e17b0125 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -134,10 +134,11 @@ def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _Fi def _split_and_link(linker, input_str: str, *linker_args, **linker_kwargs): from sefaria.model.linker.ref_part import RawRef, span_inds from sefaria.model.linker.ref_resolver import ResolvedRef + import re make_doc = linker._ner._named_entity_model.make_doc full_spacy_doc = make_doc(input_str) - inputs = input_str.split('\n') + inputs = re.split(r'\n+', input_str) resolved_list = [] offset = 0 for curr_input in inputs: @@ -157,7 +158,7 @@ def _split_and_link(linker, input_str: str, *linker_args, **linker_kwargs): # end add offset resolved_list += linked_doc.resolved_refs curr_spacy_doc = make_doc(curr_input) - offset += len(curr_spacy_doc) + offset += len(curr_spacy_doc)+1 # 1 for newline return resolved_list From d509f5b33dc0c5ca79940f1e803aa85941e90249 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 23 Nov 2023 23:20:33 +0200 Subject: [PATCH 070/210] Revert "fix(linker): remove CORSDebugMiddleware that caused duplicate allow origin headers" This reverts commit 2382894cd83af2e4de614de59abafac04a7b2215. 
--- sefaria/settings.py | 1 + sefaria/system/middleware.py | 9 +++++++++ 2 files changed, 10 insertions(+) diff --git a/sefaria/settings.py b/sefaria/settings.py index 3c0b948fed..7eea25ff70 100644 --- a/sefaria/settings.py +++ b/sefaria/settings.py @@ -117,6 +117,7 @@ 'sefaria.system.middleware.LanguageCookieMiddleware', 'sefaria.system.middleware.LanguageSettingsMiddleware', 'sefaria.system.middleware.ProfileMiddleware', + 'sefaria.system.middleware.CORSDebugMiddleware', 'sefaria.system.middleware.SharedCacheMiddleware', 'sefaria.system.multiserver.coordinator.MultiServerEventListenerMiddleware', 'django_structlog.middlewares.RequestMiddleware', diff --git a/sefaria/system/middleware.py b/sefaria/system/middleware.py index be294ab7fd..8cdc458ac6 100644 --- a/sefaria/system/middleware.py +++ b/sefaria/system/middleware.py @@ -170,6 +170,15 @@ def current_domain_lang(request): return domain_lang +class CORSDebugMiddleware(MiddlewareMixin): + def process_response(self, request, response): + if DEBUG: + response["Access-Control-Allow-Origin"] = "*" + response["Access-Control-Allow-Methods"] = "POST, GET" + response["Access-Control-Allow-Headers"] = "*" + return response + + class ProfileMiddleware(MiddlewareMixin): """ Displays profiling for any view. From e0a45ebd8942b65669861d9c09c1bb1a9dbe1c0e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 07:38:51 +0200 Subject: [PATCH 071/210] fix(linker): only add CORS headers when debugging on localhost --- sefaria/system/middleware.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/sefaria/system/middleware.py b/sefaria/system/middleware.py index 8cdc458ac6..0e67309ded 100644 --- a/sefaria/system/middleware.py +++ b/sefaria/system/middleware.py @@ -172,7 +172,12 @@ def current_domain_lang(request): class CORSDebugMiddleware(MiddlewareMixin): def process_response(self, request, response): - if DEBUG: + """ + CORS headers are normally added in nginx response. + However, nginx isn't normally running when debugging with localhost + """ + origin = request.get_host() + if ('localhost' in origin or '127.0.0.1' in origin) and DEBUG: response["Access-Control-Allow-Origin"] = "*" response["Access-Control-Allow-Methods"] = "POST, GET" response["Access-Control-Allow-Headers"] = "*" From 5e6bb95ec79a9b099e46ee0adecae425d8b4631d Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 09:50:33 +0200 Subject: [PATCH 072/210] fix(linker): use bulk_link to improve speed. better splitting regex that matches spacy's tokenizer. 
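A toy illustration of the splitting change (made-up strings; only the split behavior matters here):

    import re
    text = "first paragraph \n\n  second paragraph"
    re.split(r'\n+', text)        # ['first paragraph ', '  second paragraph']
    re.split(r'\s*\n+\s*', text)  # ['first paragraph', 'second paragraph']

Swallowing the whitespace around the newlines keeps stray spaces out of the per-paragraph strings, which should keep the `offset += len(curr_spacy_doc)+1` bookkeeping below in line with how spacy tokenizes the full input.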
--- sefaria/helper/linker.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 47e17b0125..ce27105587 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -2,6 +2,7 @@ import json import spacy import structlog +from functools import reduce from sefaria.model.linker.ref_part import TermContext, RefPartType from sefaria.model.linker.ref_resolver import PossiblyAmbigResolvedRef from sefaria.model import text, library @@ -131,21 +132,18 @@ def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _Fi return response -def _split_and_link(linker, input_str: str, *linker_args, **linker_kwargs): +def _split_and_link(linker, input_str: str, book_context_ref, *linker_args, **linker_kwargs): from sefaria.model.linker.ref_part import RawRef, span_inds - from sefaria.model.linker.ref_resolver import ResolvedRef import re make_doc = linker._ner._named_entity_model.make_doc full_spacy_doc = make_doc(input_str) - inputs = re.split(r'\n+', input_str) - resolved_list = [] + inputs = re.split(r'\s*\n+\s*', input_str) + linked_docs = linker.bulk_link(inputs, [book_context_ref]*len(inputs), *linker_args, **linker_kwargs) offset = 0 - for curr_input in inputs: - linked_doc = linker.link(curr_input, *linker_args, **linker_kwargs) + for curr_input, linked_doc in zip(inputs, linked_docs): # add offset to resolved refs for curr_resolved in linked_doc.resolved_refs: - assert isinstance(curr_resolved, ResolvedRef) named_entity = curr_resolved.raw_entity curr_start, curr_end = span_inds(named_entity.span) new_start, new_end = curr_start+offset, curr_end+offset @@ -156,10 +154,9 @@ def _split_and_link(linker, input_str: str, *linker_args, **linker_kwargs): new_start, new_end = curr_start+offset, curr_end+offset part.span = full_spacy_doc[new_start:new_end] # end add offset - resolved_list += linked_doc.resolved_refs curr_spacy_doc = make_doc(curr_input) offset += len(curr_spacy_doc)+1 # 1 for newline - return resolved_list + return reduce(lambda a, b: a + b.resolved_refs, linked_docs, []) def _make_find_refs_response_linker_v2(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: From d95462cf68dc7eec014dc8e9e096bdda05c6d099 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 12:25:37 +0200 Subject: [PATCH 073/210] fix(linker): find text beyond element boundaries by treating all whitespace as the same --- static/js/linker.v3/main.js | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/static/js/linker.v3/main.js b/static/js/linker.v3/main.js index b5dab6b852..6f5ba9215d 100644 --- a/static/js/linker.v3/main.js +++ b/static/js/linker.v3/main.js @@ -6,6 +6,10 @@ import {LinkExcluder} from "./excluder"; (function(ns) { + function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + } + function sanitizeElem(elem) { const cleaned = DOMPurify.sanitize(elem, { USE_PROFILES: { html: true } }); const cleanedElem = document.createElement("div"); @@ -100,7 +104,6 @@ import {LinkExcluder} from "./excluder"; function findOccurrences(text) { const occurrences = []; findAndReplaceDOMText(document, { - preset: 'prose', find: text, replace: function(portion, match) { if (portion.index === 0) { @@ -114,7 +117,7 @@ import {LinkExcluder} from "./excluder"; function getNextWhiteSpaceIndex(text) { const match = text.match(/\S\s+/); // `\S` so whitespace can't be at beginning of 
string - if (match === null || text.substring(0, match.index+1).indexOf('\n') > -1) { return -1; } // \n's are added in by Readability and therefore make it challenging to match against. stop when you hit one. + if (match === null) { return -1; } return match.index + 1; } @@ -142,9 +145,12 @@ import {LinkExcluder} from "./excluder"; const newEndChar = getNthWhiteSpaceIndex(text, numWordsAround, endChar); const textRev = [...text].reverse().join(""); const newStartChar = text.length - getNthWhiteSpaceIndex(textRev, numWordsAround, text.length - startChar); - const wordsAroundText = text.substring(newStartChar, newEndChar); + const wordsAroundText = escapeRegExp(text.substring(newStartChar, newEndChar)); + // findAndReplaceDOMText and Readability deal with element boundaries differently + // in order to more flexibly find these boundaries, we treat all whitespace the same + const wordsAroundReg = wordsAroundText.replace(/\s+/g, '\\s+'); return { - text: wordsAroundText, + text: RegExp(wordsAroundReg, "g"), startChar: startChar - newStartChar, }; } @@ -193,6 +199,17 @@ import {LinkExcluder} from "./excluder"; return node; } } + function isMatchUniqueEnough(globalLinkStarts, match, charError=5) { + /** + * Return true if `match` represents one of the matches we've determined to be unique enough to represent this link + */ + for (let globalStart of globalLinkStarts) { + if (Math.abs(match.startIndex - globalStart) <= charError) { + return true; + } + } + return false; + } function wrapRef(linkObj, normalizedText, refData, iLinkObj, resultsKey, maxNumWordsAround = 10, maxSearchLength = 30) { /** @@ -228,11 +245,11 @@ import {LinkExcluder} from "./excluder"; } const globalLinkStarts = occurrences.map(([start, end]) => linkStartChar + start); findAndReplaceDOMText(document, { - preset: 'prose', find: linkObj.text, replace: function(portion, match) { - // check this is the unique match found above - if (globalLinkStarts.indexOf(match.startIndex) === -1) { return portion.text; } + if (!isMatchUniqueEnough(globalLinkStarts, match)) { + return portion.text; + } // check if should be excluded from linking and/or tracking const matchKey = match.startIndex + "|" + match.endIndex; From ea721dd61f270e57194c13bb352b638db70eeafd Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 12:25:37 +0200 Subject: [PATCH 074/210] fix(linker): find text beyond element boundaries by treating all whitespace as the same --- static/js/linker.v3/main.js | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/static/js/linker.v3/main.js b/static/js/linker.v3/main.js index b5dab6b852..6f5ba9215d 100644 --- a/static/js/linker.v3/main.js +++ b/static/js/linker.v3/main.js @@ -6,6 +6,10 @@ import {LinkExcluder} from "./excluder"; (function(ns) { + function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + } + function sanitizeElem(elem) { const cleaned = DOMPurify.sanitize(elem, { USE_PROFILES: { html: true } }); const cleanedElem = document.createElement("div"); @@ -100,7 +104,6 @@ import {LinkExcluder} from "./excluder"; function findOccurrences(text) { const occurrences = []; findAndReplaceDOMText(document, { - preset: 'prose', find: text, replace: function(portion, match) { if (portion.index === 0) { @@ -114,7 +117,7 @@ import {LinkExcluder} from "./excluder"; function getNextWhiteSpaceIndex(text) { const match = text.match(/\S\s+/); // `\S` so whitespace can't be at beginning of string - if (match === 
null || text.substring(0, match.index+1).indexOf('\n') > -1) { return -1; } // \n's are added in by Readability and therefore make it challenging to match against. stop when you hit one. + if (match === null) { return -1; } return match.index + 1; } @@ -142,9 +145,12 @@ import {LinkExcluder} from "./excluder"; const newEndChar = getNthWhiteSpaceIndex(text, numWordsAround, endChar); const textRev = [...text].reverse().join(""); const newStartChar = text.length - getNthWhiteSpaceIndex(textRev, numWordsAround, text.length - startChar); - const wordsAroundText = text.substring(newStartChar, newEndChar); + const wordsAroundText = escapeRegExp(text.substring(newStartChar, newEndChar)); + // findAndReplaceDOMText and Readability deal with element boundaries differently + // in order to more flexibly find these boundaries, we treat all whitespace the same + const wordsAroundReg = wordsAroundText.replace(/\s+/g, '\\s+'); return { - text: wordsAroundText, + text: RegExp(wordsAroundReg, "g"), startChar: startChar - newStartChar, }; } @@ -193,6 +199,17 @@ import {LinkExcluder} from "./excluder"; return node; } } + function isMatchUniqueEnough(globalLinkStarts, match, charError=5) { + /** + * Return true if `match` represents one of the matches we've determined to be unique enough to represent this link + */ + for (let globalStart of globalLinkStarts) { + if (Math.abs(match.startIndex - globalStart) <= charError) { + return true; + } + } + return false; + } function wrapRef(linkObj, normalizedText, refData, iLinkObj, resultsKey, maxNumWordsAround = 10, maxSearchLength = 30) { /** @@ -228,11 +245,11 @@ import {LinkExcluder} from "./excluder"; } const globalLinkStarts = occurrences.map(([start, end]) => linkStartChar + start); findAndReplaceDOMText(document, { - preset: 'prose', find: linkObj.text, replace: function(portion, match) { - // check this is the unique match found above - if (globalLinkStarts.indexOf(match.startIndex) === -1) { return portion.text; } + if (!isMatchUniqueEnough(globalLinkStarts, match)) { + return portion.text; + } // check if should be excluded from linking and/or tracking const matchKey = match.startIndex + "|" + match.endIndex; From 23636ef1af4b0fd1bd7b56740a07ac7cddd44f9f Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 15:16:26 +0200 Subject: [PATCH 075/210] refactor(linker): move logic to split and run by paragraph into Linker class --- sefaria/helper/linker.py | 31 ++--------------- sefaria/model/linker/linker.py | 32 +++++++++++++++++ .../model/linker/named_entity_recognizer.py | 4 +-- sefaria/model/linker/ref_part.py | 34 ++++++++++++++++--- sefaria/model/linker/tests/linker_test.py | 4 +-- 5 files changed, 67 insertions(+), 38 deletions(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index ce27105587..f79c4378cc 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -122,43 +122,16 @@ def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _Fi context_ref = None if len(title_doc.resolved_refs) == 1 and not title_doc.resolved_refs[0].is_ambiguous: context_ref = title_doc.resolved_refs[0].ref - body_resolved = _split_and_link(linker, request_text.body, context_ref, with_failures=True, type_filter='citation') + body_doc = linker.link_by_paragraph(request_text.body, context_ref, with_failures=True, type_filter='citation') response = { "title": _make_find_refs_response_inner(title_doc.resolved_refs, options), - "body": _make_find_refs_response_inner(body_resolved, options), + "body": 
_make_find_refs_response_inner(body_doc.resolved_refs, options), } return response -def _split_and_link(linker, input_str: str, book_context_ref, *linker_args, **linker_kwargs): - from sefaria.model.linker.ref_part import RawRef, span_inds - import re - make_doc = linker._ner._named_entity_model.make_doc - full_spacy_doc = make_doc(input_str) - - inputs = re.split(r'\s*\n+\s*', input_str) - linked_docs = linker.bulk_link(inputs, [book_context_ref]*len(inputs), *linker_args, **linker_kwargs) - offset = 0 - for curr_input, linked_doc in zip(inputs, linked_docs): - # add offset to resolved refs - for curr_resolved in linked_doc.resolved_refs: - named_entity = curr_resolved.raw_entity - curr_start, curr_end = span_inds(named_entity.span) - new_start, new_end = curr_start+offset, curr_end+offset - named_entity.span = full_spacy_doc[new_start:new_end] - if isinstance(named_entity, RawRef): - for part in named_entity.raw_ref_parts: - curr_start, curr_end = span_inds(part.span) - new_start, new_end = curr_start+offset, curr_end+offset - part.span = full_spacy_doc[new_start:new_end] - # end add offset - curr_spacy_doc = make_doc(curr_input) - offset += len(curr_spacy_doc)+1 # 1 for newline - return reduce(lambda a, b: a + b.resolved_refs, linked_docs, []) - - def _make_find_refs_response_linker_v2(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: response = { "title": _make_find_refs_response_inner_linker_v2(request_text.lang, request_text.title, options), diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 522a4cb204..49b7fab906 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -78,6 +78,38 @@ def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_fail self._ner.map_normal_output_to_original_input(input_str, [x.raw_entity for x in doc.all_resolved]) return doc + def link_by_paragraph(self, input_str: str, book_context_ref: Ref, *link_args, **link_kwargs) -> LinkedDoc: + """ + Similar to `link()` except model is run on each paragraph individually (via a bulk operation) + This better mimics the way the underlying ML models were trained and tends to lead to better results + Paragraphs are delineated by new line characters + @param input_str: + @param book_context_ref: + @param link_args: *args to be passed to link() + @param link_kwargs: **kwargs to be passed to link() + @return: + """ + import re + + inputs = re.split(r'\s*\n+\s*', input_str) + linked_docs = self.bulk_link(inputs, [book_context_ref]*len(inputs), *link_args, **link_kwargs) + resolved_refs = [] + resolved_named_entities = [] + full_spacy_doc = self._ner.named_entity_model.make_doc(input_str) + offset = 0 + for curr_input, linked_doc in zip(inputs, linked_docs): + resolved_refs += linked_doc.resolved_refs + resolved_named_entities += linked_doc.resolved_named_entities + + for resolved in linked_doc.all_resolved: + named_entity = resolved.raw_entity + named_entity.align_to_new_doc(full_spacy_doc, offset) + if isinstance(named_entity, RawRef): + named_entity.align_parts_to_new_doc(full_spacy_doc, offset) + curr_token_count = len(self._ner.named_entity_model.make_doc(curr_input)) + offset += curr_token_count+1 # +1 for newline token + return LinkedDoc(input_str, resolved_refs, resolved_named_entities) + def get_ner(self) -> NamedEntityRecognizer: return self._ner diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py index 7e67f693aa..dabf190cd2 100644 --- 
a/sefaria/model/linker/named_entity_recognizer.py +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -161,9 +161,9 @@ def map_normal_output_to_original_input(self, input: str, named_entities: List[R unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] for part in raw_ref_parts], mapping, subst_end_indices)] for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): - named_entity.map_new_indices(unnorm_doc, temp_unnorm_inds) + named_entity.map_new_char_indices(unnorm_doc, temp_unnorm_inds) if isinstance(named_entity, RawRef): - named_entity.map_new_part_indices(temp_unnorm_part_inds) + named_entity.map_new_part_char_indices(temp_unnorm_part_inds) @property def named_entity_model(self): diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index cd5340152f..b628310cdd 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -316,12 +316,24 @@ def __init__(self, span: SpanOrToken, type: NamedEntityType, **cloneable_kwargs) self.span = span self.type = type - def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int]) -> None: + def map_new_char_indices(self, new_doc: Doc, new_char_indices: Tuple[int, int]) -> None: """ Remap self.span to new indices """ - self.span = new_doc.char_span(*new_indices, alignment_mode='expand') - if self.span is None: raise InputError(f"${new_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") + self.span = new_doc.char_span(*new_char_indices, alignment_mode='expand') + if self.span is None: raise InputError(f"${new_char_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") + + def align_to_new_doc(self, new_doc: Doc, offset: int) -> None: + """ + Aligns underlying span to `new_doc`'s tokens. 
Assumption is `new_doc` has some token offset from the original + doc of `self.span` + + @param new_doc: new Doc to align to + @param offset: token offset that aligns tokens in `self.span` to `new_doc + """ + curr_start, curr_end = span_inds(self.span) + new_start, new_end = curr_start+offset, curr_end+offset + self.span = new_doc[new_start:new_end] @property def text(self): @@ -454,13 +466,25 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R new_parts_to_match = self.parts_to_match return self.clone(raw_ref_parts=new_parts, parts_to_match=new_parts_to_match), apart, bpart - def map_new_part_indices(self, new_part_indices: List[Tuple[int, int]]) -> None: + def map_new_part_char_indices(self, new_part_char_indices: List[Tuple[int, int]]) -> None: """ Remap self.span and all spans of parts to new indices """ start_char, _ = self.char_indices doc_span = self.span.as_doc() - for part, temp_part_indices in zip(self.raw_ref_parts, new_part_indices): + for part, temp_part_indices in zip(self.raw_ref_parts, new_part_char_indices): part.span = doc_span.char_span(*[i-start_char for i in temp_part_indices], alignment_mode='expand') if part.span is None: raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}.") + + def align_parts_to_new_doc(self, new_doc: Doc, offset: int) -> None: + """ + See `RawNamedEntity.align_to_new_doc` + @param new_doc: + @param offset: + @return: + """ + for part in self.raw_ref_parts: + curr_start, curr_end = span_inds(part.span) + new_start, new_end = curr_start+offset, curr_end+offset + part.span = new_doc[new_start:new_end] diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 7bf4b729ac..5058a2efd6 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -428,8 +428,8 @@ def test_map_new_indices(crrd_params): # test assert norm_raw_ref.text == norm_text.strip() - norm_raw_ref.map_new_indices(doc, indices) - norm_raw_ref.map_new_part_indices(part_indices) + norm_raw_ref.map_new_char_indices(doc, indices) + norm_raw_ref.map_new_part_char_indices(part_indices) assert norm_raw_ref.text == raw_ref.text for norm_part, part in zip(norm_raw_ref.raw_ref_parts, raw_ref.raw_ref_parts): assert norm_part.text == part.text From 738c93c048005c77f8659d0cdc753a7aa228b2f4 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 26 Nov 2023 23:23:11 +0200 Subject: [PATCH 076/210] fix(linker): dont say match is unique enough if it hasn't increased the search by at least one word. --- static/js/linker.v3/main.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/static/js/linker.v3/main.js b/static/js/linker.v3/main.js index 6f5ba9215d..700de691c4 100644 --- a/static/js/linker.v3/main.js +++ b/static/js/linker.v3/main.js @@ -210,6 +210,15 @@ import {LinkExcluder} from "./excluder"; } return false; } + function isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength=30) { + /** + * return true if first occurrence is sufficiently long (longer than `maxSearchLength`) + * AND searchText includes more than just the text of the link. 
+ */ + if (occurrences.length === 0) { return false; } + const firstOccurrenceLength = occurrences[0][1] - occurrences[0][0]; + return firstOccurrenceLength >= maxSearchLength && firstOccurrenceLength > linkObj.text.length; + } function wrapRef(linkObj, normalizedText, refData, iLinkObj, resultsKey, maxNumWordsAround = 10, maxSearchLength = 30) { /** @@ -235,9 +244,9 @@ import {LinkExcluder} from "./excluder"; ({ text: searchText, startChar: linkStartChar } = getNumWordsAround(linkObj, normalizedText, numWordsAround)); occurrences = findOccurrences(searchText); numWordsAround += 1; - if (searchText.length >= maxSearchLength) { break; } + if (isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength)) { break; } } - if (occurrences.length === 0 || (occurrences.length > 1 && searchText.length < maxSearchLength)) { + if (occurrences.length !== 1 && !isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength)) { if (ns.debug) { console.log("MISSED", numWordsAround, occurrences.length, linkObj); } From a60f866f49538267a4a7f524b96033fda18a6e0b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 27 Nov 2023 10:02:00 +0200 Subject: [PATCH 077/210] fix(linker): change parameter order to match the way we're passing parameters. --- sefaria/helper/linker.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index 35a8ef8251..fbd9801b9a 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -52,8 +52,8 @@ class _FindRefsTextOptions: @attr version_preferences_by_corpus: dict of dicts of the form { : { : }} """ - debug: bool = False with_text: bool = False + debug: bool = False max_segments: int = 0 version_preferences_by_corpus: dict = None From 658b363ffd4d27e83d9ab90f23ed2b589bd2b4e7 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 27 Nov 2023 14:35:24 +0200 Subject: [PATCH 078/210] fix(linker): dont pull context if previous non cts ref is ambiguous --- sefaria/model/linker/ref_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 884e7b7177..53820588b5 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -326,7 +326,7 @@ def resolve_raw_ref(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) is_non_cts = i > 0 and len(resolved_list) > 0 if is_non_cts: # TODO assumes context is only first resolved ref - book_context_ref = resolved_list[0].ref + book_context_ref = None if resolved_list[0].is_ambiguous else resolved_list[0].ref context_swap_map = None if book_context_ref is None else getattr(book_context_ref.index.nodes, 'ref_resolver_context_swaps', None) self._apply_context_swaps(raw_ref, context_swap_map) From 8d46aa4d8c99874f594ae368d648801c89c7dc03 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 26 Dec 2023 10:19:09 +0200 Subject: [PATCH 079/210] fix(linker): fix how parts are offset in link_by_paragraph() --- sefaria/model/linker/linker.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index 49b7fab906..e00b5d726c 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -2,7 +2,7 @@ from typing import List, Optional, Union, Iterable, Tuple from tqdm import tqdm from sefaria.model.text import Ref -from sefaria.model.linker.ref_part import RawRef, RawNamedEntity +from sefaria.model.linker.ref_part import RawRef, RawNamedEntity, 
span_inds from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness, PossiblyAmbigResolvedRef from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, ResolvedNamedEntity from sefaria.model.linker.named_entity_recognizer import NamedEntityRecognizer @@ -105,7 +105,9 @@ def link_by_paragraph(self, input_str: str, book_context_ref: Ref, *link_args, * named_entity = resolved.raw_entity named_entity.align_to_new_doc(full_spacy_doc, offset) if isinstance(named_entity, RawRef): - named_entity.align_parts_to_new_doc(full_spacy_doc, offset) + # named_entity's current start has already been offset so it's the offset we need for each part + raw_ref_offset, _ = span_inds(named_entity.span) + named_entity.align_parts_to_new_doc(full_spacy_doc, raw_ref_offset) curr_token_count = len(self._ner.named_entity_model.make_doc(curr_input)) offset += curr_token_count+1 # +1 for newline token return LinkedDoc(input_str, resolved_refs, resolved_named_entities) From c4680a08801233cab62b46d11887e9d6c22575ca Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Tue, 23 Jan 2024 14:48:34 +0200 Subject: [PATCH 080/210] chore: add two tests to crrd --- sefaria/model/linker/tests/linker_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 5058a2efd6..ae35ab8d73 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -30,6 +30,8 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ # Numbered JAs + [crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)], + [crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)], [crrd(["@בבלי", "@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@בבלי", "@שבת", "#דף ב."]), ("Shabbat 2a",)], # amud-ful talmud From b07d4f58f1e2c806509545b37f4863b947c567b8 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 28 Jan 2024 12:14:46 +0200 Subject: [PATCH 081/210] fix(linker): normalize the URL before querying for a domain name. This will remove the extra www. if it appears. 
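A rough sketch of what the normalization buys us here (made-up domain; `strip_www` is just a local name for this sketch — WebPage.normalize_url applies a whole list of rules, and its www rule only catches schemeless domains once the follow-up commit makes the scheme optional):

    import re

    def strip_www(url):
        # the "remove www" rule from WebPage.normalize_url, with the scheme made optional
        return re.sub(r"^(https?://)?www\.", r"\1", url)

    strip_www("www.mysite.org")          # -> 'mysite.org' (the unmatched scheme group substitutes as '')
    strip_www("https://www.mysite.org")  # -> 'https://mysite.org'

So a request to websites_api for "www.mysite.org" now queries the same domain string as "mysite.org".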
--- sefaria/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/sefaria/views.py b/sefaria/views.py index 76dd901c98..ba1d880f6e 100644 --- a/sefaria/views.py +++ b/sefaria/views.py @@ -343,6 +343,7 @@ def find_refs_api(request): @api_view(["GET"]) def websites_api(request, domain): cb = request.GET.get("callback", None) + domain = WebPage.normalize_url(domain) website = WebSite().load({"domains": domain}) if website is None: return jsonResponse({"error": f"no website found with domain: '{domain}'"}) From dd32508e7a61b3ebe7f3baf4689902a94756247b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 28 Jan 2024 12:21:17 +0200 Subject: [PATCH 082/210] fix(linker): make https optional in WebPage.normalize_url so that the function is more reusable --- sefaria/model/webpage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/webpage.py b/sefaria/model/webpage.py index 5be91223ca..10e2c00300 100644 --- a/sefaria/model/webpage.py +++ b/sefaria/model/webpage.py @@ -113,7 +113,7 @@ def normalize_url(url): "remove url params": lambda url: re.sub(r"\?.+", "", url), "remove utm params": lambda url: re.sub(r"\?utm_.+", "", url), "remove fbclid param": lambda url: re.sub(r"\?fbclid=.+", "", url), - "remove www": lambda url: re.sub(r"^(https?://)www\.", r"\1", url), + "remove www": lambda url: re.sub(r"^(https?://)?www\.", r"\1", url), "remove mediawiki params": lambda url: re.sub(r"&.+", "", url), "remove sort param": lambda url: re.sub(r"\?sort=.+", "", url), "remove all params after id": lambda url: re.sub(r"(\?id=\d+).+$", r"\1", url) From f1676abeea29f9ccb2cd5865ca49c656a8185f3c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 28 Jan 2024 15:23:19 +0200 Subject: [PATCH 083/210] fix(linker): allow topics map to be empty --- sefaria/model/linker/named_entity_resolver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index b8384d94fd..085121e557 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -105,7 +105,7 @@ def __init__(self, lang: str, named_entity_types_to_topics: Dict[str, Dict[str, named_entity_type: self.__generate_topic_list_from_spec(topic_spec) for named_entity_type, topic_spec in named_entity_types_to_topics.items() } - all_topics = reduce(lambda a, b: a + b, topics_by_type.values()) + all_topics = reduce(lambda a, b: a + b, topics_by_type.values(), []) self._slug_topic_map = {t.slug: t for t in all_topics} self._title_slug_map_by_type = { named_entity_type: self.__get_title_map_for_topics(topics_by_type[named_entity_type]) From b65108ff580b66b39689f200886c2e4439041f2e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 28 Jan 2024 15:23:54 +0200 Subject: [PATCH 084/210] fix(linker): more precisely remove duplicate matches at book level. Make sure not to remove duplicate refs that matched different parts. 
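A toy illustration of the new pruning semantics (plain sets stand in for each match's resolved-part character indices; the real call site passes ResolvedRef objects plus a key function, and ResolvedRefPruner is the class the staticmethod below is added to):

    from sefaria.model.linker.ref_resolver import ResolvedRefPruner

    part_sets = [{(0, 5)}, {(0, 5), (6, 9)}, {(10, 14)}]
    ResolvedRefPruner.remove_subset_sets(part_sets)
    # -> [{(0, 5), (6, 9)}, {(10, 14)}]
    # {(0, 5)} is covered by another match's parts and is dropped, but {(10, 14)}
    # matched different parts, so it is no longer treated as a duplicate

Previously everything but the single match with the most resolved parts was discarded per book, which is what dropped refs that had matched different parts.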
--- sefaria/model/linker/ref_resolver.py | 20 ++++++++++++++++++- .../model/linker/referenceable_book_node.py | 12 ++++++++++- sefaria/model/linker/tests/linker_test.py | 1 + sefaria/model/schema.py | 2 +- 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 53820588b5..bcc6ab38c9 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -581,9 +581,27 @@ def prune_unrefined_ref_part_matches(ref_part_matches: List[ResolvedRef]) -> Lis index_match_map[key] += [match] pruned_matches = [] for match_list in index_match_map.values(): - pruned_matches += [max(match_list, key=lambda m: m.num_resolved())] + pruned_matches += ResolvedRefPruner.remove_subset_sets(match_list, key=lambda match: set(part.char_indices for part in match.get_resolved_parts())) return pruned_matches + @staticmethod + def remove_subset_sets(items, key=None): + if key: + sets_to_filter = [key(x) for x in items] + else: + sets_to_filter = items + items, sets_to_filter = zip(*sorted((zip(items, sets_to_filter)), key=lambda x: len(x[1]), reverse=True)) + result = [] + for i in range(len(sets_to_filter)): + for j in range(i): + if sets_to_filter[i].issubset(sets_to_filter[j]): + # Break the loop as the sublist is a subset of a previous sublist + break + else: + # If the sublist is not a subset of any previous sublist, add it to the result + result.append(items[i]) + return result + @staticmethod def do_explicit_sections_match_before_context_sections(match: ResolvedRef) -> bool: first_explicit_section = None diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index ad9f812533..52d477dc5b 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -67,6 +67,16 @@ def get_numeric_equivalent(self): def ref(self) -> text.Ref: return self._titled_tree_node.ref() + @staticmethod + def _is_array_map_referenceable(node: schema.ArrayMapNode) -> bool: + if not getattr(node, "isMapReferenceable", True): + return False + if getattr(node, "refs", None): + return True + if getattr(node, "wholeRef", None) and getattr(node, "includeSections", None): + return True + return False + def _get_all_children(self) -> List[ReferenceableBookNode]: thingy = self._titled_tree_node_or_index # the schema node for this referenceable node has a dibur hamatchil child @@ -77,7 +87,7 @@ def _get_all_children(self) -> List[ReferenceableBookNode]: return [NumberedReferenceableBookNode(thingy)] if isinstance(thingy, text.Index): children = thingy.referenceable_children() - elif isinstance(thingy, schema.ArrayMapNode) and (getattr(thingy, "refs", None) or (getattr(thingy, "wholeRef", None) and getattr(thingy, "includeSections", None))): + elif isinstance(thingy, schema.ArrayMapNode) and self._is_array_map_referenceable(thingy): return [MapReferenceableBookNode(thingy)] else: # Any other type of TitledTreeNode diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index ae35ab8d73..c1c20675c6 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -130,6 +130,7 @@ def test_resolved_raw_ref_clone(): [crrd(['<לקמן', '#משנה א'], "Mishnah Berakhot 1", prev_trefs=['Mishnah Shabbat 1']), ("Mishnah Berakhot 1:1",)], # competing relative and sham # Superfluous information + [crrd(['@Vayikra', '@Leviticus', '#1'], lang='en'), ("Leviticus 
1",)], [crrd(['@תוספות', '#פרק קמא', '@דברכות', '#דף ב']), ['Tosafot on Berakhot 2']], # YERUSHALMI EN diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index 95396b4d4f..189e5d327a 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -1153,7 +1153,7 @@ class ArrayMapNode(NumberedTitledTreeNode): (e.g., Parsha structures of chapter/verse stored Tanach, or Perek structures of Daf/Line stored Talmud) """ required_param_keys = ["depth", "wholeRef"] - optional_param_keys = ["lengths", "addressTypes", "sectionNames", "refs", "includeSections", "startingAddress", "match_templates", "numeric_equivalent", "referenceableSections", "isSegmentLevelDiburHamatchil", "diburHamatchilRegexes", 'referenceable', "addresses", "skipped_addresses"] # "addressTypes", "sectionNames", "refs" are not required for depth 0, but are required for depth 1 + + optional_param_keys = ["lengths", "addressTypes", "sectionNames", "refs", "includeSections", "startingAddress", "match_templates", "numeric_equivalent", "referenceableSections", "isSegmentLevelDiburHamatchil", "diburHamatchilRegexes", 'referenceable', "addresses", "skipped_addresses", "isMapReferenceable"] # "addressTypes", "sectionNames", "refs" are not required for depth 0, but are required for depth 1 + has_key = False # This is not used as schema for content def get_ref_from_sections(self, sections): From 1ff1690448eff57365761d20001e4c5770005ca1 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 29 Jan 2024 08:50:03 +0200 Subject: [PATCH 085/210] fix(linker): WIP allow perek to be child of parsha node. still need to figure out how to do subref on parsha. --- sefaria/model/linker/referenceable_book_node.py | 9 +++++++-- sefaria/model/linker/tests/linker_test.py | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index 52d477dc5b..36106666e5 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -87,8 +87,13 @@ def _get_all_children(self) -> List[ReferenceableBookNode]: return [NumberedReferenceableBookNode(thingy)] if isinstance(thingy, text.Index): children = thingy.referenceable_children() - elif isinstance(thingy, schema.ArrayMapNode) and self._is_array_map_referenceable(thingy): - return [MapReferenceableBookNode(thingy)] + elif isinstance(thingy, schema.ArrayMapNode): + if self._is_array_map_referenceable(thingy): + return [MapReferenceableBookNode(thingy)] + else: + index = thingy.ref().index + yo = NamedReferenceableBookNode(index) + return yo.get_children() else: # Any other type of TitledTreeNode children = self._titled_tree_node.children diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index c1c20675c6..95a33bb2ee 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -29,6 +29,7 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ + [crrd(["@Parshat Vayikra", "#2", "#3"], lang='en'), ('Leviticus 2:3',)], # Numbered JAs [crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)], [crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)], From 0a00e1d1616b921072ab36eebf3dd99bee6b2738 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 29 Jan 2024 09:28:48 +0200 Subject: [PATCH 086/210] fix(linker): Allow citations like Parshat Vayikra 
2:3 --- .../model/linker/referenceable_book_node.py | 29 ++++++++++++++++--- sefaria/model/linker/tests/linker_test.py | 6 +++- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index 36106666e5..bb09aa0cff 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -9,10 +9,31 @@ def subref(ref: text.Ref, section: int): if ref.index_node.addressTypes[len(ref.sections)-1] == "Talmud": - d = ref._core_dict() - d['sections'][-1] += (section-1) - d['toSections'] = d['sections'][:] - return text.Ref(_obj=d) + return _talmud_subref(ref, section) + elif ref.index.categories == ['Tanakh', 'Torah']: + return _parsha_subref(ref, section) + else: + return ref.subref(section) + + +def _talmud_subref(ref: text.Ref, section: int): + d = ref._core_dict() + d['sections'][-1] += (section-1) + d['toSections'] = d['sections'][:] + return text.Ref(_obj=d) + + +def _parsha_subref(ref: text.Ref, section: int): + parsha_trefs = {n.wholeRef for n in ref.index.get_alt_struct_leaves()} + if ref.normal() in parsha_trefs: + book_subref = text.Ref(ref.index.title).subref(section) + if ref.contains(book_subref): + return book_subref + else: + # section doesn't fall within parsha + # Note, only validates that perek is in parsha range, doesn't check segment level. + # Edge case is Parshat Noach 6:3 + raise InputError else: return ref.subref(section) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 95a33bb2ee..863631d274 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -29,7 +29,6 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ - [crrd(["@Parshat Vayikra", "#2", "#3"], lang='en'), ('Leviticus 2:3',)], # Numbered JAs [crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)], [crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)], @@ -51,6 +50,11 @@ def test_resolved_raw_ref_clone(): [crrd(['@שבת', '#א', '#ב']), ["Mishnah Shabbat 1:2"]], # shouldn't match Shabbat 2a by reversing order of parts [crrd(['@שבת', '#ב', '#א']), ["Shabbat 2a", "Mishnah Shabbat 2:1"]], # ambiguous case + # Parsha -> sections + [crrd(["@Parshat Vayikra", "#2", "#3"], lang='en'), ('Leviticus 2:3',)], + [crrd(["@Parshat Tzav", "#2", "#3"], lang='en'), tuple()], # validate that sections fall within parsha + pytest.param(crrd(["@Parshat Noach", "#6", "#3"], lang='en'), tuple(), marks=pytest.mark.xfail(reason="currently dont check if pasuk/perek pair fall in parsha, only perek")), + # Aliases for perakim [crrd(["@משנה", "@ברכות", "#פרק קמא"]), ("Mishnah Berakhot 1",)], [crrd(["@משנה", "@ברכות", "#פרק בתרא"]), ("Mishnah Berakhot 9",)], From b7973b077c7b189a9deb53e485413d7d37f6ccf0 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 29 Jan 2024 11:35:17 +0200 Subject: [PATCH 087/210] fix(linker): dont reset ibid history every paragraph for bulk resolution --- sefaria/model/linker/linker.py | 3 ++- sefaria/model/linker/ref_resolver.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py index e00b5d726c..f496780dd0 100644 --- a/sefaria/model/linker/linker.py +++ b/sefaria/model/linker/linker.py @@ -40,6 +40,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: 
Optional[List[Optional @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' @return: list of LinkedDocs """ + self._ref_resolver.reset_ibid_history() all_named_entities = self._ner.bulk_recognize(inputs) docs = [] book_context_refs = book_context_refs or [None]*len(all_named_entities) @@ -48,7 +49,7 @@ def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) resolved_refs, resolved_named_entities = [], [] if type_filter in {'all', 'citation'}: - resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness, reset_ibids=False) if type_filter in {'all', 'named entity'}: resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) docs += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index bcc6ab38c9..632dbb22bd 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -254,17 +254,19 @@ def reset_ibid_history(self): self._ibid_history = IbidHistory() def bulk_resolve(self, raw_refs: List[RawRef], book_context_ref: Optional[text.Ref] = None, - with_failures=False, thoroughness=ResolutionThoroughness.NORMAL) -> List[PossiblyAmbigResolvedRef]: + with_failures=False, thoroughness=ResolutionThoroughness.NORMAL, reset_ibids=True) -> List[PossiblyAmbigResolvedRef]: """ Main function for resolving refs in text. Given a list of RawRefs, returns ResolvedRefs for each @param raw_refs: @param book_context_ref: @param with_failures: @param thoroughness: how thorough should the search be. More thorough == slower. 
Currently "normal" will avoid searching for DH matches at book level and avoid filtering empty refs + @param reset_ibids: If true, reset ibid history before resolving @return: """ self._thoroughness = thoroughness - self.reset_ibid_history() + if reset_ibids: + self.reset_ibid_history() resolved = [] for raw_ref in raw_refs: temp_resolved = self._resolve_raw_ref_and_update_ibid_history(raw_ref, book_context_ref, with_failures) @@ -273,12 +275,12 @@ def bulk_resolve(self, raw_refs: List[RawRef], book_context_ref: Optional[text.R def _resolve_raw_ref_and_update_ibid_history(self, raw_ref: RawRef, book_context_ref: text.Ref, with_failures=False) -> List[PossiblyAmbigResolvedRef]: temp_resolved = self.resolve_raw_ref(book_context_ref, raw_ref) - self._update_ibid_history(temp_resolved) + self._update_ibid_history(raw_ref, temp_resolved) if len(temp_resolved) == 0 and with_failures: return [ResolvedRef(raw_ref, [], None, None, context_ref=book_context_ref)] return temp_resolved - def _update_ibid_history(self, temp_resolved: List[PossiblyAmbigResolvedRef]): + def _update_ibid_history(self, raw_ref: RawRef, temp_resolved: List[PossiblyAmbigResolvedRef]): if len(temp_resolved) == 0: self.reset_ibid_history() elif any(r.is_ambiguous for r in temp_resolved): From df270cb82d81bd74ef71ce8f345575cb693fbc26 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 29 Jan 2024 11:54:52 +0200 Subject: [PATCH 088/210] fix(linker): ignore certain term slugs for resetting ibid history --- sefaria/model/linker/ref_resolver.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 632dbb22bd..71869ec842 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -1,5 +1,6 @@ from collections import defaultdict from typing import List, Union, Dict, Optional, Tuple, Iterable, Set +from functools import reduce from enum import IntEnum, Enum from sefaria.system.exceptions import InputError from sefaria.model import abstract as abst @@ -208,12 +209,23 @@ def match_terms(self, ref_parts: List[RawRefPart]) -> List[schema.NonUniqueTerm] class IbidHistory: + ignored_term_slugs = ['torah', 'talmud', 'gemara', 'mishnah', 'midrash'] + def __init__(self, last_n_titles: int = 3, last_n_refs: int = 3): self.last_n_titles = last_n_titles self.last_n_refs = last_n_refs self._last_refs: List[text.Ref] = [] self._last_titles: List[str] = [] self._title_ref_map: Dict[str, text.Ref] = {} + self._ignored_titles: Set[str] = self._get_ignored_titles() + + @classmethod + def _get_ignored_titles(cls) -> Set[str]: + terms = [schema.NonUniqueTerm.init(slug) for slug in cls.ignored_term_slugs] + return reduce(lambda a, b: a | set(b), [term.get_titles() for term in terms], set()) + + def should_ignore_text(self, text) -> bool: + return text in self._ignored_titles def _get_last_refs(self) -> List[text.Ref]: return self._last_refs @@ -281,6 +293,8 @@ def _resolve_raw_ref_and_update_ibid_history(self, raw_ref: RawRef, book_context return temp_resolved def _update_ibid_history(self, raw_ref: RawRef, temp_resolved: List[PossiblyAmbigResolvedRef]): + if self._ibid_history.should_ignore_text(raw_ref.text): + return if len(temp_resolved) == 0: self.reset_ibid_history() elif any(r.is_ambiguous for r in temp_resolved): From b84aeebc5fe5d7f2da1cafedcd546759ad077269 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 11 Feb 2024 09:23:47 +0200 Subject: [PATCH 089/210] fix(linker): add lots of dashes to tokenizer --- 
sefaria/spacy_function_registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/spacy_function_registry.py b/sefaria/spacy_function_registry.py index a1a91bb342..a4a86049c0 100644 --- a/sefaria/spacy_function_registry.py +++ b/sefaria/spacy_function_registry.py @@ -5,7 +5,7 @@ def inner_punct_tokenizer_factory(): def inner_punct_tokenizer(nlp): # infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes) - infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–\-/()<>]''') + infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–—\-‐‑‒־―⸺⸻/()<>]''') prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) From aa7e3ade433300e1bf69cab3178b3e1b01fb47b1 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 27 May 2024 15:49:48 +0300 Subject: [PATCH 090/210] refactor(Version): rearrange the language related attributes of Version: 1. change 'actualLanguage', 'languageFamilyName', 'isSource', 'isPrimary' and 'direction' to be required. 2. change _normalize to set default values for those attributes before saving. 3. remove the attribute 'isBaeText'. --- sefaria/model/text.py | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 9f929e2817..4b3100d2f4 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1286,7 +1286,12 @@ class Version(AbstractTextRecord, abst.AbstractMongoRecord, AbstractSchemaConten "title", # FK to Index.title "versionSource", "versionTitle", - "chapter" # required. change to "content"? + "chapter", # required. change to "content"? + "actualLanguage", # ISO language code + 'languageFamilyName', # full name of the language, but without specificity (for Judeo Arabic actualLanguage=jrb, languageFamilyName=arabic + 'isSource', # bool, True if this version is not a translation + 'isPrimary', # bool, True if we see it as a primary version (usually equals to isSource, but Hebrew Kuzarif or example is primary but not source) + 'direction', # 'rtl' or 'ltr' ] """ @@ -1312,12 +1317,6 @@ class Version(AbstractTextRecord, abst.AbstractMongoRecord, AbstractSchemaConten "purchaseInformationImage", "purchaseInformationURL", "hasManuallyWrappedRefs", # true for texts where refs were manually wrapped in a-tags. no need to run linker at run-time. 
- "actualLanguage", # ISO language code - 'languageFamilyName', # full name of the language, but without specificity (for Judeo Arabic actualLanguage=jrb, languageFamilyName=arabic - "isBaseText", # should be deprecated (needs some changes on client side) - 'isSource', # bool, True if this version is not a translation - 'isPrimary', # bool, True if we see it as a primary version (usually equals to isSource, but Hebrew Kuzarif or example is primary but not source) - 'direction', # 'rtl' or 'ltr' ] def __str__(self): @@ -1332,14 +1331,6 @@ def _validate(self): Old style database text record have a field called 'chapter' Version records in the wild have a field called 'text', and not always a field called 'chapter' """ - languageCodeRe = re.search(r"\[([a-z]{2})\]$", getattr(self, "versionTitle", None)) - if languageCodeRe and languageCodeRe.group(1) != getattr(self,"actualLanguage",None): - self.actualLanguage = languageCodeRe.group(1) - if not getattr(self, 'languageFamilyName', None): - try: - self.languageFamilyName = constants.LANGUAGE_CODES[self.actualLanguage] - except KeyError: - self.languageFamilyName = constants.LANGUAGE_CODES[self.language] if getattr(self,"language", None) not in ["en", "he"]: raise InputError("Version language must be either 'en' or 'he'") index = self.get_index() @@ -1375,14 +1366,28 @@ def _check_node_offsets(self, content, node): def _normalize(self): # add actualLanguage -- TODO: migration to get rid of bracket notation completely - actualLanguage = getattr(self, "actualLanguage", None) - versionTitle = getattr(self, "versionTitle", None) + actualLanguage = getattr(self, "actualLanguage", None) + versionTitle = getattr(self, "versionTitle", None) if not actualLanguage and versionTitle: languageCode = re.search(r"\[([a-z]{2})\]$", versionTitle) if languageCode and languageCode.group(1): - self.actualLanguage = languageCode.group(1) + actualLanguage = languageCode.group(1) + if not actualLanguage: + actualLanguage = self.language + self.actualLanguage = actualLanguage + + if not getattr(self, 'languageFamilyName', None): + try: + self.languageFamilyName = constants.LANGUAGE_CODES[self.actualLanguage] + except KeyError: + self.languageFamilyName = constants.LANGUAGE_CODES[self.language] + + self.isSource = getattr(self, "isSource", False) + if not getattr(self, "isPrimary", False): + if self.isSource or not VersionSet({'title': self.title}): + self.isPrimary = True else: - self.actualLanguage = self.language + self.isPrimary = False if not getattr(self, 'direction', None): self.direction = 'rtl' if self.language == 'he' else 'ltr' From 08fd2905b92c2208fffcb3f374066b877e8be503 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Tue, 28 May 2024 11:21:43 +0300 Subject: [PATCH 091/210] docs(api): remove isBaseText from API documentation. 
--- docs/openAPI.json | 102 ++++++++++------------------------------------ 1 file changed, 21 insertions(+), 81 deletions(-) diff --git a/docs/openAPI.json b/docs/openAPI.json index 7816c243bd..fc374ba44d 100644 --- a/docs/openAPI.json +++ b/docs/openAPI.json @@ -59,7 +59,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -91,7 +90,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -121,9 +119,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -151,9 +148,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -181,7 +177,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -211,7 +206,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -241,7 +235,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -271,9 +264,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -301,9 +293,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -331,9 +322,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -361,9 +351,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -391,7 +380,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -421,7 +409,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -451,9 +438,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fr", "languageFamilyName": "french", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -481,7 +467,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -511,9 +496,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", 
"languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -541,9 +525,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "pl", "languageFamilyName": "polish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -571,9 +554,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "it", "languageFamilyName": "italian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -601,9 +583,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fa", "languageFamilyName": "persian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "rtl", "language": "he", "title": "Esther", @@ -631,9 +612,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fa", "languageFamilyName": "persian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -661,9 +641,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "yi", "languageFamilyName": "yiddish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "rtl", "language": "he", "title": "Esther", @@ -691,9 +670,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "es", "languageFamilyName": "spanish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -721,9 +699,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fr", "languageFamilyName": "french", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -751,9 +728,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "eo", "languageFamilyName": "esperanto", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -781,9 +757,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "de", "languageFamilyName": "german", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -811,7 +786,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": "", "isSource": true, "isPrimary": true, "direction": "rtl", @@ -841,9 +815,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": "", "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -1374,7 +1347,6 @@ "purchaseInformationURL": "https://www.israelbookshoppublications.com/store/pc/Metsudah-Five-Megilloth-w-Rashi-26p309.htm", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true, "firstSectionRef": "Rashi on Ecclesiastes 1:1" }, { @@ -1395,7 +1367,6 @@ "purchaseInformationURL": "https://www.israelbookshoppublications.com/store/pc/Metsudah-Five-Megilloth-w-Rashi-26p309.htm", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 1:1" }, { @@ -1416,7 +1387,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true, "firstSectionRef": "Rashi on Ecclesiastes 
1:1" }, { @@ -1437,7 +1407,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 3:3" }, { @@ -1458,7 +1427,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "Abraham Cohen, Soncino Press, 1946", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 1:1" } ] @@ -5641,9 +5609,6 @@ "shortVersionTitleInHebrew": { "type": "string" }, - "isBaseText": { - "type": "boolean" - }, "firstSectionRef": { "type": "string" } @@ -5666,7 +5631,6 @@ "purchaseInformationURL": "https://jps.org/books/contemporary-torah/", "shortVersionTitle": "The Contemporary Torah, JPS, 2006", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Genesis 1" } }, @@ -6697,7 +6661,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true }, { "title": "Mishnah Peah", @@ -6717,7 +6680,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true }, { "title": "Mishnah Peah", @@ -6737,7 +6699,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "Dr. Joshua Kulp", "shortVersionTitleInHebrew": "", - "isBaseText": false }, { "title": "Mishnah Peah", @@ -6757,7 +6718,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false }, { "title": "Mishnah Peah", @@ -6777,7 +6737,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "Lazarus Goldschmidt, 1929 ", "shortVersionTitleInHebrew": "", - "isBaseText": false }, { "title": "Mishnah Peah", @@ -6797,7 +6756,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false }, { "title": "Mishnah Peah", @@ -6817,7 +6775,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false }, { "title": "Mishnah Peah", @@ -6837,7 +6794,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": "" } ], "textDepth": 2, @@ -7378,11 +7334,6 @@ "description": "A Hebrew short title for the version", "type": "string" }, - "isBaseText": { - "description": "Is this a text that has commentary. `true` indicates that there are commentaries on this text, while `false` indicates that the text does not have any commentary. ", - "type": "boolean", - "example": "True" - }, "firstSectionRef": { "description": "The first `ref` for this title on a section level (as opposed to the more granular segment level). For example, `Genesis 1` would be the first section level ref of Genesis (as opposed to the segment, `Genesis 1:1`), and `Rashi on Kohelet 1:1` is the first section level ref of `Rashi on Kohelet` (as opposed to the segment level of `Rashi on Kohelet 1:1:1`)", "type": "string", @@ -7407,7 +7358,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Shulchan Arukh, Orach Chayim 1" } }, @@ -9089,7 +9039,7 @@ "items": { "$ref": "#/components/schemas/v3TextVersionsJSON" }, - "example": "{\n\"status\": \"locked\",\n\"priority\": 2,\n\"license\": \"CC-BY-SA\",\n\"versionNotes\": \"Miqra According to the Masorah (MAM) is a digital Hebrew edition of the Tanakh based on the Aleppo Codex and related manuscripts. It is designed for readers, and as such it contains added elements to aid vocalization of the text. 
For instance: When an accent is marked in an unstressed syllable, an extra accent is added in the proper place (pashta, zarqa, segol, telisha). Legarmeih and paseq are visibly distinguished. Qamaz qatan is indicated by its designated Unicode character (alternatives are documented where traditions differ about its application).
The text of MAM is fully documented. The complete introduction to the edition (Hebrew) explains the types of editorial decisions that have been made and the reasons for them (English abstract). In addition, every word in the Bible about which there is some textual concern or ambiguity includes a documentation note; these notes can be viewed conveniently here. If an error is discovered, it may be reported to User:Dovi at Hebrew Wikisource. Please check the documentation notes before reporting an error.\",\n\"formatAsPoetry\": \"\",\n\"digitizedBySefaria\": \"\",\n\"method\": \"\",\n\"heversionSource\": \"\",\n\"versionUrl\": \"\",\n\"versionTitleInHebrew\": \"מקרא על פי המסורה\",\n\"versionNotesInHebrew\": \"\",\n\"shortVersionTitle\": \"\",\n\"shortVersionTitleInHebrew\": \"\",\n\"extendedNotes\": \"\",\n\"extendedNotesHebrew\": \"\",\n\"purchaseInformationImage\": \"\",\n\"purchaseInformationURL\": \"\",\n\"hasManuallyWrappedRefs\": \"\",\n\"actualLanguage\": \"he\",\n\"languageFamilyName\": \"hebrew\",\n\"isBaseText\": true,\n\"isSource\": true,\n\"isPrimary\": true,\n\"direction\": \"rtl\",\n\"language\": \"he\",\n\"versionSource\": \"https://he.wikisource.org/wiki/%D7%9E%D7%A9%D7%AA%D7%9E%D7%A9:Dovi/%D7%9E%D7%A7%D7%A8%D7%90_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%94%D7%9E%D7%A1%D7%95%D7%A8%D7%94\",\n\"versionTitle\": \"Miqra according to the Masorah\",\n\"text\": [\n[\n\"חֲז֖וֹן עֹֽבַדְיָ֑ה כֹּֽה־אָמַר֩ אֲדֹנָ֨י יֱהֹוִ֜ה לֶאֱד֗וֹם שְׁמוּעָ֨ה שָׁמַ֜עְנוּ מֵאֵ֤ת יְהֹוָה֙ וְצִיר֙ בַּגּוֹיִ֣ם שֻׁלָּ֔ח ק֛וּמוּ וְנָק֥וּמָה עָלֶ֖יהָ לַמִּלְחָמָֽה׃\",\n\"הִנֵּ֥ה קָטֹ֛ן נְתַתִּ֖יךָ בַּגּוֹיִ֑ם בָּז֥וּי אַתָּ֖ה מְאֹֽד׃\",\n\"זְד֤וֹן לִבְּךָ֙ הִשִּׁיאֶ֔ךָ שֹׁכְנִ֥י בְחַגְוֵי־סֶ֖לַע מְר֣וֹם שִׁבְתּ֑וֹ אֹמֵ֣ר בְּלִבּ֔וֹ מִ֥י יוֹרִדֵ֖נִי אָֽרֶץ׃\",\n\"אִם־תַּגְבִּ֣יהַּ כַּנֶּ֔שֶׁר וְאִם־בֵּ֥ין כּֽוֹכָבִ֖ים שִׂ֣ים קִנֶּ֑ךָ מִשָּׁ֥ם אוֹרִֽידְךָ֖ נְאֻם־יְהֹוָֽה׃\",\n\"אִם־גַּנָּבִ֤ים בָּאֽוּ־לְךָ֙ אִם־שׁ֣וֹדְדֵי לַ֔יְלָה אֵ֣יךְ נִדְמֵ֔יתָה הֲל֥וֹא יִגְנְב֖וּ דַּיָּ֑ם אִם־בֹּֽצְרִים֙ בָּ֣אוּ לָ֔ךְ הֲל֖וֹא יַשְׁאִ֥ירוּ עֹלֵלֽוֹת׃\",\n\"אֵ֚יךְ נֶחְפְּשׂ֣וּ עֵשָׂ֔ו נִבְע֖וּ מַצְפֻּנָֽיו׃\",\n\"עַֽד־הַגְּב֣וּל שִׁלְּח֗וּךָ כֹּ֚ל אַנְשֵׁ֣י בְרִיתֶ֔ךָ הִשִּׁיא֛וּךָ יָכְל֥וּ לְךָ֖ אַנְשֵׁ֣י שְׁלֹמֶ֑ךָ לַחְמְךָ֗ יָשִׂ֤ימוּ מָזוֹר֙ תַּחְתֶּ֔יךָ אֵ֥ין תְּבוּנָ֖ה בּֽוֹ׃\",\n\"הֲל֛וֹא בַּיּ֥וֹם הַה֖וּא נְאֻם־יְהֹוָ֑ה וְהַאֲבַדְתִּ֤י חֲכָמִים֙ מֵֽאֱד֔וֹם וּתְבוּנָ֖ה מֵהַ֥ר עֵשָֽׂו׃\",\n\"וְחַתּ֥וּ גִבּוֹרֶ֖יךָ תֵּימָ֑ן לְמַ֧עַן יִכָּֽרֶת־אִ֛ישׁ מֵהַ֥ר עֵשָׂ֖ו מִקָּֽטֶל׃\",\n\"מֵחֲמַ֛ס אָחִ֥יךָ יַעֲקֹ֖ב תְּכַסְּךָ֣ בוּשָׁ֑ה וְנִכְרַ֖תָּ לְעוֹלָֽם׃\",\n\"בְּיוֹם֙ עֲמׇֽדְךָ֣ מִנֶּ֔גֶד בְּי֛וֹם שְׁב֥וֹת זָרִ֖ים חֵיל֑וֹ וְנׇכְרִ֞ים בָּ֣אוּ שְׁעָרָ֗ו וְעַל־יְרוּשָׁלַ֙͏ִם֙ יַדּ֣וּ גוֹרָ֔ל גַּם־אַתָּ֖ה כְּאַחַ֥ד מֵהֶֽם׃\",\n\"וְאַל־תֵּ֤רֶא בְיוֹם־אָחִ֙יךָ֙ בְּי֣וֹם נׇכְר֔וֹ וְאַל־תִּשְׂמַ֥ח לִבְנֵֽי־יְהוּדָ֖ה בְּי֣וֹם אׇבְדָ֑ם וְאַל־תַּגְדֵּ֥ל פִּ֖יךָ בְּי֥וֹם צָרָֽה׃\",\n\"אַל־תָּב֤וֹא בְשַֽׁעַר־עַמִּי֙ בְּי֣וֹם אֵידָ֔ם אַל־תֵּ֧רֶא גַם־אַתָּ֛ה בְּרָעָת֖וֹ בְּי֣וֹם אֵיד֑וֹ וְאַל־תִּשְׁלַ֥חְנָה בְחֵיל֖וֹ בְּי֥וֹם אֵידֽוֹ׃\",\n\"וְאַֽל־תַּעֲמֹד֙ עַל־הַפֶּ֔רֶק לְהַכְרִ֖ית אֶת־פְּלִיטָ֑יו וְאַל־תַּסְגֵּ֥ר שְׂרִידָ֖יו בְּי֥וֹם צָרָֽה׃\",\n\"כִּֽי־קָר֥וֹב יוֹם־יְהֹוָ֖ה עַל־כׇּל־הַגּוֹיִ֑ם כַּאֲשֶׁ֤ר עָשִׂ֙יתָ֙ יֵעָ֣שֶׂה לָּ֔ךְ גְּמֻלְךָ֖ יָשׁ֥וּב בְּרֹאשֶֽׁךָ׃\",\n\"כִּ֗י כַּֽאֲשֶׁ֤ר שְׁתִיתֶם֙ עַל־הַ֣ר קׇדְשִׁ֔י יִשְׁתּ֥וּ כׇֽל־הַגּוֹיִ֖ם תָּמִ֑יד וְשָׁת֣וּ וְלָע֔וּ וְהָי֖וּ כְּל֥וֹא הָיֽוּ׃\",\n\"וּבְהַ֥ר צִיּ֛וֹן תִּהְיֶ֥ה פְלֵיטָ֖ה וְהָ֣יָה קֹ֑דֶשׁ וְיָֽרְשׁוּ֙ בֵּ֣ית יַֽעֲקֹ֔ב אֵ֖ת מוֹרָֽשֵׁיהֶֽם׃\",\n\"וְהָיָה֩ בֵית־יַעֲקֹ֨ב אֵ֜שׁ 
וּבֵ֧ית יוֹסֵ֣ף לֶהָבָ֗ה וּבֵ֤ית עֵשָׂו֙ לְקַ֔שׁ וְדָלְק֥וּ בָהֶ֖ם וַאֲכָל֑וּם וְלֹֽא־יִֽהְיֶ֤ה שָׂרִיד֙ לְבֵ֣ית עֵשָׂ֔ו כִּ֥י יְהֹוָ֖ה דִּבֵּֽר׃\",\n\"וְיָרְשׁ֨וּ הַנֶּ֜גֶב אֶת־הַ֣ר עֵשָׂ֗ו וְהַשְּׁפֵלָה֙ אֶת־פְּלִשְׁתִּ֔ים וְיָרְשׁוּ֙ אֶת־שְׂדֵ֣ה אֶפְרַ֔יִם וְאֵ֖ת שְׂדֵ֣ה שֹׁמְר֑וֹן וּבִנְיָמִ֖ן אֶת־הַגִּלְעָֽד׃\",\n\"וְגָלֻ֣ת הַֽחֵל־הַ֠זֶּ֠ה לִבְנֵ֨י יִשְׂרָאֵ֤ל אֲשֶֽׁר־כְּנַעֲנִים֙ עַד־צָ֣רְפַ֔ת וְגָלֻ֥ת יְרוּשָׁלַ֖͏ִם אֲשֶׁ֣ר בִּסְפָרַ֑ד יִֽרְשׁ֕וּ אֵ֖ת עָרֵ֥י הַנֶּֽגֶב׃\",\n\"וְעָל֤וּ מֽוֹשִׁעִים֙ בְּהַ֣ר צִיּ֔וֹן לִשְׁפֹּ֖ט אֶת־הַ֣ר עֵשָׂ֑ו וְהָיְתָ֥ה לַֽיהֹוָ֖ה הַמְּלוּכָֽה׃\"\n]\n],\n\"firstSectionRef\": \"Obadiah 1\"}\n" + "example": "{\n\"status\": \"locked\",\n\"priority\": 2,\n\"license\": \"CC-BY-SA\",\n\"versionNotes\": \"Miqra According to the Masorah (MAM) is a digital Hebrew edition of the Tanakh based on the Aleppo Codex and related manuscripts. It is designed for readers, and as such it contains added elements to aid vocalization of the text. For instance: When an accent is marked in an unstressed syllable, an extra accent is added in the proper place (pashta, zarqa, segol, telisha). Legarmeih and paseq are visibly distinguished. Qamaz qatan is indicated by its designated Unicode character (alternatives are documented where traditions differ about its application).
The text of MAM is fully documented. The complete introduction to the edition (Hebrew) explains the types of editorial decisions that have been made and the reasons for them (English abstract). In addition, every word in the Bible about which there is some textual concern or ambiguity includes a documentation note; these notes can be viewed conveniently here. If an error is discovered, it may be reported to User:Dovi at Hebrew Wikisource. Please check the documentation notes before reporting an error.\",\n\"formatAsPoetry\": \"\",\n\"digitizedBySefaria\": \"\",\n\"method\": \"\",\n\"heversionSource\": \"\",\n\"versionUrl\": \"\",\n\"versionTitleInHebrew\": \"מקרא על פי המסורה\",\n\"versionNotesInHebrew\": \"\",\n\"shortVersionTitle\": \"\",\n\"shortVersionTitleInHebrew\": \"\",\n\"extendedNotes\": \"\",\n\"extendedNotesHebrew\": \"\",\n\"purchaseInformationImage\": \"\",\n\"purchaseInformationURL\": \"\",\n\"hasManuallyWrappedRefs\": \"\",\n\"actualLanguage\": \"he\",\n\"languageFamilyName\": \"hebrew\",\n\"isSource\": true,\n\"isPrimary\": true,\n\"direction\": \"rtl\",\n\"language\": \"he\",\n\"versionSource\": \"https://he.wikisource.org/wiki/%D7%9E%D7%A9%D7%AA%D7%9E%D7%A9:Dovi/%D7%9E%D7%A7%D7%A8%D7%90_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%94%D7%9E%D7%A1%D7%95%D7%A8%D7%94\",\n\"versionTitle\": \"Miqra according to the Masorah\",\n\"text\": [\n[\n\"חֲז֖וֹן עֹֽבַדְיָ֑ה כֹּֽה־אָמַר֩ אֲדֹנָ֨י יֱהֹוִ֜ה לֶאֱד֗וֹם שְׁמוּעָ֨ה שָׁמַ֜עְנוּ מֵאֵ֤ת יְהֹוָה֙ וְצִיר֙ בַּגּוֹיִ֣ם שֻׁלָּ֔ח ק֛וּמוּ וְנָק֥וּמָה עָלֶ֖יהָ לַמִּלְחָמָֽה׃\",\n\"הִנֵּ֥ה קָטֹ֛ן נְתַתִּ֖יךָ בַּגּוֹיִ֑ם בָּז֥וּי אַתָּ֖ה מְאֹֽד׃\",\n\"זְד֤וֹן לִבְּךָ֙ הִשִּׁיאֶ֔ךָ שֹׁכְנִ֥י בְחַגְוֵי־סֶ֖לַע מְר֣וֹם שִׁבְתּ֑וֹ אֹמֵ֣ר בְּלִבּ֔וֹ מִ֥י יוֹרִדֵ֖נִי אָֽרֶץ׃\",\n\"אִם־תַּגְבִּ֣יהַּ כַּנֶּ֔שֶׁר וְאִם־בֵּ֥ין כּֽוֹכָבִ֖ים שִׂ֣ים קִנֶּ֑ךָ מִשָּׁ֥ם אוֹרִֽידְךָ֖ נְאֻם־יְהֹוָֽה׃\",\n\"אִם־גַּנָּבִ֤ים בָּאֽוּ־לְךָ֙ אִם־שׁ֣וֹדְדֵי לַ֔יְלָה אֵ֣יךְ נִדְמֵ֔יתָה הֲל֥וֹא יִגְנְב֖וּ דַּיָּ֑ם אִם־בֹּֽצְרִים֙ בָּ֣אוּ לָ֔ךְ הֲל֖וֹא יַשְׁאִ֥ירוּ עֹלֵלֽוֹת׃\",\n\"אֵ֚יךְ נֶחְפְּשׂ֣וּ עֵשָׂ֔ו נִבְע֖וּ מַצְפֻּנָֽיו׃\",\n\"עַֽד־הַגְּב֣וּל שִׁלְּח֗וּךָ כֹּ֚ל אַנְשֵׁ֣י בְרִיתֶ֔ךָ הִשִּׁיא֛וּךָ יָכְל֥וּ לְךָ֖ אַנְשֵׁ֣י שְׁלֹמֶ֑ךָ לַחְמְךָ֗ יָשִׂ֤ימוּ מָזוֹר֙ תַּחְתֶּ֔יךָ אֵ֥ין תְּבוּנָ֖ה בּֽוֹ׃\",\n\"הֲל֛וֹא בַּיּ֥וֹם הַה֖וּא נְאֻם־יְהֹוָ֑ה וְהַאֲבַדְתִּ֤י חֲכָמִים֙ מֵֽאֱד֔וֹם וּתְבוּנָ֖ה מֵהַ֥ר עֵשָֽׂו׃\",\n\"וְחַתּ֥וּ גִבּוֹרֶ֖יךָ תֵּימָ֑ן לְמַ֧עַן יִכָּֽרֶת־אִ֛ישׁ מֵהַ֥ר עֵשָׂ֖ו מִקָּֽטֶל׃\",\n\"מֵחֲמַ֛ס אָחִ֥יךָ יַעֲקֹ֖ב תְּכַסְּךָ֣ בוּשָׁ֑ה וְנִכְרַ֖תָּ לְעוֹלָֽם׃\",\n\"בְּיוֹם֙ עֲמׇֽדְךָ֣ מִנֶּ֔גֶד בְּי֛וֹם שְׁב֥וֹת זָרִ֖ים חֵיל֑וֹ וְנׇכְרִ֞ים בָּ֣אוּ שְׁעָרָ֗ו וְעַל־יְרוּשָׁלַ֙͏ִם֙ יַדּ֣וּ גוֹרָ֔ל גַּם־אַתָּ֖ה כְּאַחַ֥ד מֵהֶֽם׃\",\n\"וְאַל־תֵּ֤רֶא בְיוֹם־אָחִ֙יךָ֙ בְּי֣וֹם נׇכְר֔וֹ וְאַל־תִּשְׂמַ֥ח לִבְנֵֽי־יְהוּדָ֖ה בְּי֣וֹם אׇבְדָ֑ם וְאַל־תַּגְדֵּ֥ל פִּ֖יךָ בְּי֥וֹם צָרָֽה׃\",\n\"אַל־תָּב֤וֹא בְשַֽׁעַר־עַמִּי֙ בְּי֣וֹם אֵידָ֔ם אַל־תֵּ֧רֶא גַם־אַתָּ֛ה בְּרָעָת֖וֹ בְּי֣וֹם אֵיד֑וֹ וְאַל־תִּשְׁלַ֥חְנָה בְחֵיל֖וֹ בְּי֥וֹם אֵידֽוֹ׃\",\n\"וְאַֽל־תַּעֲמֹד֙ עַל־הַפֶּ֔רֶק לְהַכְרִ֖ית אֶת־פְּלִיטָ֑יו וְאַל־תַּסְגֵּ֥ר שְׂרִידָ֖יו בְּי֥וֹם צָרָֽה׃\",\n\"כִּֽי־קָר֥וֹב יוֹם־יְהֹוָ֖ה עַל־כׇּל־הַגּוֹיִ֑ם כַּאֲשֶׁ֤ר עָשִׂ֙יתָ֙ יֵעָ֣שֶׂה לָּ֔ךְ גְּמֻלְךָ֖ יָשׁ֥וּב בְּרֹאשֶֽׁךָ׃\",\n\"כִּ֗י כַּֽאֲשֶׁ֤ר שְׁתִיתֶם֙ עַל־הַ֣ר קׇדְשִׁ֔י יִשְׁתּ֥וּ כׇֽל־הַגּוֹיִ֖ם תָּמִ֑יד וְשָׁת֣וּ וְלָע֔וּ וְהָי֖וּ כְּל֥וֹא הָיֽוּ׃\",\n\"וּבְהַ֥ר צִיּ֛וֹן תִּהְיֶ֥ה פְלֵיטָ֖ה וְהָ֣יָה קֹ֑דֶשׁ וְיָֽרְשׁוּ֙ בֵּ֣ית יַֽעֲקֹ֔ב אֵ֖ת מוֹרָֽשֵׁיהֶֽם׃\",\n\"וְהָיָה֩ בֵית־יַעֲקֹ֨ב אֵ֜שׁ וּבֵ֧ית יוֹסֵ֣ף לֶהָבָ֗ה 
וּבֵ֤ית עֵשָׂו֙ לְקַ֔שׁ וְדָלְק֥וּ בָהֶ֖ם וַאֲכָל֑וּם וְלֹֽא־יִֽהְיֶ֤ה שָׂרִיד֙ לְבֵ֣ית עֵשָׂ֔ו כִּ֥י יְהֹוָ֖ה דִּבֵּֽר׃\",\n\"וְיָרְשׁ֨וּ הַנֶּ֜גֶב אֶת־הַ֣ר עֵשָׂ֗ו וְהַשְּׁפֵלָה֙ אֶת־פְּלִשְׁתִּ֔ים וְיָרְשׁוּ֙ אֶת־שְׂדֵ֣ה אֶפְרַ֔יִם וְאֵ֖ת שְׂדֵ֣ה שֹׁמְר֑וֹן וּבִנְיָמִ֖ן אֶת־הַגִּלְעָֽד׃\",\n\"וְגָלֻ֣ת הַֽחֵל־הַ֠זֶּ֠ה לִבְנֵ֨י יִשְׂרָאֵ֤ל אֲשֶֽׁר־כְּנַעֲנִים֙ עַד־צָ֣רְפַ֔ת וְגָלֻ֥ת יְרוּשָׁלַ֖͏ִם אֲשֶׁ֣ר בִּסְפָרַ֑ד יִֽרְשׁ֕וּ אֵ֖ת עָרֵ֥י הַנֶּֽגֶב׃\",\n\"וְעָל֤וּ מֽוֹשִׁעִים֙ בְּהַ֣ר צִיּ֔וֹן לִשְׁפֹּ֖ט אֶת־הַ֣ר עֵשָׂ֑ו וְהָיְתָ֥ה לַֽיהֹוָ֖ה הַמְּלוּכָֽה׃\"\n]\n],\n\"firstSectionRef\": \"Obadiah 1\"}\n" }, "available_versions": { "type": "array", @@ -9274,7 +9224,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -9419,14 +9368,11 @@ "description": "The overarching family for the specific language detailed in `actualLanguage`. For example, `Arabic` would be the overarching `languageFamily` for `judeo-arabic`.", "type": "string" }, - "isBaseText": { - "type": "string" - }, "isSource": { "type": "boolean" }, "isPrimary": { - "type": "string" + "type": "boolean" }, "direction": { "type": "string" @@ -9472,9 +9418,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": "", "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Obadiah", @@ -9564,10 +9509,6 @@ "description": "The overarching family for the specific language detailed in `actualLanguage`. For example, `Arabic` would be the overarching family for `judeo-arabic`. ", "type": "string" }, - "isBaseText": { - "description": "Indicates whether or not this text is the base for a commentary (i.e. `Genesis` is a base text for `Ramban on Genesis`)", - "type": "boolean" - }, "isSource": { "type": "boolean" }, @@ -9629,7 +9570,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", From 83b51c32beec999eb98825ab7afb2229143fc310 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 29 May 2024 13:50:23 +0300 Subject: [PATCH 092/210] feat(reader options): first pass at setting tooltip --- static/js/Misc.jsx | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index eec2762407..e42efb7062 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1347,8 +1347,11 @@ class CloseButton extends Component { class DisplaySettingsButton extends Component { render() { - let style = this.props.placeholder ? {visibility: "hidden"} : {}; + // let style = this.props.placeholder ? {visibility: "hidden"} : {}; let icon; + let style = this.props.placeholder ? 
{visibility: "hidden"} : {}; + const altText = "Text display options"; + const classes = classNames({saveButton: 1, "tooltip-toggle": true}); if (Sefaria._siteSettings.TORAH_SPECIFIC) { icon = @@ -1359,17 +1362,22 @@ class DisplaySettingsButton extends Component { } else { icon = Aa; } - return ( - {icon} - ); + return ( + + + {icon} + + + + ); } } DisplaySettingsButton.propTypes = { From 0006e4f971c4948226a492849dd83cb56f8d9661 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 29 May 2024 17:21:22 +0300 Subject: [PATCH 093/210] feat(reader header): tooltip altText for hebrew interface --- static/js/Misc.jsx | 5 ++--- static/js/sefaria/strings.js | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index e42efb7062..6776233aea 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1347,10 +1347,9 @@ class CloseButton extends Component { class DisplaySettingsButton extends Component { render() { - // let style = this.props.placeholder ? {visibility: "hidden"} : {}; - let icon; let style = this.props.placeholder ? {visibility: "hidden"} : {}; - const altText = "Text display options"; + let icon; + const altText = Sefaria._('Text display options') const classes = classNames({saveButton: 1, "tooltip-toggle": true}); if (Sefaria._siteSettings.TORAH_SPECIFIC) { diff --git a/static/js/sefaria/strings.js b/static/js/sefaria/strings.js index 48daedbfc6..323247e295 100644 --- a/static/js/sefaria/strings.js +++ b/static/js/sefaria/strings.js @@ -273,6 +273,7 @@ const Strings = { "Location: ": "מיקום: ", "Translations": "תרגומים", "Uncategorized": "לא מסווג", + "Text display options": "אפשרויות תצוגת טקסט", // Collections "Collections": "אסופות", From cf2c53ab4d12081faed6c0d67dd42b3e0c0783c2 Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Thu, 30 May 2024 09:40:10 +0200 Subject: [PATCH 094/210] ci: add visibility of continuous tests to merge_queue trigger --- .github/workflows/continuous.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/continuous.yaml b/.github/workflows/continuous.yaml index d9821422be..41c3b8fa70 100644 --- a/.github/workflows/continuous.yaml +++ b/.github/workflows/continuous.yaml @@ -1,6 +1,7 @@ name: Continuous on: pull_request: + merge_group: concurrency: group: ${{ github.ref }} @@ -8,6 +9,7 @@ concurrency: jobs: build-generic: + if: ${{ github.event_name == 'pull_request' }} name: "Continuous Image Build" permissions: contents: 'read' @@ -76,6 +78,7 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-derived: + if: ${{ github.event_name == 'pull_request' }} name: "Continuous Image Build Stage 2" runs-on: ubuntu-latest permissions: From 141924606975ffd60fb95ac9761cef9771dffa43 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Thu, 30 May 2024 11:22:57 +0300 Subject: [PATCH 095/210] feat(reader header): design polish tooltip altText for hebrew interface --- static/css/s2.css | 4 ++++ static/js/Misc.jsx | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/static/css/s2.css b/static/css/s2.css index 7fed6d7eb2..0f7b7d67bd 100644 --- a/static/css/s2.css +++ b/static/css/s2.css @@ -4992,6 +4992,10 @@ body .ui-autocomplete.dictionary-toc-autocomplete .ui-menu-item a.ui-state-focus .readerOptions .int-he img { height: 18px; } +.rightButtons .readerOptionsTooltip.tooltip-toggle::before { + top: 47px; + left: -50px; +} .rightButtons .readerOptions { vertical-align: middle; } diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index 
6776233aea..d194a87220 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1350,7 +1350,7 @@ class DisplaySettingsButton extends Component { let style = this.props.placeholder ? {visibility: "hidden"} : {}; let icon; const altText = Sefaria._('Text display options') - const classes = classNames({saveButton: 1, "tooltip-toggle": true}); + const classes = classNames({readerOptionsTooltip: 1, "tooltip-toggle": true}); if (Sefaria._siteSettings.TORAH_SPECIFIC) { icon = From 6faae1e92f428c05daae64d256b891f2e055dd2e Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 3 Jun 2024 09:07:16 +0300 Subject: [PATCH 096/210] chore: remove final commas in json. --- docs/openAPI.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/openAPI.json b/docs/openAPI.json index fc374ba44d..fe97c22686 100644 --- a/docs/openAPI.json +++ b/docs/openAPI.json @@ -6660,7 +6660,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6679,7 +6679,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6698,7 +6698,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "Dr. Joshua Kulp", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6717,7 +6717,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6736,7 +6736,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "Lazarus Goldschmidt, 1929 ", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6755,7 +6755,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6774,7 +6774,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6793,7 +6793,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", + "shortVersionTitleInHebrew": "" } ], "textDepth": 2, From 949a2e853ce96ea1dbe6f63826686c0003367f1e Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 3 Jun 2024 09:38:10 +0300 Subject: [PATCH 097/210] refactor(version normalization): shorter conditions for default values of languageFamilyName and isPrimary. 
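For illustration, a minimal self-contained sketch of the fallback pattern this commit adopts for languageFamilyName. The LANGUAGE_CODES dict and pick_language_family helper below are stand-ins invented for the example, not Sefaria's real constants module or Version model:

    # Stand-in mapping; the real table lives in sefaria's constants.
    LANGUAGE_CODES = {"he": "hebrew", "en": "english", "fr": "french"}

    def pick_language_family(actual_language, base_language):
        # dict.get(...) or dict[...] collapses the earlier try/except KeyError:
        # an unknown actualLanguage code falls back to the base language's family.
        return LANGUAGE_CODES.get(actual_language) or LANGUAGE_CODES[base_language]

    print(pick_language_family("fr", "en"))   # -> "french"
    print(pick_language_family("jrb", "he"))  # -> "hebrew" (code missing from the stand-in map, falls back)

The isPrimary default follows the same shape: when the attribute is missing, the version is treated as primary if it is a source text or if it is the first version saved for its title.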
--- sefaria/model/text.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 4b3100d2f4..e5f3fb8540 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1376,20 +1376,12 @@ def _normalize(self): actualLanguage = self.language self.actualLanguage = actualLanguage - if not getattr(self, 'languageFamilyName', None): - try: - self.languageFamilyName = constants.LANGUAGE_CODES[self.actualLanguage] - except KeyError: - self.languageFamilyName = constants.LANGUAGE_CODES[self.language] - + if not hasattr(self, 'languageFamilyName'): + self.languageFamilyName = constants.LANGUAGE_CODES.get(self.actualLanguage) or constants.LANGUAGE_CODES[self.language] self.isSource = getattr(self, "isSource", False) - if not getattr(self, "isPrimary", False): - if self.isSource or not VersionSet({'title': self.title}): - self.isPrimary = True - else: - self.isPrimary = False - - if not getattr(self, 'direction', None): + if not hasattr(self, "isPrimary"): + self.isPrimary = self.isSource or not VersionSet({'title': self.title}) #first version is primary + if not hasattr(self, 'direction'): self.direction = 'rtl' if self.language == 'he' else 'ltr' if getattr(self, "priority", None): From 1bfebe8fb74cce25727069ee439294a61544614f Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Thu, 30 May 2024 09:52:22 +0200 Subject: [PATCH 098/210] ci: fix logic for image build on draft PR --- .github/workflows/continuous.yaml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/continuous.yaml b/.github/workflows/continuous.yaml index 41c3b8fa70..85e59e0bfa 100644 --- a/.github/workflows/continuous.yaml +++ b/.github/workflows/continuous.yaml @@ -15,7 +15,6 @@ jobs: contents: 'read' id-token: 'write' runs-on: ubuntu-latest - if: github.event.pull_request.draft == false strategy: matrix: app: [ web, node ] @@ -89,7 +88,6 @@ jobs: strategy: matrix: app: [ asset, linker ] - if: github.event.pull_request.draft == false steps: - uses: actions/checkout@v4 - name: Set up QEMU From e4859e4b8647856942f12cde039a62ccb9651375 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 3 Jun 2024 13:05:44 +0300 Subject: [PATCH 099/210] chore(reader header): removed className and fixed indentation --- static/js/Misc.jsx | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index 821e1f9c67..e9c5cb5cfc 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1350,7 +1350,7 @@ class DisplaySettingsButton extends Component { let style = this.props.placeholder ? {visibility: "hidden"} : {}; let icon; const altText = Sefaria._('Text display options') - const classes = classNames({readerOptionsTooltip: 1, "tooltip-toggle": true}); + const classes = "readerOptionsTooltip tooltip-toggle"; if (Sefaria._siteSettings.TORAH_SPECIFIC) { icon = @@ -1374,9 +1374,8 @@ class DisplaySettingsButton extends Component { onKeyPress={function(e) {e.charCode == 13 ? this.props.onClick(e):null}.bind(this)}> {icon} - - - ); + + ); } } DisplaySettingsButton.propTypes = { From b374a5ced3634beab428c4949e0f80fa4f92b98f Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 3 Jun 2024 14:28:30 +0300 Subject: [PATCH 100/210] refactor(version normalization): shorter conditions for default values of actualLanguage. 
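As a rough, hedged sketch of the bracket-notation fallback that the shorter form keeps. The helper name and sample titles are invented for the example; the real logic operates on Version attributes inside _normalize:

    import re

    def resolve_actual_language(version_title, base_language):
        # a trailing "[xx]" code in the version title wins; otherwise fall back
        # to the record's base language.
        match = re.search(r"\[([a-z]{2})\]$", version_title or "")
        actual = match.group(1) if match else None
        return actual or base_language

    print(resolve_actual_language("Version 1 TEST [fr]", "en"))  # -> "fr"
    print(resolve_actual_language("Plain Title", "he"))          # -> "he"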
--- sefaria/model/text.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index e5f3fb8540..f47f7122eb 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1372,9 +1372,7 @@ def _normalize(self): languageCode = re.search(r"\[([a-z]{2})\]$", versionTitle) if languageCode and languageCode.group(1): actualLanguage = languageCode.group(1) - if not actualLanguage: - actualLanguage = self.language - self.actualLanguage = actualLanguage + self.actualLanguage = actualLanguage or self.language if not hasattr(self, 'languageFamilyName'): self.languageFamilyName = constants.LANGUAGE_CODES.get(self.actualLanguage) or constants.LANGUAGE_CODES[self.language] From 714355d93f8e9cb851fd5510635521b59b8d19e2 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 4 Jun 2024 13:16:42 +0300 Subject: [PATCH 101/210] chore(autocomplete tests): changed locator to getByText to conform with new autocomplete rendering --- e2e-tests/tests/search.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e-tests/tests/search.spec.ts b/e2e-tests/tests/search.spec.ts index 3ea65428d6..692e27fabf 100644 --- a/e2e-tests/tests/search.spec.ts +++ b/e2e-tests/tests/search.spec.ts @@ -5,7 +5,7 @@ test('Search auto complete', async ({ context }) => { const page = await goToPageWithLang(context, '/'); await page.getByPlaceholder('Search').fill('אהבה'); await page.waitForSelector('text=אהבה', { state: 'visible' }); - await page.getByRole('option', { name: 'אהבה', exact: true }).click(); + await page.getByText('אהבה', { exact: true }).click(); await expect(page).toHaveTitle(/Love/); }); From 4b9ba7f550f7ed9e3e536f592c854ba5230aaba6 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 9 Jun 2024 15:47:14 +0300 Subject: [PATCH 102/210] fix(topics indexing): also catch AttributeError because timeperiod.start might not be defined --- sefaria/helper/topic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index e721a01072..19ef0de400 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -682,7 +682,7 @@ def calculate_other_ref_scores(ref_topic_map): try: tp = oref.index.best_time_period() year = int(tp.start) if tp else 3000 - except ValueError: + except (ValueError, AttributeError): year = 3000 comp_date_map[(topic, tref)] = year order_id_map[(topic, tref)] = oref.order_id() From c3dc541dc6802d3471ea3d08aae0c79c5a9fb200 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 10 Jun 2024 12:49:26 +0300 Subject: [PATCH 103/210] fix(version normalization): version with actualLanguage 'he' and without isSource should have isSource True. 
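A small sketch of the new default, using SimpleNamespace as a stand-in for the real Version record. The helper is invented for the example; attribute names follow the diff below:

    from types import SimpleNamespace

    def default_is_source(version):
        # when isSource is absent, assume Hebrew-language versions are source texts
        return getattr(version, "isSource", version.actualLanguage == "he")

    print(default_is_source(SimpleNamespace(actualLanguage="he")))                  # -> True
    print(default_is_source(SimpleNamespace(actualLanguage="fr")))                  # -> False
    print(default_is_source(SimpleNamespace(actualLanguage="he", isSource=False)))  # -> False, explicit value wins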
--- sefaria/model/text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 187fe3c7b4..7d5bf72d8f 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1387,7 +1387,7 @@ def _normalize(self): if not hasattr(self, 'languageFamilyName'): self.languageFamilyName = constants.LANGUAGE_CODES.get(self.actualLanguage) or constants.LANGUAGE_CODES[self.language] - self.isSource = getattr(self, "isSource", False) + self.isSource = getattr(self, "isSource", self.actualLanguage == 'he') if not hasattr(self, "isPrimary"): self.isPrimary = self.isSource or not VersionSet({'title': self.title}) #first version is primary if not hasattr(self, 'direction'): From cce5f4a5994c4df543add48515ef8c192cbc50f9 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 10 Jun 2024 12:50:56 +0300 Subject: [PATCH 104/210] test(version normalization): test version normalization. --- sefaria/model/tests/text_test.py | 51 +++++++++++++++++++------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 860a68edf3..eaecf9b989 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -743,53 +743,62 @@ def setup_class(cls): "sectionNames": ["Chapter", "Paragraph"], "categories": ["Musar"], }).save() - cls.versionWithTranslation = model.Version( + cls.firstTranslationVersion = model.Version( { - "chapter": cls.myIndex.nodes.create_skeleton(), + "chapter": [['1'], ['2'], ["original text", "2nd"]], "versionTitle": "Version 1 TEST [fr]", "versionSource": "blabla", - "language": "he", + "language": "en", "title": cls.myIndexTitle } - ) - cls.versionWithTranslation.chapter = [['1'], ['2'], ["original text", "2nd"]] - cls.versionWithTranslation.save() - cls.versionWithoutTranslation = model.Version( + ).save() + cls.sourceVersion = model.Version( { - "chapter": cls.myIndex.nodes.create_skeleton(), + "chapter":cls.myIndex.nodes.create_skeleton(), "versionTitle": "Version 1 TEST", "versionSource": "blabla", "language": "he", "title": cls.myIndexTitle } ) - cls.versionWithoutTranslation.chapter = [['1'], ['2'], ["original text", "2nd"]] - cls.versionWithoutTranslation.save() + cls.sourceVersion.chapter = [['1'], ['2'], ["original text", "2nd"]] + cls.sourceVersion.save() cls.versionWithLangCodeMismatch = model.Version( { "chapter": cls.myIndex.nodes.create_skeleton(), "versionTitle": "Version 1 TEST [ar]", "versionSource": "blabla", - "language": "he", - "actualLanguage": "fr", + "language": "en", + 'actualLanguage': 'fr', "title": cls.myIndexTitle } ) @classmethod def teardown_class(cls): - for c in [cls.myIndex, cls.versionWithTranslation, cls.versionWithoutTranslation, cls.versionWithLangCodeMismatch]: + for c in [cls.myIndex, cls.sourceVersion, cls.firstTranslationVersion, cls.versionWithLangCodeMismatch]: try: c.delete() except Exception: pass - def test_normalizes_actualLanguage_from_brackets(self): - assert self.versionWithTranslation.actualLanguage == "fr" - - def test_normalizes_language_from_language(self): - assert self.versionWithoutTranslation.actualLanguage == "he" + def test_normalize(self): + assert self.firstTranslationVersion.actualLanguage == 'fr' + assert self.firstTranslationVersion.direction == 'ltr' + assert self.firstTranslationVersion.languageFamilyName == 'french' + assert self.firstTranslationVersion.isPrimary is True + assert self.firstTranslationVersion.isSource is False + assert 
self.sourceVersion.actualLanguage == 'he' + assert self.sourceVersion.direction == 'rtl' + assert self.sourceVersion.languageFamilyName == 'hebrew' + assert self.sourceVersion.isSource is True + assert self.sourceVersion.isPrimary is True + self.versionWithLangCodeMismatch._normalize() + assert self.versionWithLangCodeMismatch.actualLanguage == 'fr' + assert self.versionWithLangCodeMismatch.direction == 'ltr' + assert self.versionWithLangCodeMismatch.languageFamilyName == 'french' + assert self.versionWithLangCodeMismatch.isSource is False + assert self.versionWithLangCodeMismatch.isPrimary is False + + - def test_save_when_language_mismatch(self): - self.versionWithLangCodeMismatch.save() - assert self.versionWithLangCodeMismatch.actualLanguage == "ar" \ No newline at end of file From c60f16639d87227f7a3329eb1860ceead9aa9a8b Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 10 Jun 2024 15:12:43 +0300 Subject: [PATCH 105/210] fix(search bar): enable search with input from virtual keyboard --- static/js/Autocomplete.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index 1bdb9bc916..5a212fd0d2 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -198,7 +198,7 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre const handleSearchButtonClick = (event) => { - const inputQuery = otherDownShiftProps.value + const inputQuery = otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; if (inputQuery) { submitSearch(inputQuery); } else { From 68ab19ac9a96a60bf6cb9d42d174fe8b1d0c2b19 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Mon, 10 Jun 2024 20:45:01 +0300 Subject: [PATCH 106/210] fix(History): handlePopState should clone state We eed to clone state and panels; if we don't clone them, when we run setState, it will make it so that this.state refers to the same object as history.state, and this.state.panels refers to the same object as history.state.panels, a situation which causes back button bugs --- static/js/ReaderApp.jsx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/static/js/ReaderApp.jsx b/static/js/ReaderApp.jsx index b1ee4e29aa..be4dcd15fd 100644 --- a/static/js/ReaderApp.jsx +++ b/static/js/ReaderApp.jsx @@ -282,8 +282,14 @@ class ReaderApp extends Component { } else { state.panels = []; } - this.setState(state, () => { - if (state.scrollPosition) { + + // need to clone state and panels; if we don't clone them, when we run setState, it will make it so that + // this.state.panels refers to the same object as history.state.panels, which cause back button bugs + const newState = {...state}; + newState.panels = newState.panels.map(panel => this.clonePanel(panel)); + + this.setState(newState, () => { + if (newState.scrollPosition) { $(".content").scrollTop(event.state.scrollPosition) .trigger("scroll"); } From 54a7aff440fca29ee945a5ac5a64064d53b2621a Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 13:17:56 +0300 Subject: [PATCH 107/210] fix(topics): fix issue where wrong link is edited when duplicate ref/slug pairs exist. Note, this is a patch which needs to make the assumption that you're interested in editing the link with the highest curatedPrimacy. 
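To illustrate the tie-breaking assumption, a sketch with plain dicts rather than the real RefTopicLink model. It approximates the Mongo sort on order.curatedPrimacy by taking the candidate with the highest English curatedPrimacy value; the helper name and sample data are invented:

    def pick_link_to_edit(candidate_links, lang="en"):
        if not candidate_links:
            return None
        return max(candidate_links,
                   key=lambda link: link.get("order", {}).get("curatedPrimacy", {}).get(lang, 0))

    links = [
        {"ref": "Genesis 1:1", "toTopic": "creation", "order": {}},
        {"ref": "Genesis 1:1", "toTopic": "creation", "order": {"curatedPrimacy": {"en": 10}}},
    ]
    print(pick_link_to_edit(links)["order"])  # -> {'curatedPrimacy': {'en': 10}}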
--- sefaria/helper/topic.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index e721a01072..9932f2f2b5 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1184,10 +1184,11 @@ def edit_topic_source(slug, orig_tref, new_tref="", creating_new_link=True, if topic_obj is None: return {"error": "Topic does not exist."} ref_topic_dict = {"toTopic": slug, "linkType": linkType, "ref": orig_tref} - link = RefTopicLink().load(ref_topic_dict) - link_already_existed = link is not None - if not link_already_existed: - link = RefTopicLink(ref_topic_dict) + # we don't know what link is being targeted b/c we don't know the dataSource. + # we'll guess that the most likely candidate is the link with the highest curatedPrimacy + link_set = RefTopicLinkSet(ref_topic_dict, sort=[["order.curatedPrimacy", -1]]).array() + link_already_existed = len(link_set) > 0 + link = link_set[0] if link_already_existed else RefTopicLink(ref_topic_dict) if not hasattr(link, 'order'): link.order = {} From d0f8625d51d1af25b284e2157d8843216b4d7e1a Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 13:41:23 +0300 Subject: [PATCH 108/210] fix(topics): remove filtering of sources by interfaceLanguage. curatedPrimacy can override language sorting so filtering out sources that don't exist in the interface language can lead to inconsistent sortings --- static/js/Misc.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index e9c5cb5cfc..cd99a78aed 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1177,7 +1177,7 @@ const ReorderEditorWrapper = ({toggle, type, data}) => { return []; } // a topic can be connected to refs in one language and not in another so filter out those that are not in current interface lang - refs = refs.filter((x) => !x.is_sheet && x?.order?.availableLangs?.includes(Sefaria.interfaceLang.slice(0, 2))); + refs = refs.filter((x) => !x.is_sheet); // then sort the refs and take only first 30 sources because admins don't want to reorder hundreds of sources return refs.sort((a, b) => refSort('relevance', [a.ref, a], [b.ref, b])).slice(0, 30); } From 7ffbda3daef167ecaafffea581d4de41b2ed428b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 13:44:35 +0300 Subject: [PATCH 109/210] refactor(topics): move language vars lower to where they're used --- static/js/TopicPage.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/static/js/TopicPage.jsx b/static/js/TopicPage.jsx index 51686b1a37..38b72453fe 100644 --- a/static/js/TopicPage.jsx +++ b/static/js/TopicPage.jsx @@ -104,8 +104,6 @@ const refSort = (currSortOption, a, b) => { return a.order.comp_date - b.order.comp_date; } else { - const aAvailLangs = a.order.availableLangs; - const bAvailLangs = b.order.availableLangs; if ((Sefaria.interfaceLang === 'english') && (a.order.curatedPrimacy.en > 0 || b.order.curatedPrimacy.en > 0)) { return b.order.curatedPrimacy.en - a.order.curatedPrimacy.en; } @@ -113,6 +111,8 @@ const refSort = (currSortOption, a, b) => { (a.order.curatedPrimacy.he > 0 || b.order.curatedPrimacy.he > 0)) { return b.order.curatedPrimacy.he - a.order.curatedPrimacy.he; } + const aAvailLangs = a.order.availableLangs; + const bAvailLangs = b.order.availableLangs; if (Sefaria.interfaceLang === 'english' && aAvailLangs.length !== bAvailLangs.length) { if (aAvailLangs.indexOf('en') > -1) { return -1; } if (bAvailLangs.indexOf('en') > -1) { return 1; } 
From e14bc5aaa022fc6e7a7761e9112b2eab65da2bff Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 13:45:05 +0300 Subject: [PATCH 110/210] chore(topics): remove unused attribute custom_order. There are no documents with this attribute --- static/js/TopicPage.jsx | 1 - 1 file changed, 1 deletion(-) diff --git a/static/js/TopicPage.jsx b/static/js/TopicPage.jsx index 38b72453fe..44c25fc2b0 100644 --- a/static/js/TopicPage.jsx +++ b/static/js/TopicPage.jsx @@ -118,7 +118,6 @@ const refSort = (currSortOption, a, b) => { if (bAvailLangs.indexOf('en') > -1) { return 1; } return 0; } - else if (a.order.custom_order !== b.order.custom_order) { return b.order.custom_order - a.order.custom_order; } // custom_order, when present, should trump other data else if (a.order.pr !== b.order.pr) { return b.order.pr - a.order.pr; } From 8bd281ab1b20ae75c356265aea5887d8bf5c6431 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 14:44:54 +0300 Subject: [PATCH 111/210] fix(topics): make server-side topic link relevance sort dependent on interface lang, just like on the client side. This should avoid some strange inconsistencies seen in sorting. --- reader/views.py | 12 ++++++------ sefaria/helper/topic.py | 34 +++++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 17 deletions(-) diff --git a/reader/views.py b/reader/views.py index e26dd0ee0a..45580457ec 100644 --- a/reader/views.py +++ b/reader/views.py @@ -2167,7 +2167,7 @@ def related_api(request, tref): "sheets": get_sheets_for_ref(tref), "notes": [], # get_notes(oref, public=True) # Hiding public notes for now "webpages": get_webpages_for_ref(tref), - "topics": get_topics_for_ref(tref, annotate=True), + "topics": get_topics_for_ref(tref, request.interfaceLang, annotate=True), "manuscripts": ManuscriptPageSet.load_set_for_client(tref), "media": get_media_for_ref(tref), "guides": GuideSet.load_set_for_client(tref) @@ -3057,7 +3057,7 @@ def topic_page(request, topic, test_version=None): "en": topic_obj.get_primary_title('en'), "he": topic_obj.get_primary_title('he') }, - "topicData": _topic_page_data(topic), + "topicData": _topic_page_data(topic, request.interfaceLang), } if test_version is not None: @@ -3167,7 +3167,7 @@ def topics_api(request, topic, v2=False): annotate_time_period = bool(int(request.GET.get("annotate_time_period", False))) with_indexes = bool(int(request.GET.get("with_indexes", False))) ref_link_type_filters = set(filter(lambda x: len(x) > 0, request.GET.get("ref_link_type_filters", "").split("|"))) - response = get_topic(v2, topic, with_html=with_html, with_links=with_links, annotate_links=annotate_links, with_refs=with_refs, group_related=group_related, annotate_time_period=annotate_time_period, ref_link_type_filters=ref_link_type_filters, with_indexes=with_indexes) + response = get_topic(v2, topic, request.interfaceLang, with_html=with_html, with_links=with_links, annotate_links=annotate_links, with_refs=with_refs, group_related=group_related, annotate_time_period=annotate_time_period, ref_link_type_filters=ref_link_type_filters, with_indexes=with_indexes) return jsonResponse(response, callback=request.GET.get("callback", None)) elif request.method == "POST": if not request.user.is_staff: @@ -3253,7 +3253,7 @@ def topic_ref_api(request, tref): annotate = bool(int(data.get("annotate", False))) if request.method == "GET": - response = get_topics_for_ref(tref, annotate) + response = get_topics_for_ref(tref, request.interfaceLang, annotate) return jsonResponse(response, 
callback=request.GET.get("callback", None)) else: if not request.user.is_staff: @@ -3279,8 +3279,8 @@ def reorder_sources(request): 'authors': ['popular-writing-of'], } -def _topic_page_data(topic): - _topic_data(topic=topic, annotate_time_period=True) +def _topic_page_data(topic, lang): + _topic_data(topic=topic, lang=lang, annotate_time_period=True) def _topic_data(**kwargs): diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 9932f2f2b5..629854a83c 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -3,7 +3,7 @@ from pymongo import UpdateOne, InsertOne from typing import Optional, Union from collections import defaultdict -from functools import cmp_to_key +from functools import cmp_to_key, partial from sefaria.model import * from sefaria.model.place import process_topic_place_change from sefaria.system.exceptions import InputError @@ -16,7 +16,7 @@ from sefaria.helper.descriptions import create_era_link logger = structlog.get_logger(__name__) -def get_topic(v2, topic, with_html=True, with_links=True, annotate_links=True, with_refs=True, group_related=True, annotate_time_period=False, ref_link_type_filters=None, with_indexes=True): +def get_topic(v2, topic, lang, with_html=True, with_links=True, annotate_links=True, with_refs=True, group_related=True, annotate_time_period=False, ref_link_type_filters=None, with_indexes=True): topic_obj = Topic.init(topic) if topic_obj is None: return {} @@ -45,7 +45,7 @@ def get_topic(v2, topic, with_html=True, with_links=True, annotate_links=True, w if with_links: response['links'] = group_links_by_type('intraTopic', intra_links, annotate_links, group_related) if with_refs: - ref_links = sort_and_group_similar_refs(ref_links) + ref_links = sort_and_group_similar_refs(ref_links, lang) if v2: ref_links = group_links_by_type('refTopic', ref_links, False, False) response['refs'] = ref_links @@ -169,8 +169,8 @@ def iterate_and_merge(new_ref_links, new_link, subset_ref_map, temp_subset_refs) new_ref_links[index] = merge_props_for_similar_refs(new_ref_links[index], new_link) return new_ref_links -def sort_and_group_similar_refs(ref_links): - ref_links.sort(key=cmp_to_key(sort_refs_by_relevance)) +def sort_and_group_similar_refs(ref_links, lang): + ref_links.sort(key=cmp_to_key(partial(sort_refs_by_relevance, lang=lang))) subset_ref_map = defaultdict(list) new_ref_links = [] for link in ref_links: @@ -233,15 +233,27 @@ def get_topic_by_parasha(parasha:str) -> Topic: return Topic().load({"parasha": parasha}) -def sort_refs_by_relevance(a, b): +def sort_refs_by_relevance(a, b, lang="english"): + """ + This function should mimic behavior of `refSort` in TopicPage.jsx. + @param lang: language to sort by. Defaults to "english". 
+ @return: + """ aord = a.get('order', {}) bord = b.get('order', {}) + def curated_primacy(order_dict, lang): + return order_dict.get("curatedPrimacy", {}).get(lang, 0) + if not aord and not bord: return 0 if bool(aord) != bool(bord): - return len(bord) - len(aord) - if aord.get("curatedPrimacy") or bord.get("curatedPrimacy"): - return len(bord.get("curatedPrimacy", {})) - len(aord.get("curatedPrimacy", {})) + return int(bool(bord)) - int(bool(aord)) + for curr_lang in ("english", "hebrew"): + short_lang = curr_lang[:2] + aprimacy = curated_primacy(aord, short_lang) + bprimacy = curated_primacy(bord, short_lang) + if lang == curr_lang and (aprimacy > 0 or bprimacy > 0): + return bprimacy - aprimacy if aord.get('pr', 0) != bord.get('pr', 0): return bord.get('pr', 0) - aord.get('pr', 0) return (bord.get('numDatasource', 0) * bord.get('tfidf', 0)) - (aord.get('numDatasource', 0) * aord.get('tfidf', 0)) @@ -318,7 +330,7 @@ def ref_topic_link_prep(link): link['dataSource']['slug'] = data_source_slug return link -def get_topics_for_ref(tref, annotate=False): +def get_topics_for_ref(tref, lang="english", annotate=False): serialized = [l.contents() for l in Ref(tref).topiclinkset()] if annotate: if len(serialized) > 0: @@ -329,7 +341,7 @@ def get_topics_for_ref(tref, annotate=False): for link in serialized: ref_topic_link_prep(link) - serialized.sort(key=cmp_to_key(sort_refs_by_relevance)) + serialized.sort(key=cmp_to_key(partial(sort_refs_by_relevance, lang=lang))) return serialized From c025856a22f539f3294fe399704302acbe36bf9c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 14:50:25 +0300 Subject: [PATCH 112/210] refactor(topics): remove stupid for loop --- sefaria/helper/topic.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 629854a83c..c1b7ff34cf 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -248,12 +248,11 @@ def curated_primacy(order_dict, lang): return 0 if bool(aord) != bool(bord): return int(bool(bord)) - int(bool(aord)) - for curr_lang in ("english", "hebrew"): - short_lang = curr_lang[:2] - aprimacy = curated_primacy(aord, short_lang) - bprimacy = curated_primacy(bord, short_lang) - if lang == curr_lang and (aprimacy > 0 or bprimacy > 0): - return bprimacy - aprimacy + short_lang = lang[:2] + aprimacy = curated_primacy(aord, short_lang) + bprimacy = curated_primacy(bord, short_lang) + if aprimacy > 0 or bprimacy > 0: + return bprimacy - aprimacy if aord.get('pr', 0) != bord.get('pr', 0): return bord.get('pr', 0) - aord.get('pr', 0) return (bord.get('numDatasource', 0) * bord.get('tfidf', 0)) - (aord.get('numDatasource', 0) * aord.get('tfidf', 0)) From a1b797464f417878a39549fb417f090b7b0ea649 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 15:14:55 +0300 Subject: [PATCH 113/210] fix(topics): fix fallacy in how availableLangs works. When deleting a link, you just want to delete it. 
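For clarity, a toy sketch of the collapsed delete path. StubLink is a stand-in object invented for the example, not the real RefTopicLink:

    class StubLink:
        def can_delete(self):
            return True

        def delete(self):
            print("link deleted")

    def delete_ref_topic_link(link):
        if link.can_delete():
            link.delete()
            return {"status": "ok"}
        return {"error": "Cannot delete link."}

    print(delete_ref_topic_link(StubLink()))  # prints "link deleted", then {'status': 'ok'}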
--- sefaria/helper/topic.py | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index c1b7ff34cf..43022e9eb8 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1297,20 +1297,11 @@ def delete_ref_topic_link(tref, to_topic, link_type, lang): if link is None: return {"error": f"Link between {tref} and {to_topic} doesn't exist."} - if lang in link.order.get('availableLangs', []): - link.order['availableLangs'].remove(lang) - if lang in link.order.get('curatedPrimacy', []): - link.order['curatedPrimacy'].pop(lang) - - if len(link.order.get('availableLangs', [])) > 0: - link.save() + if link.can_delete(): + link.delete() return {"status": "ok"} - else: # deleted in both hebrew and english so delete link object - if link.can_delete(): - link.delete() - return {"status": "ok"} - else: - return {"error": f"Cannot delete link between {tref} and {to_topic}."} + else: + return {"error": f"Cannot delete link between {tref} and {to_topic}."} def add_image_to_topic(topic_slug, image_uri, en_caption, he_caption): From f6077668ef456e1a585eef7cebd20a5af5179371 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 15:16:49 +0300 Subject: [PATCH 114/210] fix(topics): remove unnecessary validation of availableLangs --- sefaria/helper/topic.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 43022e9eb8..a85962a53d 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1265,8 +1265,6 @@ def update_order_of_topic_sources(topic, sources, uid, lang='en'): if link is None: return {"error": f"Link between {topic} and {s['ref']} doesn't exist."} order = getattr(link, 'order', {}) - if lang not in order.get('availableLangs', []) : - return {"error": f"Link between {topic} and {s['ref']} does not exist in '{lang}'."} ref_to_link[s['ref']] = link # now update curatedPrimacy data From 7b20c7c0766973911f7c51d7616f3f13498472b5 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 19:00:02 +0300 Subject: [PATCH 115/210] fix(topics): use tabs which is used to render the sources on the topics page after some processing. --- static/js/Misc.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index cd99a78aed..f5645ee130 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1186,7 +1186,7 @@ const ReorderEditorWrapper = ({toggle, type, data}) => { return { url: `/api/source/reorder?topic=${data.slug}&lang=${Sefaria.interfaceLang}`, redirect: `/topics/${data.slug}`, - origItems: _filterAndSortRefs(data.refs?.about?.refs) || [], + origItems: _filterAndSortRefs(data.tabs?.sources?.refs) || [], } } switch (type) { // at /texts or /topics From ae44354af10cf02855e9f9c5966382eeb277819a Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 19:01:35 +0300 Subject: [PATCH 116/210] Revert "fix(topics): fix fallacy in how availableLangs works." Didn't realize that there is some behavior here that's desirable even though it doesn't work exactly like the spec. This reverts commit a1b797464f417878a39549fb417f090b7b0ea649. 
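A plain-dict sketch of the per-language behavior being restored; the helper and sample data are invented, while the real code mutates a RefTopicLink's order field. Removing one interface language only strips that language from the link's order data, and the link object is deleted only once no language still uses it:

    def remove_language_from_link(order, lang):
        if lang in order.get("availableLangs", []):
            order["availableLangs"].remove(lang)
        order.get("curatedPrimacy", {}).pop(lang, None)
        # True means another language still uses the link, so keep it
        return len(order.get("availableLangs", [])) > 0

    order = {"availableLangs": ["en", "he"], "curatedPrimacy": {"en": 5}}
    print(remove_language_from_link(order, "en"))  # -> True (Hebrew still uses the link)
    print(remove_language_from_link(order, "he"))  # -> False (nothing left; safe to delete)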
--- sefaria/helper/topic.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index a85962a53d..caf03c332f 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1295,11 +1295,20 @@ def delete_ref_topic_link(tref, to_topic, link_type, lang): if link is None: return {"error": f"Link between {tref} and {to_topic} doesn't exist."} - if link.can_delete(): - link.delete() + if lang in link.order.get('availableLangs', []): + link.order['availableLangs'].remove(lang) + if lang in link.order.get('curatedPrimacy', []): + link.order['curatedPrimacy'].pop(lang) + + if len(link.order.get('availableLangs', [])) > 0: + link.save() return {"status": "ok"} - else: - return {"error": f"Cannot delete link between {tref} and {to_topic}."} + else: # deleted in both hebrew and english so delete link object + if link.can_delete(): + link.delete() + return {"status": "ok"} + else: + return {"error": f"Cannot delete link between {tref} and {to_topic}."} def add_image_to_topic(topic_slug, image_uri, en_caption, he_caption): From 5a66db6b7f4d3394ac4ab7c0016dbc031cd5cf8e Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 13 Jun 2024 22:38:30 +0300 Subject: [PATCH 117/210] docs(topics): add docs to complex sorting function and `get_topic()` --- sefaria/helper/topic.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index caf03c332f..33c28e1f16 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -17,6 +17,22 @@ logger = structlog.get_logger(__name__) def get_topic(v2, topic, lang, with_html=True, with_links=True, annotate_links=True, with_refs=True, group_related=True, annotate_time_period=False, ref_link_type_filters=None, with_indexes=True): + """ + Helper function for api/topics/ + TODO fill in rest of parameters + @param v2: + @param topic: slug of topic to get data for + @param lang: the language of the user to sort the ref links by + @param with_html: True if description should be returned with HTML. If false, HTML is stripped. + @param with_links: Should intra-topic links be returned. If true, return dict has a `links` key + @param annotate_links: + @param with_refs: + @param group_related: + @param annotate_time_period: + @param ref_link_type_filters: + @param with_indexes: + @return: + """ topic_obj = Topic.init(topic) if topic_obj is None: return {} @@ -236,6 +252,12 @@ def get_topic_by_parasha(parasha:str) -> Topic: def sort_refs_by_relevance(a, b, lang="english"): """ This function should mimic behavior of `refSort` in TopicPage.jsx. + It is a comparison function that takes two items from the list and returns the corresponding integer to indicate which should go first. To be used with `cmp_to_key`. + It considers the following criteria in order: + - If one object has an `order` key and another doesn't, the one with the `order` key comes first + - curatedPrimacy, higher comes first + - pagerank, higher comes first + - numDatasource (how many distinct links have this ref/topic pair) multiplied by tfidf (a bit complex, in short how "central" to this topic is the vocab used in this ref), higher comes first @param lang: language to sort by. Defaults to "english". 
     @return:
     """
From 1768df0f609c6f62f2cca8276963212361fa1a10 Mon Sep 17 00:00:00 2001
From: Skyler C
Date: Fri, 14 Jun 2024 04:46:01 -0400
Subject: [PATCH 118/210] Update English about.html for 2023 updates

---
 templates/static/en/about.html | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/templates/static/en/about.html b/templates/static/en/about.html
index 11cf8b0a87..6aa11f3d12 100644
--- a/templates/static/en/about.html
+++ b/templates/static/en/about.html
@@ -272,6 +272,18 @@

Sefaria adds a French Jerusalem Talmud to its collection of translated texts, which includes a German translation of the Babylonian Talmud. +
+

2023

+
+ Against the somber backdrop of the October 7 attacks on Israel, Jews worldwide turn to Sefaria and October user traffic rises to 899,295, demonstrating the power and enduring need for Torah among Jews everywhere. +
+
+ Sefaria’s R&D arm, Sefaria Ventures, launches a groundbreaking partnership with AppliedAI and the Technical University of Munich (TUM) to explore the possibilities of leveraging AI to significantly expand access to Torah. +
+
+ Sefaria partners with the Steinsaltz Center and Aleph Society to launch a digital collection of Rabbi Adin Steinsaltz’s complete works of commentary, making the renowned Rabbi’s writings available to all who wish to learn. +
+
From 0f8f045dbec2da14b6c9f90ba1f40ee407ec3134 Mon Sep 17 00:00:00 2001
From: Skyler C
Date: Fri, 14 Jun 2024 05:03:48 -0400
Subject: [PATCH 119/210] Update Hebrew about.html for 2023 updates

---
 templates/static/he/about.html | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/templates/static/he/about.html b/templates/static/he/about.html
index a48e15d850..e9137c626f 100644
--- a/templates/static/he/about.html
+++ b/templates/static/he/about.html
@@ -279,6 +279,18 @@

ספריא מוסיפה את התלמוד הירושלמי בצרפתית לאוסף המקורות המתורגמים, הכולל תרגום של התלמוד הבבלי לגרמנית. +
+

2023

+
+ בשבועות הקשות שלאחר השבת השחורה של ה-7 באוקטובר, יהודים בכל העולם פונים לספריא. כמות המשתמשים באתר וביישומון עולה ל-889,295, מספר המעיד על כוחם של מקורות היהדות לחזק ולשמש עוגן ליהודים בכל רחבי תבל. +
+
+ מחלקת המחקר והפיתוח של ספריא משיקה שותפות פורצת דרך עם AppliedAI והאוניברסיטה הטכנית של מינכן (TUM). מטרת השותפות היא בחינת האפשרויות הטמונות במינוף של אינטליגנציה מלאכותית ככלי להרחבה משמעותית של גישה ציבורית לתורה. +
+
+ ספריא יוצרת שותפות עם מרכז שטיינזלץ וה-Aleph Society כדי להשיק אוסף דיגיטלי של כל הפרשנויות שכתב הרב עדין שטיינזלץ, ובכך להנגיש את כלל כתביו של הרב הנודע לכל לומד או לומדת באשר יהיו. +
+
From 77d16be563a7050723ef5b4cb76af765087cad91 Mon Sep 17 00:00:00 2001
From: Skyler C
Date: Fri, 14 Jun 2024 05:13:06 -0400
Subject: [PATCH 120/210] chore: Fix link to annual impact report

---
 templates/static/link-to-annual-report.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/templates/static/link-to-annual-report.html b/templates/static/link-to-annual-report.html
index fb4e72013a..111388ad80 100644
--- a/templates/static/link-to-annual-report.html
+++ b/templates/static/link-to-annual-report.html
@@ -15,7 +15,7 @@

Annual Report דו"ח שנתי

{% endif %} - + From c5e81ed951c5c15300a6ecfc7cd5ec0b1fd3bcdc Mon Sep 17 00:00:00 2001 From: Skyler C Date: Fri, 14 Jun 2024 06:11:52 -0400 Subject: [PATCH 121/210] chore: Fix teams page view on mobile devices --- static/css/static.css | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/static/css/static.css b/static/css/static.css index 3ba84ceb05..5dc57ffe1f 100644 --- a/static/css/static.css +++ b/static/css/static.css @@ -814,6 +814,17 @@ p.registration-links a:hover{ width: 200px; content: ""; } + +@media (max-width: 450px) { + #teamPage .team-members { + justify-content: center; + } + + #teamPage .team-members::after { + width: auto !important; + } +} + #teamPage .teamMember { flex: 0 0 30%; } @@ -3663,4 +3674,4 @@ form.globalUpdateForm + div.notificationsList { } .about.section { padding-top: 40px; -} \ No newline at end of file +} From f5d07ab1b526c5c3e8b3c54fc3aceb2824c7c39e Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Sun, 16 Jun 2024 11:22:34 +0300 Subject: [PATCH 122/210] fix(Topic Editor): allow 300 character captions --- static/js/TopicEditor.jsx | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index a916a71685..88fc3f1092 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -93,12 +93,12 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { alert(Sefaria._("Title must be provided.")); return false; } - if (data.enImgCaption.length > 150) { - alert("English caption is too long. It should not be more than 150 characters"); + if (data.enImgCaption.length > 300) { + alert("English caption is too long. It should not be more than 300 characters"); return false; } - if (data.heImgCaption.length > 150) { - alert("Hebrew caption is too long. It should not be more than 150 characters") + if (data.heImgCaption.length > 300) { + alert("Hebrew caption is too long. It should not be more than 300 characters") return false; } if (sortedSubtopics.length > 0 && !isNew) { From bc2b7e932144654b8d536028486325b2d67ab975 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 18 Jun 2024 12:27:47 +0300 Subject: [PATCH 123/210] fix(searchbar): enable search by clicking Enter in virtual keyboard --- static/js/Autocomplete.jsx | 4 ++-- static/js/lib/keyboard.js | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index 5a212fd0d2..d43cd83b3d 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -184,14 +184,14 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre const { onBlur, onKeyDown, ...otherDownShiftProps } = getInputProps(); const handleSearchKeyDown = (event) => { - onKeyDown(event) + onKeyDown(event); if (event.keyCode !== 13) return; const highlightedItem = highlightedIndex > -1 ? 
suggestions[highlightedIndex] : null if (highlightedItem && highlightedItem.type != 'search'){ redirectToObject(highlightedItem); return; } - const inputQuery = otherDownShiftProps.value + const inputQuery = otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; if (!inputQuery) return; submitSearch(inputQuery); }; diff --git a/static/js/lib/keyboard.js b/static/js/lib/keyboard.js index c90c35f00c..0f3cc4353a 100644 --- a/static/js/lib/keyboard.js +++ b/static/js/lib/keyboard.js @@ -1486,6 +1486,8 @@ var VKI_attach, VKI_close; break; case "Enter": VKI_addListener(td, 'click', function() { + let element = document.querySelector('[vki_attached="true"]'); + element.dispatchEvent(new KeyboardEvent('keydown', {key: 'Enter', code: 'Enter', keyCode: 13, which: 13, bubbles: true, cancelable: true})); if (self.VKI_target.nodeName != "TEXTAREA") { if (self.VKI_enterSubmit && self.VKI_target.form) { for (var z = 0, subm = false; z < self.VKI_target.form.elements.length; z++) From d6bfddb717ffe16b546dd136144cf8121ef9b202 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 18 Jun 2024 14:04:06 +0300 Subject: [PATCH 124/210] fix(searchbar): prevent deletion of input inserted via virtual-keyboard on blur using downshift setInputValue function --- static/js/Autocomplete.jsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index d43cd83b3d..dc23b2645b 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -174,7 +174,7 @@ const EntitySearchSuggestion = ({label, onClick, type, url, ...props}) => { ); } -const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebrewKeyboard, +const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebrewKeyboard, setInputValue, setSearchFocused, searchFocused, submitSearch, redirectToObject}) => { @@ -224,12 +224,14 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre const blurSearch = (e) => { onBlur(e); + const oldValue = document.getElementsByClassName('keyboardInput')[0].value; const parent = document.getElementById('searchBox'); if (!parent.contains(e.relatedTarget) && !document.getElementById('keyboardInputMaster')) { // debug: comment out the following line: setSearchFocused(false); showVirtualKeyboardIcon(false); } + setInputValue(oldValue) }; const inputClasses = classNames({ @@ -353,6 +355,7 @@ const SuggestionsGroup = ({ suggestions, initialIndexForGroup, getItemProps, hig getInputProps, getItemProps, highlightedIndex, + setInputValue } = useCombobox({ items: suggestions, itemToString: (item) => (item ? 
item.name : ''), @@ -465,6 +468,7 @@ const SuggestionsGroup = ({ suggestions, initialIndexForGroup, getItemProps, hig suggestions={suggestions} hideHebrewKeyboard={hideHebrewKeyboard} highlightedIndex={highlightedIndex} + setInputValue={setInputValue} setSearchFocused={setSearchFocused} searchFocused={searchFocused} From 9e2525864a99371fd4858623abc1f20a0b412e92 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 18 Jun 2024 15:26:51 +0300 Subject: [PATCH 125/210] fix(topics): make sure we can only reorder learning team links --- sefaria/helper/topic.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index cfa8a9edb0..561357e49b 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1283,9 +1283,11 @@ def update_order_of_topic_sources(topic, sources, uid, lang='en'): Ref(s['ref']).normal() except InputError as e: return {"error": f"Invalid ref {s['ref']}"} - link = RefTopicLink().load({"toTopic": topic, "linkType": "about", "ref": s['ref']}) + link = RefTopicLink().load({"toTopic": topic, "linkType": "about", "ref": s['ref'], "dataSource": "learning-team"}) if link is None: - return {"error": f"Link between {topic} and {s['ref']} doesn't exist."} + # for now, we are focusing on learning team links and the lack of existence isn't considered an error + continue + # return {"error": f"Link between {topic} and {s['ref']} doesn't exist."} order = getattr(link, 'order', {}) ref_to_link[s['ref']] = link From 73dbe22f9a2343bbb4ee6f1648af299fa57388d3 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 18 Jun 2024 15:29:08 +0300 Subject: [PATCH 126/210] fix(topics): make sure we can only delete learning-team links --- sefaria/helper/topic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 561357e49b..b4098d7c2e 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1314,10 +1314,10 @@ def delete_ref_topic_link(tref, to_topic, link_type, lang): if Topic.init(to_topic) is None: return {"error": f"Topic {to_topic} doesn't exist."} - topic_link = {"toTopic": to_topic, "linkType": link_type, 'ref': tref} + topic_link = {"toTopic": to_topic, "linkType": link_type, 'ref': tref, "dataSource": "learning-team"} link = RefTopicLink().load(topic_link) if link is None: - return {"error": f"Link between {tref} and {to_topic} doesn't exist."} + return {"error": f"A learning-team link between {tref} and {to_topic} doesn't exist. If you are trying to delete a non-learning-team link, reach out to the engineering team."} if lang in link.order.get('availableLangs', []): link.order['availableLangs'].remove(lang) From 24a6cee3ff68017d7a22b0013174f5ba4c21b0d6 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 18 Jun 2024 15:31:28 +0300 Subject: [PATCH 127/210] fix(topics): make sure we can only edit learning-team links --- sefaria/helper/topic.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index b4098d7c2e..b1642cc3d5 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1216,12 +1216,11 @@ def edit_topic_source(slug, orig_tref, new_tref="", creating_new_link=True, topic_obj = Topic.init(slug) if topic_obj is None: return {"error": "Topic does not exist."} - ref_topic_dict = {"toTopic": slug, "linkType": linkType, "ref": orig_tref} - # we don't know what link is being targeted b/c we don't know the dataSource. 
- # we'll guess that the most likely candidate is the link with the highest curatedPrimacy - link_set = RefTopicLinkSet(ref_topic_dict, sort=[["order.curatedPrimacy", -1]]).array() - link_already_existed = len(link_set) > 0 - link = link_set[0] if link_already_existed else RefTopicLink(ref_topic_dict) + ref_topic_dict = {"toTopic": slug, "linkType": linkType, "ref": orig_tref, "dataSource": "learning-team"} + link = RefTopicLink().load(ref_topic_dict) + link_already_existed = link is not None + if not link_already_existed: + link = RefTopicLink(ref_topic_dict) if not hasattr(link, 'order'): link.order = {} From 1172187b023906be7c5c0d406782bb23e3370e13 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 18 Jun 2024 15:47:20 +0300 Subject: [PATCH 128/210] fix(topics): add check if link doesnt exist. This can now happen that we are filtering out none learning-team links --- sefaria/helper/topic.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index b1642cc3d5..268c3328ad 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1292,7 +1292,9 @@ def update_order_of_topic_sources(topic, sources, uid, lang='en'): # now update curatedPrimacy data for display_order, s in enumerate(sources[::-1]): - link = ref_to_link[s['ref']] + link = ref_to_link.get(s['ref']) + if not link: + continue order = getattr(link, 'order', {}) curatedPrimacy = order.get('curatedPrimacy', {}) curatedPrimacy[lang] = display_order From 5625fdfc598b6664e6ebe0a3744f4cc28bd2c897 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 18 Jun 2024 15:54:40 +0300 Subject: [PATCH 129/210] fix(searchbar): cleaner code, plus avoid setValueInput when vkeyboard not open --- static/js/Autocomplete.jsx | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index dc23b2645b..d029830495 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -178,6 +178,12 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre setSearchFocused, searchFocused, submitSearch, redirectToObject}) => { + const getInputValue = () =>{ + return otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; + } + const getVirtualKeyboardInputValue = () =>{ + return document.getElementsByClassName('keyboardInput')[0].value; + } useEffect(() => { showVirtualKeyboardIcon(false); // Initially hide the virtual keyboard icon }, []); @@ -191,14 +197,14 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre redirectToObject(highlightedItem); return; } - const inputQuery = otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; + const inputQuery = getInputValue(); if (!inputQuery) return; submitSearch(inputQuery); }; const handleSearchButtonClick = (event) => { - const inputQuery = otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; + const inputQuery = getInputValue(); if (inputQuery) { submitSearch(inputQuery); } else { @@ -224,14 +230,14 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre const blurSearch = (e) => { onBlur(e); - const oldValue = document.getElementsByClassName('keyboardInput')[0].value; + const oldValue = getVirtualKeyboardInputValue(); const parent = document.getElementById('searchBox'); if (!parent.contains(e.relatedTarget) && !document.getElementById('keyboardInputMaster')) { // debug: 
comment out the following line: setSearchFocused(false); showVirtualKeyboardIcon(false); } - setInputValue(oldValue) + !document.getElementById('keyboardInputMaster') && setInputValue(oldValue) }; const inputClasses = classNames({ From e996d9c4fd9bbc3ae395b945c75075fc29c88960 Mon Sep 17 00:00:00 2001 From: Brendan Galloway Date: Tue, 18 Jun 2024 16:28:17 +0200 Subject: [PATCH 130/210] ci: build on merges to master --- .github/workflows/continuous.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/continuous.yaml b/.github/workflows/continuous.yaml index 85e59e0bfa..257f2ab995 100644 --- a/.github/workflows/continuous.yaml +++ b/.github/workflows/continuous.yaml @@ -2,6 +2,9 @@ name: Continuous on: pull_request: merge_group: + push: + branches: + - master concurrency: group: ${{ github.ref }} @@ -9,7 +12,7 @@ concurrency: jobs: build-generic: - if: ${{ github.event_name == 'pull_request' }} + if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} name: "Continuous Image Build" permissions: contents: 'read' @@ -77,7 +80,7 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-derived: - if: ${{ github.event_name == 'pull_request' }} + if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} name: "Continuous Image Build Stage 2" runs-on: ubuntu-latest permissions: From c359ea47d79015a9e1970f96062af69cb393e6a0 Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Tue, 18 Jun 2024 22:04:50 -0400 Subject: [PATCH 131/210] Add embed code for GiveFreely modal --- templates/base.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/templates/base.html b/templates/base.html index 13238c6d3e..dcca2c1566 100644 --- a/templates/base.html +++ b/templates/base.html @@ -151,6 +151,8 @@ {% endif %} + + From f64264a5f12fe4eb7085a4e3c5cf60becfbd341a Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Wed, 19 Jun 2024 09:37:44 +0300 Subject: [PATCH 132/210] tests(Version): tests for version normalization. 
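For context, a hypothetical sketch of what the normalization under test is expected to derive; the attribute names come from the assertions below, but the mapping itself is illustrative only (isPrimary and isSource are set separately):

    LANG_INFO = {"he": ("rtl", "hebrew"), "fr": ("ltr", "french")}

    def normalize_version(version):
        # direction and languageFamilyName follow from actualLanguage
        direction, family = LANG_INFO[version.actualLanguage]
        version.direction = direction
        version.languageFamilyName = family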
--- sefaria/model/tests/text_test.py | 45 +++++++++++++++++++------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index eaecf9b989..9efa426a9a 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -783,22 +783,31 @@ def teardown_class(cls): pass def test_normalize(self): - assert self.firstTranslationVersion.actualLanguage == 'fr' - assert self.firstTranslationVersion.direction == 'ltr' - assert self.firstTranslationVersion.languageFamilyName == 'french' - assert self.firstTranslationVersion.isPrimary is True - assert self.firstTranslationVersion.isSource is False - assert self.sourceVersion.actualLanguage == 'he' - assert self.sourceVersion.direction == 'rtl' - assert self.sourceVersion.languageFamilyName == 'hebrew' - assert self.sourceVersion.isSource is True - assert self.sourceVersion.isPrimary is True + expected_attrs = { + 'firstTranslationVersion': { + 'actualLanguage': 'fr', + 'direction': 'ltr', + 'languageFamilyName': 'french', + 'isPrimary': True, + 'isSource': False, + }, + 'sourceVersion': { + 'actualLanguage': 'he', + 'direction': 'rtl', + 'languageFamilyName': 'hebrew', + 'isPrimary': True, + 'isSource': True, + }, + 'versionWithLangCodeMismatch': { + 'actualLanguage': 'fr', + 'direction': 'ltr', + 'languageFamilyName': 'french', + 'isPrimary': False, + 'isSource': False, + }, + } self.versionWithLangCodeMismatch._normalize() - assert self.versionWithLangCodeMismatch.actualLanguage == 'fr' - assert self.versionWithLangCodeMismatch.direction == 'ltr' - assert self.versionWithLangCodeMismatch.languageFamilyName == 'french' - assert self.versionWithLangCodeMismatch.isSource is False - assert self.versionWithLangCodeMismatch.isPrimary is False - - - + for version_key in expected_attrs: + version = getattr(self, version_key) + for attr in expected_attrs[version_key]: + assert getattr(version, attr) == expected_attrs[version_key][attr] From d4f4379f7b570fb4907029cec3e62f104431eaa6 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 20 Jun 2024 12:55:50 +0300 Subject: [PATCH 133/210] fix(topics): overwrite RefTopicLink._sanitize. Default inherited from abstract.py mangles Refs with ampersands in them. The new implementation avoids that and instead sanitizes title and prompt which can actually be vectors for attack. 
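To illustrate the problem with the inherited sanitizer: HTML-escaping a ref turns characters such as `&` into entities, so ref fields must be left alone, while the human-editable description fields are the ones worth cleaning. A minimal sketch (the ref value is hypothetical):

    import bleach

    ref = "Bereshit Rabbah 1&2"    # hypothetical ref containing an ampersand
    bleach.clean(ref)              # -> 'Bereshit Rabbah 1&amp;2', i.e. the ref is mangled

    description = {"title": "<script>alert(1)</script>Intro", "prompt": "Read closely"}
    for field in ("title", "prompt"):
        # only the human-editable fields are realistic XSS vectors
        description[field] = bleach.clean(description[field], tags=[], attributes={})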
--- sefaria/model/topic.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 57a04fd70b..008acbccac 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -835,6 +835,16 @@ def _normalize(self): self.ref = Ref(self.ref).normal() self.expandedRefs = [r.normal() for r in Ref(self.ref).all_segment_refs()] + def _sanitize(self): + for lang in ("en", "he"): + description = getattr(self, "descriptions", {}).get(lang) + if description: + for field in ("title", "prompt"): + value = description.get(field) + if value: + description[field] = bleach.clean(value, tags=self.ALLOWED_TAGS, attributes=self.ALLOWED_ATTRS) + self.descriptions[lang] = description + def _validate(self): Topic.validate_slug_exists(self.toTopic) TopicLinkType.validate_slug_exists(self.linkType, 0) From bfd053dca92e5df4c4d55221cc437bd7d28fcc3b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 20 Jun 2024 13:15:35 +0300 Subject: [PATCH 134/210] docs(topic): add docs for new _sanitize() function --- sefaria/model/topic.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 008acbccac..a5dfe56568 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -836,6 +836,11 @@ def _normalize(self): self.expandedRefs = [r.normal() for r in Ref(self.ref).all_segment_refs()] def _sanitize(self): + """ + Sanitize the "title" and "prompt" for all descriptions. + Since they're human editable they are candidates for XSS. + @return: + """ for lang in ("en", "he"): description = getattr(self, "descriptions", {}).get(lang) if description: From 84599105af51cf1daa8120639df9b7b78d9dee8c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 20 Jun 2024 13:53:36 +0300 Subject: [PATCH 135/210] fix(topic): encode ref in case it has question marks or other problematic characters for a URI --- static/js/Story.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Story.jsx b/static/js/Story.jsx index 12bf3df64c..5814879375 100644 --- a/static/js/Story.jsx +++ b/static/js/Story.jsx @@ -177,7 +177,7 @@ const markReviewedPostRequest = (lang, topic, topicLink) => { 'interface_lang': lang === 'en' ? 
'english' : 'hebrew', 'description' : {...topicLink.descriptions[lang], 'review_state': 'reviewed'} }; - return Sefaria.postToApi(`/api/ref-topic-links/${topicLink.ref}`, {}, postData); + return Sefaria.postToApi(`/api/ref-topic-links/${encodeURIComponent(topicLink.ref)}`, {}, postData); } const useReviewState = (topic, topicLink) => { From fed4e7571096a5e837df128da96d293decb17d3e Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Thu, 20 Jun 2024 14:15:50 +0300 Subject: [PATCH 136/210] fix(sidebar toc): ignore section ref for depth 1 nodes --- static/js/BookPage.jsx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/static/js/BookPage.jsx b/static/js/BookPage.jsx index be769caa79..3d830b608a 100644 --- a/static/js/BookPage.jsx +++ b/static/js/BookPage.jsx @@ -803,8 +803,8 @@ class JaggedArrayNodeSection extends Component { if (this.contentCountIsEmpty(contentCounts[i])) { continue; } let [section, heSection] = Sefaria.getSectionStringByAddressType(this.props.addressTypes[0], i, this.props.offset); let ref = (this.props.refPath + ":" + section).replace(":", " ") + this.refPathTerminal(contentCounts[i]); - let currentPlace = ref == this.props?.currentlyVisibleSectionRef || ref == this.props?.currentlyVisibleRef || Sefaria.refContains(this.props?.currentlyVisibleSectionRef, ref); //the second clause is for depth 1 texts - const linkClasses = classNames({"sectionLink": 1, "current": currentPlace}); + let currentPlace = ref == this.props?.currentlyVisibleSectionRef || ref == this.props?.currentlyVisibleRef || (Sefaria.refContains(this.props?.currentlyVisibleSectionRef, ref) && this.props.depth > 1); //the second clause is for depth 1 texts + const linkClasses = classNames({"sectionLink": 1, "current": currentPlace}); let link = ( From b0fe02c02702686efd069611537e6c430951f270 Mon Sep 17 00:00:00 2001 From: Yosef Kaner Date: Sun, 23 Jun 2024 14:26:21 +0300 Subject: [PATCH 137/210] Update popup.js Fix - title in Hebrew was showing in English. When ctrl+click is triggered open in new tab. --- static/js/linker.v3/popup.js | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/static/js/linker.v3/popup.js b/static/js/linker.v3/popup.js index fe80d935cb..aaeeb01614 100644 --- a/static/js/linker.v3/popup.js +++ b/static/js/linker.v3/popup.js @@ -289,7 +289,7 @@ export class PopupManager { this.linkerHeader.style["border-top-color"] = this.category_colors[primaryCategory]; // TODO is this right? - if (this.contentLang !== "he") { + if (this.contentLang.slice(0, 2) !== "he") { // [].forEach.call(heElems, function(e) {e.style.display = "None"}); this.heTitle.style.display = "None"; [].forEach.call(this.enElems, function(e) {e.style.display = "Block"}); @@ -413,6 +413,9 @@ export class PopupManager { elem.addEventListener('mouseout', this.hidePopup, false); } else if (this.mode === "popup-click") { elem.addEventListener('click', (event) => { + if (event.ctrlKey) { + return; + } event.preventDefault(); event.stopPropagation(); this.showPopup(elem, source); @@ -420,4 +423,4 @@ export class PopupManager { }, false); } } -} \ No newline at end of file +} From eb396e21dfe2cbe3290f9a32091302105a5b4cff Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 24 Jun 2024 11:37:39 +0300 Subject: [PATCH 138/210] fix(admin editor): refactor usages of requestWithCallback() to use Sefaria.adminEditorApiRequest. This fixes some issues with encoding refs properly in the URL because now all logic goes through the same function. 
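Sketching the underlying encoding issue in Python for brevity (the real change is in the JS helpers below): a ref containing URL-significant characters has to be percent-encoded before it is interpolated into the request path.

    from urllib.parse import quote

    tref = "Zohar 1:15a?"    # hypothetical ref ending in a question mark
    bad_url = "/api/ref-topic-links/" + tref                   # '?' starts a query string
    good_url = "/api/ref-topic-links/" + quote(tref, safe="")  # ref is percent-encoded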
--- sefaria/model/trend.py | 2 -- static/js/BookPage.jsx | 5 +++-- static/js/CategoryEditor.jsx | 13 +++++++++---- static/js/Misc.jsx | 26 +------------------------ static/js/SourceEditor.jsx | 21 +++++++++++++------- static/js/Story.jsx | 4 ++-- static/js/TopicEditor.jsx | 8 +++++--- static/js/TopicPage.jsx | 4 ++-- static/js/TopicSearch.jsx | 18 ++++++------------ static/js/sefaria/sefaria.js | 37 ++++++++++++++++++++++++++++++------ 10 files changed, 73 insertions(+), 65 deletions(-) diff --git a/sefaria/model/trend.py b/sefaria/model/trend.py index ed8518287a..c2d9b8306d 100644 --- a/sefaria/model/trend.py +++ b/sefaria/model/trend.py @@ -7,8 +7,6 @@ import time from datetime import datetime, date, timedelta -from py import process - from . import abstract as abst from . import user_profile from . import text diff --git a/static/js/BookPage.jsx b/static/js/BookPage.jsx index be769caa79..3730d19961 100644 --- a/static/js/BookPage.jsx +++ b/static/js/BookPage.jsx @@ -11,7 +11,7 @@ import { AdminToolHeader, CategoryChooser, TitleVariants, - CategoryHeader, requestWithCallBack + CategoryHeader } from './Misc'; import {ContentText} from "./ContentText"; import {validateMarkdownLinks} from "./AdminEditor"; @@ -1293,7 +1293,8 @@ const EditTextInfo = function({initTitle, close}) { const deleteObj = () => { setSavingStatus(true); const url = `/api/v2/index/${enTitle}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = `/texts`}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = '/texts'); } const renderCollectiveTitle = () => { if (!creatingCollectiveTitle) { diff --git a/static/js/CategoryEditor.jsx b/static/js/CategoryEditor.jsx index 61f5b2c80e..f7636786aa 100644 --- a/static/js/CategoryEditor.jsx +++ b/static/js/CategoryEditor.jsx @@ -2,7 +2,7 @@ import {CategoryChooser, InterfaceText, ToggleSet} from "./Misc"; import Sefaria from "./sefaria/sefaria"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; -import {requestWithCallBack, AdminToolHeader} from "./Misc"; +import {AdminToolHeader} from "./Misc"; import React, {useState, useRef} from "react"; const displayOptionForSources = (child) => { @@ -84,7 +84,9 @@ const ReorderEditor = ({close, type="", postURL="", redirect="", origItems = []} else if (type === 'sources') { postCategoryData = {sources: tocItems}; } - requestWithCallBack({url: postURL, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = redirect}) + Sefaria.adminEditorApiRequest(postURL, null, postCategoryData) + .then(() => window.location.href = redirect) + .finally(() => setSavingStatus(false)); } return
@@ -187,7 +189,9 @@ const CategoryEditor = ({origData={}, close, origPath=[]}) => { if (urlParams.length > 0) { url += `?${urlParams.join('&')}`; } - requestWithCallBack({url, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = "/texts/"+fullPath}); + Sefaria.adminEditorApiRequest(url, null, postCategoryData) + .then(() => window.location.href = "/texts/"+fullPath) + .finally(() => setSavingStatus(false)); } @@ -197,7 +201,8 @@ const CategoryEditor = ({origData={}, close, origPath=[]}) => { return; } const url = `/api/category/${origPath.concat(origData.origEn).join("/")}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = `/texts`}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = `/texts`); } const primaryOptions = [ {name: "true", content: Sefaria._("True"), role: "radio", ariaLabel: Sefaria._("Set Primary Status to True") }, diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index f5645ee130..e0b43c77a0 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1045,29 +1045,6 @@ class ToggleOption extends Component { } } - //style={this.props.style} - -const requestWithCallBack = ({url, setSavingStatus, redirect, type="POST", data={}, redirect_params}) => { - let ajaxPayload = {url, type}; - if (type === "POST") { - ajaxPayload.data = {json: JSON.stringify(data)}; - } - $.ajax({ - ...ajaxPayload, - success: function(result) { - if ("error" in result) { - if (setSavingStatus) { - setSavingStatus(false); - } - alert(result.error); - } else { - redirect(); - } - } - }).fail(function() { - alert(Sefaria._("Something went wrong. Sorry!")); - }); -} const TopicToCategorySlug = function(topic, category=null) { //helper function for AdminEditor @@ -1676,7 +1653,7 @@ const TopicPictureUploader = ({slug, callback, old_filename, caption}) => { const deleteImage = () => { const old_filename_wout_url = old_filename.split("/").slice(-1); const url = `${Sefaria.apiHost}/api/topics/images/${slug}?old_filename=${old_filename_wout_url}`; - requestWithCallBack({url, type: "DELETE", redirect: () => alert("Deleted image.")}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE").then(() => alert("Deleted image.")); callback(""); fileInput.current.value = ""; } @@ -3385,7 +3362,6 @@ export { AdminToolHeader, CategoryChooser, TitleVariants, - requestWithCallBack, OnInView, TopicPictureUploader, ImageWithCaption diff --git a/static/js/SourceEditor.jsx b/static/js/SourceEditor.jsx index e95cef8588..c4be41720d 100644 --- a/static/js/SourceEditor.jsx +++ b/static/js/SourceEditor.jsx @@ -1,7 +1,7 @@ import Sefaria from "./sefaria/sefaria"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; -import {requestWithCallBack, Autocompleter, InterfaceText} from "./Misc"; +import {Autocompleter, InterfaceText} from "./Misc"; import React, {useState} from "react"; import {useRef} from "react"; @@ -46,10 +46,16 @@ const SourceEditor = ({topic, close, origData={}}) => { const save = async function () { setSavingStatus(true); let refInUrl = isNew ? 
displayRef : origData.ref; - let url = `/api/ref-topic-links/${Sefaria.normRef(refInUrl)}`; - let postData = {"topic": topic, "is_new": isNew, 'new_ref': displayRef, 'interface_lang': Sefaria.interfaceLang}; - postData['description'] = {"title": data.enTitle, "prompt": data.prompt, "ai_context": data.ai_context, "review_state": "edited"}; - requestWithCallBack({url, data: postData, setSavingStatus, redirect: () => window.location.href = "/topics/"+topic}); + const payload = { + new_ref: displayRef, + topic, + is_new: isNew, + interface_lang: Sefaria.interfaceLang, + description: {"title": data.enTitle, "prompt": data.prompt, "ai_context": data.ai_context, "review_state": "edited"}, + } + Sefaria.postRefTopicLink(refInUrl, payload) + .then(() => window.location.href = `/topics/${topic}`) + .finally(() => setSavingStatus(false)); } const handleChange = (x) => { @@ -86,8 +92,9 @@ const SourceEditor = ({topic, close, origData={}}) => { } const deleteTopicSource = function() { - const url = `/api/ref-topic-links/${origData.ref}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = `/topics/${topic}`}); + const url = `/api/ref-topic-links/${encodeURIComponent(origData.ref)}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = `/topics/${topic}`); } const previousTitleItemRef = useRef(data.enTitle ? "Previous Title" : null); //use useRef to make value null even if component re-renders const previousPromptItemRef = useRef(data.prompt ? "Previous Prompt" : null); diff --git a/static/js/Story.jsx b/static/js/Story.jsx index 5814879375..8aaacc0d89 100644 --- a/static/js/Story.jsx +++ b/static/js/Story.jsx @@ -170,14 +170,14 @@ const ReviewStateIndicatorLang = ({reviewState, markReviewed}) => { } const markReviewedPostRequest = (lang, topic, topicLink) => { - const postData = { + const payload = { "topic": topic, "is_new": false, 'new_ref': topicLink.ref, 'interface_lang': lang === 'en' ? 
'english' : 'hebrew', 'description' : {...topicLink.descriptions[lang], 'review_state': 'reviewed'} }; - return Sefaria.postToApi(`/api/ref-topic-links/${encodeURIComponent(topicLink.ref)}`, {}, postData); + return Sefaria.postRefTopicLink(topicLink.ref, payload); } const useReviewState = (topic, topicLink) => { diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index 88fc3f1092..2d279ebb76 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -1,5 +1,5 @@ import Sefaria from "./sefaria/sefaria"; -import {InterfaceText, requestWithCallBack, TopicPictureUploader} from "./Misc"; +import {InterfaceText, TopicPictureUploader} from "./Misc"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; import {Reorder} from "./CategoryEditor"; @@ -109,7 +109,9 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const saveReorderedSubtopics = function () { const url = `/api/topic/reorder`; const postCategoryData = {topics: sortedSubtopics}; - requestWithCallBack({url, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = "/topics"}); + Sefaria.adminEditorApiRequest(url, null, postCategoryData) + .then(() => window.location.href = "/topics") + .finally(() => setSavingStatus(false)); } const prepData = () => { @@ -189,7 +191,7 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const deleteObj = function() { const url = `/api/topic/delete/${data.origSlug}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = "/topics"}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE").then(() => window.location.href = "/topics"); } let items = ["Title", "Hebrew Title", "English Description", "Hebrew Description", "Category Menu", "English Alternate Titles", "Hebrew Alternate Titles",]; if (isCategory) { diff --git a/static/js/TopicPage.jsx b/static/js/TopicPage.jsx index 44c25fc2b0..0d25a0dac8 100644 --- a/static/js/TopicPage.jsx +++ b/static/js/TopicPage.jsx @@ -344,7 +344,7 @@ const generatePrompts = async(topicSlug, linksToGenerate) => { }); const payload = {ref_topic_links: linksToGenerate}; try { - await Sefaria.postToApi(`/api/topics/generate-prompts/${topicSlug}`, {}, payload); + await Sefaria.apiRequestWithBody(`/api/topics/generate-prompts/${topicSlug}`, {}, payload); const refValues = linksToGenerate.map(item => item.ref).join(", "); alert("The following prompts are generating: " + refValues); } catch (error) { @@ -359,7 +359,7 @@ const publishPrompts = async (topicSlug, linksToPublish) => { ref.descriptions[lang]["published"] = true; }); try { - const response = await Sefaria.postToApi(`/api/ref-topic-links/bulk`, {}, linksToPublish); + const response = await Sefaria.apiRequestWithBody(`/api/ref-topic-links/bulk`, {}, linksToPublish); const refValues = response.map(item => item.anchorRef).join(", "); const shouldRefresh = confirm("The following prompts have been published: " + refValues + ". 
Refresh page to see results?"); if (shouldRefresh) { diff --git a/static/js/TopicSearch.jsx b/static/js/TopicSearch.jsx index c736cc72cf..60e098b1cd 100644 --- a/static/js/TopicSearch.jsx +++ b/static/js/TopicSearch.jsx @@ -68,27 +68,21 @@ class TopicSearch extends Component { const srefs = this.props.srefs; const update = this.props.update; const reset = this.reset; - $.post("/api/ref-topic-links/" + Sefaria.normRef(this.props.srefs), {"json": postJSON}, async function (data) { - if (data.error) { - alert(data.error); - } else { + Sefaria.postRefTopicLink(Sefaria.normRef(this.props.srefs), postJSON).then(async () => { const sectionRef = await Sefaria.getRef(Sefaria.normRef(srefs)).sectionRef; srefs.map(sref => { - if (!Sefaria._refTopicLinks[sref]) { - Sefaria._refTopicLinks[sref] = []; - } - Sefaria._refTopicLinks[sref].push(data); + if (!Sefaria._refTopicLinks[sref]) { + Sefaria._refTopicLinks[sref] = []; + } + Sefaria._refTopicLinks[sref].push(data); }); if (!Sefaria._refTopicLinks[sectionRef]) { - Sefaria._refTopicLinks[sectionRef] = []; + Sefaria._refTopicLinks[sectionRef] = []; } Sefaria._refTopicLinks[sectionRef].push(data); update(); reset(); alert("Topic added."); - } - }).fail(function (xhr, status, errorThrown) { - alert("Unfortunately, there may have been an error saving this topic information: " + errorThrown); }); } diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index ecffc50cd5..611ad3a685 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -623,27 +623,52 @@ Sefaria = extend(Sefaria, { firstName: firstName, lastName: lastName, }; - return await Sefaria.postToApi(`/api/subscribe/${email}`, null, payload); + return await Sefaria.apiRequestWithBody(`/api/subscribe/${email}`, null, payload); }, subscribeSteinsaltzNewsletter: async function(firstName, lastName, email) { const payload = {firstName, lastName}; - return await Sefaria.postToApi(`/api/subscribe/steinsaltz/${email}`, null, payload); + return await Sefaria.apiRequestWithBody(`/api/subscribe/steinsaltz/${email}`, null, payload); }, - - postToApi: async function(url, urlParams, payload) { + postRefTopicLink: function(refInUrl, payload) { + const url = `/api/ref-topic-links/${encodeURIComponent(Sefaria.normRef(refInUrl))}`; + // payload will need to be refactored once /api/ref-topic-links takes a more standard input + return Sefaria.adminEditorApiRequest(url, null, payload); + }, + adminEditorApiRequest: async function(url, urlParams, payload, method="POST") { + /** + * Wraps apiRequestWithBody() with basic alerting if response has an error + */ + let result; + try { + result = await Sefaria.apiRequestWithBody(url, urlParams, payload, method); + } catch (e) { + alert(Sefaria._("Something went wrong. Sorry!")); + throw e; + } + if (result.error) { + alert(result.error); + throw result.error; + } else { + return result; + } + }, + apiRequestWithBody: async function(url, urlParams, payload, method="POST") { + /** + * Generic function for performing an API request with a payload. Payload and urlParams are optional and will not be used if falsy. + */ let apiUrl = this.apiHost + url; if (urlParams) { apiUrl += '?' 
+ new URLSearchParams(urlParams).toString(); } const response = await fetch(apiUrl, { - method: "POST", + method, mode: 'same-origin', headers: { 'X-CSRFToken': Cookies.get('csrftoken'), 'Content-Type': 'application/json' }, credentials: 'same-origin', - body: JSON.stringify(payload) + body: payload && JSON.stringify(payload) }); if (!response.ok) { From c4b08ce0f70828cf5da95bfbc676eee95df9d6d1 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Mon, 24 Jun 2024 11:48:35 +0300 Subject: [PATCH 139/210] fix(admin editor): only use Sefaria.normRef() which also does encodeURIComponent() --- static/js/SourceEditor.jsx | 2 +- static/js/sefaria/sefaria.js | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/static/js/SourceEditor.jsx b/static/js/SourceEditor.jsx index c4be41720d..1755c74efe 100644 --- a/static/js/SourceEditor.jsx +++ b/static/js/SourceEditor.jsx @@ -92,7 +92,7 @@ const SourceEditor = ({topic, close, origData={}}) => { } const deleteTopicSource = function() { - const url = `/api/ref-topic-links/${encodeURIComponent(origData.ref)}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; + const url = `/api/ref-topic-links/${Sefaria.normRef(origData.ref)}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; Sefaria.adminEditorApiRequest(url, null, null, "DELETE") .then(() => window.location.href = `/topics/${topic}`); } diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 611ad3a685..231e3e9e37 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -630,7 +630,7 @@ Sefaria = extend(Sefaria, { return await Sefaria.apiRequestWithBody(`/api/subscribe/steinsaltz/${email}`, null, payload); }, postRefTopicLink: function(refInUrl, payload) { - const url = `/api/ref-topic-links/${encodeURIComponent(Sefaria.normRef(refInUrl))}`; + const url = `/api/ref-topic-links/${Sefaria.normRef(refInUrl)}`; // payload will need to be refactored once /api/ref-topic-links takes a more standard input return Sefaria.adminEditorApiRequest(url, null, payload); }, From b2f6f4e6174af89f76924c2245ff013058aaf76c Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 24 Jun 2024 16:19:00 +0300 Subject: [PATCH 140/210] refactor(search bar): make getInputValue use getVirtualKeyboardInputValue --- static/js/Autocomplete.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index d029830495..75ef1836bd 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -179,7 +179,7 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre submitSearch, redirectToObject}) => { const getInputValue = () =>{ - return otherDownShiftProps.value || document.getElementsByClassName('keyboardInput')[0].value; + return otherDownShiftProps.value || getVirtualKeyboardInputValue(); } const getVirtualKeyboardInputValue = () =>{ return document.getElementsByClassName('keyboardInput')[0].value; From 0502f39ea0932b3e42d6f79c336ed3b040db3959 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 25 Jun 2024 10:30:29 +0300 Subject: [PATCH 141/210] refactor(search bar): safer query to get the input search element --- static/js/Autocomplete.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index 75ef1836bd..06cfac82d2 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -182,7 +182,7 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre return 
otherDownShiftProps.value || getVirtualKeyboardInputValue(); } const getVirtualKeyboardInputValue = () =>{ - return document.getElementsByClassName('keyboardInput')[0].value; + return document.querySelector('#searchBox .keyboardInput').value; } useEffect(() => { showVirtualKeyboardIcon(false); // Initially hide the virtual keyboard icon From 4a8373b9595d14112c8e775f818c93f10900f78b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Tue, 25 Jun 2024 15:52:10 +0300 Subject: [PATCH 142/210] fix(linker): dont fallback on re now that re2 is required --- sefaria/model/linker/named_entity_resolver.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py index 085121e557..4e40b128cd 100644 --- a/sefaria/model/linker/named_entity_resolver.py +++ b/sefaria/model/linker/named_entity_resolver.py @@ -1,10 +1,6 @@ import dataclasses from typing import List, Dict, Type, Set -try: - import re2 as re - re.set_fallback_notification(re.FALLBACK_WARNING) -except ImportError: - import re +import re2 as re from functools import reduce from collections import defaultdict from sefaria.model.linker.ref_part import RawNamedEntity From 08bbf825b479f9f4d817701ae86d15481220e178 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 26 Jun 2024 09:57:30 +0300 Subject: [PATCH 143/210] fix(linker): revert linker API so it still uses v2 results for English. English v3 results are still not on par with v2. --- sefaria/helper/linker.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index fbd9801b9a..0d0d57d57f 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -101,7 +101,10 @@ def _add_webpage_hit_for_url(url): @django_cache(cache_type="persistent") def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _FindRefsTextOptions, meta_data: dict) -> dict: - response = _make_find_refs_response_linker_v3(request_text, options) + if request_text.lang == 'he': + response = _make_find_refs_response_linker_v3(request_text, options) + else: + response = _make_find_refs_response_linker_v2(request_text, options) if meta_data: _, webpage = WebPage.add_or_update_from_linker({ From c045f345c992c54156c2af449682bffcbe2d078c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Wed, 26 Jun 2024 15:34:55 +0300 Subject: [PATCH 144/210] fix(linker): fix issue with deletion so that it also deletes the description and relies on curatedPrimacy to determine if link has been deleted from all langs. --- sefaria/helper/topic.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 268c3328ad..ae3337d0c7 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1320,12 +1320,13 @@ def delete_ref_topic_link(tref, to_topic, link_type, lang): if link is None: return {"error": f"A learning-team link between {tref} and {to_topic} doesn't exist. If you are trying to delete a non-learning-team link, reach out to the engineering team."} - if lang in link.order.get('availableLangs', []): - link.order['availableLangs'].remove(lang) if lang in link.order.get('curatedPrimacy', []): link.order['curatedPrimacy'].pop(lang) + if lang in getattr(link, 'descriptions', {}): + link.descriptions.pop(lang) - if len(link.order.get('availableLangs', [])) > 0: + # Note, using curatedPrimacy as a proxy here since we are currently only allowing deletion of learning-team links. 
+ if len(link.order.get('curatedPrimacy', [])) > 0: link.save() return {"status": "ok"} else: # deleted in both hebrew and english so delete link object From a3bbc58a3d89e8c4558fb52bf54de0fd9a1eb80c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 30 Jun 2024 12:01:39 +0300 Subject: [PATCH 145/210] fix(topics): load post data using more standard `request.body` --- reader/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/reader/views.py b/reader/views.py index e7ed608273..ef8682dd38 100644 --- a/reader/views.py +++ b/reader/views.py @@ -3270,7 +3270,7 @@ def topic_ref_api(request, tref): @staff_member_required def reorder_sources(request): - sources = json.loads(request.POST["json"]).get("sources", []) + sources = json.loads(request.body).get("sources", []) slug = request.GET.get('topic') lang = 'en' if request.GET.get('lang') == 'english' else 'he' return jsonResponse(update_order_of_topic_sources(slug, sources, request.user.id, lang=lang)) From 18114f78e2fdbb85eff45e1ab8f720a9432c2115 Mon Sep 17 00:00:00 2001 From: saengel Date: Tue, 2 Jul 2024 13:30:24 +0300 Subject: [PATCH 146/210] chore(PR Templates): Create template for new PRs --- .github/pull_request_template.md | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 .github/pull_request_template.md diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..4701933c5f --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ +## Description +_A brief description of the PR_ + +## Code Changes +_The following changes were made to the files below_ + +## Notes +_Any additional notes go here_ \ No newline at end of file From b34cd3cff0821c2f04f02b154b60832adaa7556f Mon Sep 17 00:00:00 2001 From: saengel Date: Thu, 4 Jul 2024 11:42:05 +0300 Subject: [PATCH 147/210] fix(api): Remove space when stripping HTML tags in texts API --- sefaria/model/text.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 18a1e1a28f..8025be8d14 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1174,11 +1174,11 @@ def remove_html(t): if isinstance(t, list): for i, v in enumerate(t): if isinstance(v, str): - t[i] = re.sub('<[^>]+>', " ", v) + t[i] = re.sub('<[^>]+>', "", v) else: t[i] = AbstractTextRecord.remove_html(v) elif isinstance(t, str): - t = re.sub('<[^>]+>', " ", t) + t = re.sub('<[^>]+>', "", t) else: return False return t From d03a96159c62d9861c97b16bd032aa4caae40ca8 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 4 Jul 2024 14:42:43 +0300 Subject: [PATCH 148/210] fix(linker): fix start of span to split --- sefaria/model/linker/ref_part.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index b628310cdd..19d10e8dfc 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -443,7 +443,7 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R """ start_char, end_char = span_char_inds(part.span) pivot = len(part.text) - len(str_end) + start_char - aspan = part.span.doc.char_span(0, pivot, alignment_mode='contract') + aspan = part.span.doc.char_span(start_char, pivot, alignment_mode='contract') bspan = part.span.doc.char_span(pivot, end_char, alignment_mode='contract') if aspan is None or bspan is None: raise InputError(f"Couldn't break on token boundaries for strings '{self.text[0:pivot]}' and 
'{self.text[pivot:end_char]}'") From 3de6c4aaf0131ec6b10a7220434793f0f4f38c9c Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 4 Jul 2024 15:02:22 +0300 Subject: [PATCH 149/210] fix(linker): loosen requirement that trie lookups for book names needs to match NAMED parts. Now it can match any type but can only split NAMED parts. --- sefaria/model/linker/ref_resolver.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index 71869ec842..1ae7f54637 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -427,24 +427,28 @@ def _apply_context_swaps(self, raw_ref: RawRef, context_swap_map: Dict[str, str] raw_ref.parts_to_match = swapped_ref_parts def _get_unrefined_ref_part_matches_recursive(self, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]: + """ + We are now considering all types for trie lookups (not just NAMED) since there seem to be no cases of false positives when we consider all part types + In addition, sometimes the raw ref part type model misclassifies a part type and relaxing the type requirement here allows it to recover. + The exception is we only will split NAMED parts since this causes some odd parts to split. e.g. משנה א can be considered part of the title of book when א is removed + """ title_trie = title_trie or self.get_ref_part_title_trie() prev_ref_parts = prev_ref_parts or [] matches = [] for part in ref_parts: temp_raw_ref = raw_ref - # no need to consider other types at root level - if part.type != RefPartType.NAMED: continue - temp_title_trie, partial_key_end = title_trie.get_continuations(part.key(), allow_partial=True) if temp_title_trie is None: continue if partial_key_end is None: matched_part = part - else: + elif part.type == RefPartType.NAMED: try: temp_raw_ref, apart, bpart = raw_ref.split_part(part, partial_key_end) matched_part = apart except InputError: matched_part = part # fallback on original part + else: + continue temp_prev_ref_parts = prev_ref_parts + [matched_part] if LEAF_TRIE_ENTRY in temp_title_trie: for node in temp_title_trie[LEAF_TRIE_ENTRY]: From ab6f4b541f161a6c0db2c31cce717bc119d6a86b Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 4 Jul 2024 15:02:48 +0300 Subject: [PATCH 150/210] test(linker): add test for mis-classified NAMED part --- sefaria/model/linker/tests/linker_test.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 863631d274..165593cbb9 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -105,6 +105,9 @@ def test_resolved_raw_ref_clone(): # Base text context [crrd(['@ובתוס\'', '#דכ"ז ע"ב', '*ד"ה והלכתא'], "Rashi on Berakhot 2a"), ("Tosafot on Berakhot 27b:14:2",)], # shared context child via graph context + # Mis-classified part types + [crrd(['@ושו"ע', "#אה״ע", "#סי׳ כ״ח", "#סעיף א"]), ("Shulchan Arukh, Even HaEzer 28:1",)], + # Ibid [crrd(['&שם', '#ז'], prev_trefs=["Genesis 1"]), ["Genesis 7", "Genesis 1:7"]], # ambiguous ibid [crrd(['&Ibid', '#12'], prev_trefs=["Exodus 1:7"], lang='en'), ["Exodus 1:12", "Exodus 12"]], # ambiguous ibid when context is segment level (not clear if this is really ambiguous. 
maybe should only have segment level result) From 676a1fbe5e7c3e4686f118789dfb966cb31cc12d Mon Sep 17 00:00:00 2001 From: saengel Date: Mon, 8 Jul 2024 11:58:54 +0300 Subject: [PATCH 151/210] test(api): More robust handling of spaces, tests for validation --- sefaria/model/tests/text_test.py | 13 +++++++++++++ sefaria/model/text.py | 8 ++++++++ 2 files changed, 21 insertions(+) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 9efa426a9a..5e80c5d13c 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -811,3 +811,16 @@ def test_normalize(self): version = getattr(self, version_key) for attr in expected_attrs[version_key]: assert getattr(version, attr) == expected_attrs[version_key][attr] + + +def test_remove_html(): + pasuk_with_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" + pasuk_without_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" + + pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
<br>or taken the path of sinners,<br>
or joined the company of the insolent;" + pasuk_without_br = "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;" + + assert model.TextChunk.remove_html(pasuk_with_html) == pasuk_without_html + assert model.TextChunk.remove_html(pasuk_with_br) == pasuk_without_br + + diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 8025be8d14..174542b6df 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1174,10 +1174,18 @@ def remove_html(t): if isinstance(t, list): for i, v in enumerate(t): if isinstance(v, str): + tags = re.findall('<[^>]+>', t[i]) + for tag in tags: + if tag == "
": + t[i] = re.sub("
", " ", v) t[i] = re.sub('<[^>]+>', "", v) else: t[i] = AbstractTextRecord.remove_html(v) elif isinstance(t, str): + tags = re.findall('<[^>]+>', t) + for tag in tags: + if tag == "
": + t = re.sub("
", " ", t) t = re.sub('<[^>]+>', "", t) else: return False From 571208520fabbcccbcd3d56a66f2e9e6fe43daa2 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 9 Jul 2024 16:52:57 +0300 Subject: [PATCH 152/210] fix(bulk text truncation): first pass at making sure truncated strings are not too long --- sefaria/utils/util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/utils/util.py b/sefaria/utils/util.py index 28cd48ada1..dd12be8e33 100644 --- a/sefaria/utils/util.py +++ b/sefaria/utils/util.py @@ -472,7 +472,7 @@ def truncate_string(string, min_length, max_length): while min_length <= pos: while pos in html_element_indices: pos = html_element_indices[pos] - 1 - if string[pos] == break_char: + if string[pos] == break_char and pos <= max_length: return string[:pos] + "…" pos -= 1 return string From da70c51891edd203d67bafe257550607a1de4ab1 Mon Sep 17 00:00:00 2001 From: saengel Date: Wed, 10 Jul 2024 13:23:18 +0300 Subject: [PATCH 153/210] fix(api): reduce code redundancies, update tests --- sefaria/model/tests/text_test.py | 2 +- sefaria/model/text.py | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 5e80c5d13c..ddbab992e5 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -817,7 +817,7 @@ def test_remove_html(): pasuk_with_html = "
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" pasuk_without_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" - pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;" + pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;" pasuk_without_br = "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;" assert model.TextChunk.remove_html(pasuk_with_html) == pasuk_without_html diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 174542b6df..2466d90b94 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1171,22 +1171,21 @@ def sanitize_text(cls, t): @staticmethod def remove_html(t): + + def conditional_replace(match): + tag = match.group() + if tag == "
": + return " " + return "" + if isinstance(t, list): for i, v in enumerate(t): if isinstance(v, str): - tags = re.findall('<[^>]+>', t[i]) - for tag in tags: - if tag == "
": - t[i] = re.sub("
", " ", v) - t[i] = re.sub('<[^>]+>', "", v) + t[i] = re.sub('<[^>]+>', conditional_replace, v) else: t[i] = AbstractTextRecord.remove_html(v) elif isinstance(t, str): - tags = re.findall('<[^>]+>', t) - for tag in tags: - if tag == "
": - t = re.sub("
", " ", t) - t = re.sub('<[^>]+>', "", t) + t = re.sub('<[^>]+>', conditional_replace, t) else: return False return t From 9f098c2eea80d806bb6d4a160326b0facd60b0d4 Mon Sep 17 00:00:00 2001 From: saengel Date: Wed, 10 Jul 2024 14:45:01 +0300 Subject: [PATCH 154/210] chore(api): Add handling of
, adjust tests --- sefaria/model/tests/text_test.py | 2 +- sefaria/model/text.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index ddbab992e5..13313e6e37 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -817,7 +817,7 @@ def test_remove_html(): pasuk_with_html = "
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" pasuk_without_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" - pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;" + pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;" pasuk_without_br = "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;" assert model.TextChunk.remove_html(pasuk_with_html) == pasuk_without_html diff --git a/sefaria/model/text.py b/sefaria/model/text.py index 2466d90b94..1255a7a6c3 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -1174,7 +1174,7 @@ def remove_html(t): def conditional_replace(match): tag = match.group() - if tag == "
": + if tag in ["
", "
"]: return " " return "" From 82c5fe4f9088b831f3adad1659f4233c75cb5c54 Mon Sep 17 00:00:00 2001 From: Skyler C Date: Wed, 10 Jul 2024 17:24:19 -0400 Subject: [PATCH 155/210] chore(about page): Update English copy --- templates/static/en/about.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/static/en/about.html b/templates/static/en/about.html index 6aa11f3d12..f8ec4373fd 100644 --- a/templates/static/en/about.html +++ b/templates/static/en/about.html @@ -275,10 +275,10 @@

2023

- Against the somber backdrop of the October 7 attacks on Israel, Jews worldwide turn to Sefaria and October user traffic rises to 899,295, demonstrating the power and enduring need for Torah among Jews everywhere. + Along with the rest of the Jewish world, the Sefaria team mourned the horrific attacks carried out on October 7th. In the difficult months that followed, Sefaria worked to support our colleagues in Israel as well as to continue expanding access to Torah as a source of comfort and strength for the Jewish people.
- Sefaria’s R&D arm, Sefaria Ventures, launches a groundbreaking partnership with AppliedAI and the Technical University of Munich (TUM) to explore the possibilities of leveraging AI to significantly expand access to Torah. + Sefaria’s R&D arm, Sefaria Ventures, launches a groundbreaking partnership with AppliedAI and the Technical University of Munich (TUM) to explore the possibilities of leveraging AI to significantly expand access to Torah.
Sefaria partners with the Steinsaltz Center and Aleph Society to launch a digital collection of Rabbi Adin Steinsaltz’s complete works of commentary, making the renowned Rabbi’s writings available to all who wish to learn. From df83f632b232ddaa021b783161decdf2aa589c0a Mon Sep 17 00:00:00 2001 From: Skyler C Date: Wed, 10 Jul 2024 17:28:00 -0400 Subject: [PATCH 156/210] chore(about page): Update Hebrew copy --- templates/static/he/about.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/static/he/about.html b/templates/static/he/about.html index e9137c626f..b3a7ee5736 100644 --- a/templates/static/he/about.html +++ b/templates/static/he/about.html @@ -282,7 +282,7 @@

2023

- בשבועות הקשות שלאחר השבת השחורה של ה-7 באוקטובר, יהודים בכל העולם פונים לספריא. כמות המשתמשים באתר וביישומון עולה ל-889,295, מספר המעיד על כוחם של מקורות היהדות לחזק ולשמש עוגן ליהודים בכל רחבי תבל. + יחד עם כל העולם היהודי, צוות ספריא התאבל על הטבח הנורא של ה-7 באוקטובר. בחודשים הקשים שלאחר מכן פעלנו כדי לתמוך בעמיתינו בישראל וכדי להמשיך את הגדלת הספריה והנגישות למקורות היהדות שמהווים עבור רבים בעם היהודי מקור לנחמה, כוח ותקווה.
מחלקת המחקר והפיתוח של ספריא משיקה שותפות פורצת דרך עם AppliedAI והאוניברסיטה הטכנית של מינכן (TUM). מטרת השותפות היא בחינת האפשרויות הטמונות במינוף של אינטליגנציה מלאכותית ככלי להרחבה משמעותית של גישה ציבורית לתורה. From 0dd4b880be3d8cbf56df3108a9b22b9b8a0e7c10 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Thu, 11 Jul 2024 11:17:56 +0300 Subject: [PATCH 157/210] chore(bulk text truncation): added test that would've failed before the fix --- sefaria/utils/tests/util_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/sefaria/utils/tests/util_test.py b/sefaria/utils/tests/util_test.py index 2c3e73f4e9..70992adab0 100644 --- a/sefaria/utils/tests/util_test.py +++ b/sefaria/utils/tests/util_test.py @@ -59,3 +59,10 @@ def test_string_length_equals_max(self): max_length = 24 expected_output = "string with length of 24" assert truncate_string(string, min_length, max_length) == expected_output + + def test_long_string_with_html_closing_tag_after_max_length(self): + string = 'This is a long string aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa a' + min_length = 10 + max_length = 22 + expected_output = "This is a long string…" + assert truncate_string(string, min_length, max_length) == expected_output From 62b293ec9952acd833812ed23eb3134e87b8b061 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Thu, 11 Jul 2024 13:21:35 +0300 Subject: [PATCH 158/210] fix(topics): Fix add topics from sidebar feature. This feature broke due to a recent refactor. JSON stringifying now happens lower down in the call stack. --- static/js/TopicSearch.jsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/static/js/TopicSearch.jsx b/static/js/TopicSearch.jsx index 60e098b1cd..6f4578805d 100644 --- a/static/js/TopicSearch.jsx +++ b/static/js/TopicSearch.jsx @@ -64,7 +64,7 @@ class TopicSearch extends Component { } post(slug) { - const postJSON = JSON.stringify({"topic": slug, 'interface_lang': Sefaria.interfaceLang}); + const postJSON = {"topic": slug, 'interface_lang': Sefaria.interfaceLang}; const srefs = this.props.srefs; const update = this.props.update; const reset = this.reset; From 373041578e2470cc5f0d14f5556d8994bed96168 Mon Sep 17 00:00:00 2001 From: nsantacruz Date: Sun, 14 Jul 2024 12:13:38 +0300 Subject: [PATCH 159/210] test(library): fix failing test due to title change of Ibn Ezra. 
Also, refactor test so data is pulled out --- sefaria/model/tests/text_test.py | 92 +++++++++++++++++--------------- 1 file changed, 49 insertions(+), 43 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 9efa426a9a..4fc489f0f2 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -327,15 +327,6 @@ def test_merge(): def test_text_helpers(): - res = model.library.get_dependant_indices() - assert 'Rashbam on Genesis' in res - assert 'Rashi on Bava Batra' in res - assert 'Bartenura on Mishnah Oholot' in res - assert 'Onkelos Leviticus' in res - assert 'Chizkuni' in res - assert 'Akeidat Yitzchak' not in res - assert 'Berakhot' not in res - res = model.library.get_indices_by_collective_title("Rashi") assert 'Rashi on Bava Batra' in res assert 'Rashi on Genesis' in res @@ -346,46 +337,61 @@ def test_text_helpers(): assert 'Bartenura on Mishnah Oholot' in res assert 'Rashbam on Genesis' not in res - res = model.library.get_dependant_indices(book_title="Exodus") - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - assert 'Meshekh Chokhmah' in res - assert 'Abarbanel on Torah' in res - assert 'Targum Jonathan on Exodus' in res - assert 'Onkelos Exodus' in res - assert 'Harchev Davar on Exodus' in res - - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - - res = model.library.get_dependant_indices(book_title="Exodus", dependence_type='Commentary') - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - assert 'Meshekh Chokhmah' in res - assert 'Abarbanel on Torah' in res - assert 'Harchev Davar on Exodus' in res - - assert 'Targum Jonathan on Exodus' not in res - assert 'Onkelos Exodus' not in res - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - - res = model.library.get_dependant_indices(book_title="Exodus", dependence_type='Commentary', structure_match=True) - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - - assert 'Harchev Davar on Exodus' not in res - assert 'Meshekh Chokhmah' not in res - assert 'Abarbanel on Torah' not in res - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - cats = model.library.get_text_categories() assert 'Tanakh' in cats assert 'Torah' in cats assert 'Prophets' in cats assert 'Commentary' in cats +@pytest.mark.parametrize(('book_title', 'dependence_type', 'structure_match', 'expected_titles', 'not_expected_titles'), [ + [None, None, False, [ + 'Rashbam on Genesis', + 'Rashi on Bava Batra', + 'Bartenura on Mishnah Oholot', + 'Onkelos Leviticus', + 'Chizkuni', + ], [ + 'Akeidat Yitzchak', + 'Berakhot'] + ], + ['Exodus', None, False, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus', + 'Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Harchev Davar on Exodus' + ], ['Exodus', + 'Rashi on Genesis'] + ], + ['Exodus', 'Commentary', False, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus', + 'Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Harchev Davar on Exodus' + ], ['Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Exodus', + 'Rashi on Genesis'] + ], + ['Exodus', 'Commentary', True, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus' + ], ['Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Harchev Davar on Exodus', + 'Exodus', + 'Rashi on Genesis'] + ], +]) +def test_get_dependent_indices(book_title, dependence_type, structure_match, expected_titles, 
not_expected_titles): + res = model.library.get_dependant_indices(book_title=book_title, dependence_type=dependence_type, structure_match=structure_match) + for title in expected_titles: + assert title in res + for title in not_expected_titles: + assert title not in res + def test_index_update(): ''' From 01e370b7e531b1342c2a4a1e187465fc82bd9b65 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Sun, 14 Jul 2024 16:20:54 +0300 Subject: [PATCH 160/210] fix(topic model): change generation of titles for ambiguous topics to include [] not () --- sefaria/model/topic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index a5dfe56568..b965dbcaf5 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -380,7 +380,7 @@ def get_primary_title(self, lang='en', with_disambiguation=True): if disambig_text: title += f' ({disambig_text})' elif getattr(self, 'isAmbiguous', False) and len(title) > 0: - title += ' (Ambiguous)' + title += ' [Ambiguous]' return title def get_titles(self, lang=None, with_disambiguation=True): From 5f9a9adef34f50bf049876dbf5e5166a6562601d Mon Sep 17 00:00:00 2001 From: saengel Date: Mon, 15 Jul 2024 10:48:50 +0300 Subject: [PATCH 161/210] chore(api): add pytest parameterize to tests --- sefaria/model/tests/text_test.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 13313e6e37..9d44717c4e 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -812,15 +812,16 @@ def test_normalize(self): for attr in expected_attrs[version_key]: assert getattr(version, attr) == expected_attrs[version_key][attr] - -def test_remove_html(): - pasuk_with_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" - pasuk_without_html = "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ" - - pasuk_with_br = "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;" - pasuk_without_br = "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;" - - assert model.TextChunk.remove_html(pasuk_with_html) == pasuk_without_html - assert model.TextChunk.remove_html(pasuk_with_br) == pasuk_without_br +@pytest.mark.parametrize(('text_with_html', 'text_without_html'), + [ + ["
בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ", + "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ"], + [ + "Happy is the man who has not followed the counsel of the wicked,
or taken the path of sinners,
or joined the company of the insolent;", + "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;"] + ]) + +def test_remove_html(text_with_html, text_without_html): + assert model.TextChunk.remove_html(text_with_html) == text_without_html From 910289e12aebe9efdbd4a2e1653c2758114d84b7 Mon Sep 17 00:00:00 2001 From: YishaiGlasner Date: Mon, 15 Jul 2024 11:05:46 +0300 Subject: [PATCH 162/210] fix: only staff member can add index or edit text info. --- reader/views.py | 1 + 1 file changed, 1 insertion(+) diff --git a/reader/views.py b/reader/views.py index ef8682dd38..8c3bf05091 100644 --- a/reader/views.py +++ b/reader/views.py @@ -1242,6 +1242,7 @@ def edit_text(request, ref=None, lang=None, version=None): }) @ensure_csrf_cookie +@staff_member_required @sanitize_get_params def edit_text_info(request, title=None, new_title=None): """ From d3ca9d149ebd20900f4792e30114186273e5f33b Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 15 Jul 2024 15:41:14 +0300 Subject: [PATCH 163/210] fix(topic editor): first pass at sending titles in POST in a similar to their DB schema --- static/js/TopicEditor.jsx | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index 2d279ebb76..a374fd3bd8 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -116,10 +116,25 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const prepData = () => { // always add category, title, heTitle, altTitles - let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}}; + let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}, titles: []}; postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. 
postData.altTitles.he = data.heAltTitles.map(x => x.name); + postData['titles'].push({'text': data['enTitle'], "lang": 'en', 'primary': true}) + postData['titles'].push({'text': data['heTitle'], "lang": 'he', 'primary': true}) + const enAltTitles = data['enAltTitles']; + const heAltTitles = data['heAltTitles']; + if (Array.isArray(enAltTitles)) { + enAltTitles.forEach((title, index) => { + postData['titles'].push({'text': title['name'], "lang": 'en'}) + }); + } + if (Array.isArray(heAltTitles)) { + heAltTitles.forEach((title, index) => { + postData['titles'].push({'text': title['name'], "lang": 'he'}) + }); + }; + // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; const origEnCaption = origData?.origImage?.image_caption?.en || ""; From 4416a9427e08753ab21b0c4f2f5ec1099a796692 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 15 Jul 2024 15:56:02 +0300 Subject: [PATCH 164/210] fix(topic editor): first pass at parsing primary titles server side --- sefaria/helper/topic.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index ae3337d0c7..46d870b2c8 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1072,6 +1072,10 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals def update_topic_titles(topic, title="", heTitle="", **kwargs): new_primary = {"en": title, "he": heTitle} + titles = kwargs['titles'] + enPrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] + hePrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] + new_primary = {"en": enPrimary, "he": hePrimary} for lang in ['en', 'he']: # first remove all titles and add new primary and then alt titles for title in topic.get_titles(lang): topic.remove_title(title, lang) From 8352d6369ce9a0553fc57924f22dcddc5cf66a66 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 15 Jul 2024 16:21:40 +0300 Subject: [PATCH 165/210] fix(topic editor): first pass at enabling at enabling server to update disambiguation field in title --- sefaria/helper/topic.py | 21 +++++++++++++++------ sefaria/model/schema.py | 10 ++++++---- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 46d870b2c8..fc2ed5f7b5 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1073,16 +1073,25 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals def update_topic_titles(topic, title="", heTitle="", **kwargs): new_primary = {"en": title, "he": heTitle} titles = kwargs['titles'] - enPrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] - hePrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] + # enPrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] + # hePrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] + enPrimary = [title for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] + hePrimary = [title for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] new_primary = {"en": enPrimary, "he": hePrimary} + + enNonPrimary = [title for title in titles if not title.get('primary', False) and title['lang'] == 'en'] + heNonPrimary = [title for title in titles if 
not title.get('primary', False) and title['lang'] == 'he'] + nonPrimary = {'en': enNonPrimary, 'he': heNonPrimary} + for lang in ['en', 'he']: # first remove all titles and add new primary and then alt titles for title in topic.get_titles(lang): topic.remove_title(title, lang) - topic.add_title(new_primary[lang], lang, True, False) - if 'altTitles' in kwargs: - for title in kwargs['altTitles'][lang]: - topic.add_title(title, lang) + topic.add_title(new_primary[lang]['text'], lang, True, False, disambiguation=new_primary[lang].get("disambiguation", "")) + # if 'altTitles' in kwargs: + if nonPrimary: + # for title in kwargs['altTitles'][lang]: + for title in nonPrimary[lang]: + topic.add_title(title['text'], lang, disambiguation=title.get("disambiguation", "")) return topic diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index 77cb1f4f07..b0817cb916 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -123,7 +123,7 @@ def remove_title(self, text, lang): self.titles = [t for t in self.titles if not (t["lang"] == lang and t["text"] == text)] return self - def add_title(self, text, lang, primary=False, replace_primary=False, presentation="combined"): + def add_title(self, text, lang, primary=False, replace_primary=False, presentation="combined", disambiguation=''): """ :param text: Text of the title :param language: Language code of the title (e.g. "en" or "he") @@ -150,7 +150,8 @@ def add_title(self, text, lang, primary=False, replace_primary=False, presentati if presentation == "alone" or presentation == "both": d["presentation"] = presentation - + if disambiguation: + d['disambiguation'] = disambiguation has_primary = any([x for x in self.titles if x["lang"] == lang and x.get("primary")]) if has_primary and primary: if not replace_primary: @@ -171,15 +172,16 @@ def add_primary_titles(self, en_title, he_title): self.add_title(en_title, 'en', primary=True) self.add_title(he_title, 'he', primary=True) - def add_title(self, text, lang, primary=False, replace_primary=False): + def add_title(self, text, lang, primary=False, replace_primary=False, disambiguation=''): """ :param text: Text of the title :param language: Language code of the title (e.g. "en" or "he") :param primary: Is this a primary title? :param replace_primary: must be true to replace an existing primary title + :param disambiguation: :return: the object """ - return self.title_group.add_title(text, lang, primary, replace_primary) + return self.title_group.add_title(text, lang, primary, replace_primary, disambiguation=disambiguation) def remove_title(self, text, lang): return self.title_group.remove_title(text, lang) From ef7dd03e3ae2606053b0439164b80655f92bccbf Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 15 Jul 2024 17:01:03 +0300 Subject: [PATCH 166/210] fix(topic editor): first pass at making client extract disambiguation from title string --- static/js/TopicEditor.jsx | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index a374fd3bd8..fd9101a751 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -114,24 +114,49 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { .finally(() => setSavingStatus(false)); } + const extractDisambiguationFromTitle = function(titleText){ + let regex = /\((.+)\)$/; + let matches = titleText.match(regex); + let insideBrackets = matches ? 
matches[1] : null; + // let newString = titleText.replace(regex, ""); + return insideBrackets + } + const removeDisambiguationFromTitle = function(titleText){ + let regex = /\((.+)\)$/; + let newString = titleText.replace(regex, ""); + console.log(newString); + return newString + } const prepData = () => { // always add category, title, heTitle, altTitles let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}, titles: []}; postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. postData.altTitles.he = data.heAltTitles.map(x => x.name); - postData['titles'].push({'text': data['enTitle'], "lang": 'en', 'primary': true}) - postData['titles'].push({'text': data['heTitle'], "lang": 'he', 'primary': true}) + let enPrimaryTitleObj = {'text': removeDisambiguationFromTitle(data.enTitle), "lang": 'en', "primary": true}; + let enDisambiguation = extractDisambiguationFromTitle(data.enTitle); + if (enDisambiguation) {enPrimaryTitleObj["disambiguation"]=enDisambiguation}; + let hePrimaryTitleObj = {'text': removeDisambiguationFromTitle(data.heTitle), "lang": 'he', "primary": true}; + let heDisambiguation = extractDisambiguationFromTitle(data.heTitle); + if (heDisambiguation) {hePrimaryTitleObj["disambiguation"]=heDisambiguation}; + postData['titles'].push(enPrimaryTitleObj) + postData['titles'].push(hePrimaryTitleObj) const enAltTitles = data['enAltTitles']; const heAltTitles = data['heAltTitles']; if (Array.isArray(enAltTitles)) { enAltTitles.forEach((title, index) => { - postData['titles'].push({'text': title['name'], "lang": 'en'}) + let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": 'en'}; + let disambiguation = extractDisambiguationFromTitle(title['name']); + if (disambiguation) {titleObj["disambiguation"]=disambiguation} + postData['titles'].push(titleObj) }); } if (Array.isArray(heAltTitles)) { heAltTitles.forEach((title, index) => { - postData['titles'].push({'text': title['name'], "lang": 'he'}) + let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": 'he'}; + let disambiguation = extractDisambiguationFromTitle(title['name']); + if (disambiguation) {titleObj["disambiguation"]=disambiguation}; + postData['titles'].push(titleObj); }); }; From 68e977d84fe978b0cc8283281fbf312c5a3050fd Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Mon, 15 Jul 2024 22:56:45 +0300 Subject: [PATCH 167/210] fix(topic editor): make server remove disambiguated titles as well before trying to add new ones --- sefaria/helper/topic.py | 1 - sefaria/model/topic.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index fc2ed5f7b5..e90abd28e3 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1071,7 +1071,6 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals return topic_obj def update_topic_titles(topic, title="", heTitle="", **kwargs): - new_primary = {"en": title, "he": heTitle} titles = kwargs['titles'] # enPrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] # hePrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index b965dbcaf5..9f06b95ae3 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -383,7 +383,7 @@ def 
get_primary_title(self, lang='en', with_disambiguation=True): title += ' [Ambiguous]' return title - def get_titles(self, lang=None, with_disambiguation=True): + def get_titles(self, lang=None, with_disambiguation=False): if with_disambiguation: titles = [] for title in self.get_titles_object(): From c77f6f8947e54ad564695f15dda15d00fbb94aed Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Tue, 16 Jul 2024 00:29:20 -0400 Subject: [PATCH 168/210] bug(jobs page): Fix showing flash of 'no jobs' when there might be job listings available. Wait until network call completes --- static/js/StaticPages.jsx | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/static/js/StaticPages.jsx b/static/js/StaticPages.jsx index 2dcbc3d63f..fbefb227e5 100644 --- a/static/js/StaticPages.jsx +++ b/static/js/StaticPages.jsx @@ -3021,6 +3021,7 @@ const NoJobsNotice = () => { const JobsPage = memo(() => { const [groupedJobPostings, setGroupedJobPostings] = useState({}); const [error, setError] = useState(null); + const [loading, setLoading] = useState(true); const fetchJobsJSON = async () => { const currentDateTime = new Date().toISOString(); @@ -3069,6 +3070,7 @@ const JobsPage = memo(() => { }; const loadJobPostings = async () => { + setLoading(true); if (typeof STRAPI_INSTANCE !== "undefined" && STRAPI_INSTANCE) { try { const jobsData = await fetchJobsJSON(); @@ -3102,20 +3104,24 @@ const JobsPage = memo(() => { } else { setError("Error: Sefaria's CMS cannot be reached"); } + setLoading(false); }; useEffect(() => { loadJobPostings(); }, []); + const jobsAvailable = Object.keys(groupedJobPostings)?.length; return (
{error ? (

{error}

+ ) : loading ? ( +

Loading...

) : ( <> - - {Object.keys(groupedJobPostings)?.length ? ( + + {jobsAvailable ? ( ) : ( From ea2bb9a45cc9bb372b056849608774db7753213f Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 10:35:59 +0300 Subject: [PATCH 169/210] refactor(topic editor): cleaner client side prep of data for POST --- static/js/TopicEditor.jsx | 60 +++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index fd9101a751..5f1f5f27c3 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -118,47 +118,47 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { let regex = /\((.+)\)$/; let matches = titleText.match(regex); let insideBrackets = matches ? matches[1] : null; - // let newString = titleText.replace(regex, ""); return insideBrackets } const removeDisambiguationFromTitle = function(titleText){ let regex = /\((.+)\)$/; - let newString = titleText.replace(regex, ""); - console.log(newString); - return newString + let newTitle = titleText.replace(regex, ""); + return newTitle } + + const createPrimaryTitleObj = function(data, lang){ + const fieldName = lang == 'en' ? 'enTitle' : 'heTitle'; + let primaryTitleObj = {'text': removeDisambiguationFromTitle(data[fieldName]), "lang": lang, "primary": true}; + let disambiguation = extractDisambiguationFromTitle(data[fieldName]); + if (disambiguation) {primaryTitleObj["disambiguation"]=disambiguation}; + return primaryTitleObj; + }; + const createNonPrimaryTitleObjArray = function(data, lang){ + const fieldName = lang == 'en' ? 'enAltTitles' : 'heAltTitles'; + const altTitles = data[fieldName]; + const titleObjArray = [] + if (Array.isArray(altTitles)) { + altTitles.forEach((title, index) => { + let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": lang}; + let disambiguation = extractDisambiguationFromTitle(title['name']); + if (disambiguation) {titleObj["disambiguation"]=disambiguation} + titleObjArray.push(titleObj) + }); + } + return titleObjArray + }; + const prepData = () => { // always add category, title, heTitle, altTitles let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}, titles: []}; postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. 
postData.altTitles.he = data.heAltTitles.map(x => x.name); - let enPrimaryTitleObj = {'text': removeDisambiguationFromTitle(data.enTitle), "lang": 'en', "primary": true}; - let enDisambiguation = extractDisambiguationFromTitle(data.enTitle); - if (enDisambiguation) {enPrimaryTitleObj["disambiguation"]=enDisambiguation}; - let hePrimaryTitleObj = {'text': removeDisambiguationFromTitle(data.heTitle), "lang": 'he', "primary": true}; - let heDisambiguation = extractDisambiguationFromTitle(data.heTitle); - if (heDisambiguation) {hePrimaryTitleObj["disambiguation"]=heDisambiguation}; - postData['titles'].push(enPrimaryTitleObj) - postData['titles'].push(hePrimaryTitleObj) - const enAltTitles = data['enAltTitles']; - const heAltTitles = data['heAltTitles']; - if (Array.isArray(enAltTitles)) { - enAltTitles.forEach((title, index) => { - let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": 'en'}; - let disambiguation = extractDisambiguationFromTitle(title['name']); - if (disambiguation) {titleObj["disambiguation"]=disambiguation} - postData['titles'].push(titleObj) - }); - } - if (Array.isArray(heAltTitles)) { - heAltTitles.forEach((title, index) => { - let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": 'he'}; - let disambiguation = extractDisambiguationFromTitle(title['name']); - if (disambiguation) {titleObj["disambiguation"]=disambiguation}; - postData['titles'].push(titleObj); - }); - }; + //convert title and altTitles to the database format, including extraction of disambiguation from title string + postData['titles'].push(createPrimaryTitleObj(data, 'en')); + postData['titles'].push(createPrimaryTitleObj(data, 'he')); + postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'en')); + postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'he')); // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; From 6f26f6f0f959e988920c8a85db4a4e2fb55bfe2d Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 11:10:09 +0300 Subject: [PATCH 170/210] refactor(topic editor): cleaner server side prep of data for POST, fix bug in client --- sefaria/helper/topic.py | 24 ++++++++++-------------- static/js/TopicEditor.jsx | 4 ++-- 2 files changed, 12 insertions(+), 16 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index e90abd28e3..e434dbbce4 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1070,26 +1070,22 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals rebuild_topic_toc(topic_obj, category_changed=True) return topic_obj +def get_primary_title(titles, lang): + return next((title for title in titles if title.get('primary', False) and title['lang'] == lang), None) + +def get_non_primary_titles(titles, lang): + return [title for title in titles if not title.get('primary', False) and title['lang'] == lang] def update_topic_titles(topic, title="", heTitle="", **kwargs): titles = kwargs['titles'] - # enPrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] - # hePrimary = [title['text'] for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] - enPrimary = [title for title in titles if title.get('primary', False) and title['lang'] == 'en'][0] - hePrimary = [title for title in titles if title.get('primary', False) and title['lang'] == 'he'][0] - new_primary = {"en": enPrimary, "he": hePrimary} - - enNonPrimary = [title for title in titles 
if not title.get('primary', False) and title['lang'] == 'en'] - heNonPrimary = [title for title in titles if not title.get('primary', False) and title['lang'] == 'he'] - nonPrimary = {'en': enNonPrimary, 'he': heNonPrimary} + new_primary_titles = {"en": get_primary_title(titles, 'en'), "he": get_primary_title(titles, 'he')} + new_non_primary_titles = {"en": get_non_primary_titles(titles, 'en'), "he": get_non_primary_titles(titles, 'he')} for lang in ['en', 'he']: # first remove all titles and add new primary and then alt titles for title in topic.get_titles(lang): topic.remove_title(title, lang) - topic.add_title(new_primary[lang]['text'], lang, True, False, disambiguation=new_primary[lang].get("disambiguation", "")) - # if 'altTitles' in kwargs: - if nonPrimary: - # for title in kwargs['altTitles'][lang]: - for title in nonPrimary[lang]: + topic.add_title(new_primary_titles[lang]['text'], lang, True, False, disambiguation=new_primary_titles[lang].get("disambiguation", "")) + if new_non_primary_titles: + for title in new_non_primary_titles[lang]: topic.add_title(title['text'], lang, disambiguation=title.get("disambiguation", "")) return topic diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index 5f1f5f27c3..c1f284b3c2 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -157,8 +157,8 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { //convert title and altTitles to the database format, including extraction of disambiguation from title string postData['titles'].push(createPrimaryTitleObj(data, 'en')); postData['titles'].push(createPrimaryTitleObj(data, 'he')); - postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'en')); - postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'he')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'en')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'he')); // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; From 7ca5e903e268cff34ba60fdb31b25ce406128a96 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 13:40:26 +0300 Subject: [PATCH 171/210] refactor(topic editor): remove update_topic_titles --- reader/views.py | 6 ++++-- sefaria/helper/topic.py | 24 +++--------------------- 2 files changed, 7 insertions(+), 23 deletions(-) diff --git a/reader/views.py b/reader/views.py index ef8682dd38..9fa09af2c1 100644 --- a/reader/views.py +++ b/reader/views.py @@ -78,7 +78,7 @@ from sefaria.utils.user import delete_user_account from django.core.mail import EmailMultiAlternatives from babel import Locale -from sefaria.helper.topic import update_topic, update_topic_titles +from sefaria.helper.topic import update_topic from sefaria.helper.category import update_order_of_category_children, check_term if USE_VARNISH: @@ -3108,7 +3108,9 @@ def add_new_topic_api(request): data = json.loads(request.POST["json"]) isTopLevelDisplay = data["category"] == Topic.ROOT t = Topic({'slug': "", "isTopLevelDisplay": isTopLevelDisplay, "data_source": "sefaria", "numSources": 0}) - update_topic_titles(t, **data) + titles = data.get('titles') + if titles: + t.set_titles(titles) t.set_slug_to_primary_title() if not isTopLevelDisplay: # not Top Level so create an IntraTopicLink to category new_link = IntraTopicLink({"toTopic": data["category"], "fromTopic": t.slug, "linkType": "displays-under", "dataSource": "sefaria"}) diff --git a/sefaria/helper/topic.py 
b/sefaria/helper/topic.py index e434dbbce4..27867f97da 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1070,26 +1070,6 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals rebuild_topic_toc(topic_obj, category_changed=True) return topic_obj -def get_primary_title(titles, lang): - return next((title for title in titles if title.get('primary', False) and title['lang'] == lang), None) - -def get_non_primary_titles(titles, lang): - return [title for title in titles if not title.get('primary', False) and title['lang'] == lang] -def update_topic_titles(topic, title="", heTitle="", **kwargs): - titles = kwargs['titles'] - new_primary_titles = {"en": get_primary_title(titles, 'en'), "he": get_primary_title(titles, 'he')} - new_non_primary_titles = {"en": get_non_primary_titles(titles, 'en'), "he": get_non_primary_titles(titles, 'he')} - - for lang in ['en', 'he']: # first remove all titles and add new primary and then alt titles - for title in topic.get_titles(lang): - topic.remove_title(title, lang) - topic.add_title(new_primary_titles[lang]['text'], lang, True, False, disambiguation=new_primary_titles[lang].get("disambiguation", "")) - if new_non_primary_titles: - for title in new_non_primary_titles[lang]: - topic.add_title(title['text'], lang, disambiguation=title.get("disambiguation", "")) - return topic - - def update_authors_place_and_time(topic, dataSource='learning-team-editing-tool', **kwargs): # update place info added to author, then update year and era info if not hasattr(topic, 'properties'): @@ -1129,7 +1109,9 @@ def update_topic(topic, **kwargs): """ old_category = "" orig_slug = topic.slug - update_topic_titles(topic, **kwargs) + new_titles = kwargs.get('titles') + if new_titles: + topic.set_titles(new_titles) if kwargs.get('category') == 'authors': topic = update_authors_place_and_time(topic, **kwargs) From 959cca9e14dbb01a06b9bab0431f283672743da0 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 15:44:41 +0300 Subject: [PATCH 172/210] chore(topic editor): revert changes to updating single title functions --- sefaria/model/schema.py | 10 ++++------ sefaria/model/topic.py | 2 +- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index b0817cb916..77cb1f4f07 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -123,7 +123,7 @@ def remove_title(self, text, lang): self.titles = [t for t in self.titles if not (t["lang"] == lang and t["text"] == text)] return self - def add_title(self, text, lang, primary=False, replace_primary=False, presentation="combined", disambiguation=''): + def add_title(self, text, lang, primary=False, replace_primary=False, presentation="combined"): """ :param text: Text of the title :param language: Language code of the title (e.g. 
"en" or "he") @@ -150,8 +150,7 @@ def add_title(self, text, lang, primary=False, replace_primary=False, presentati if presentation == "alone" or presentation == "both": d["presentation"] = presentation - if disambiguation: - d['disambiguation'] = disambiguation + has_primary = any([x for x in self.titles if x["lang"] == lang and x.get("primary")]) if has_primary and primary: if not replace_primary: @@ -172,16 +171,15 @@ def add_primary_titles(self, en_title, he_title): self.add_title(en_title, 'en', primary=True) self.add_title(he_title, 'he', primary=True) - def add_title(self, text, lang, primary=False, replace_primary=False, disambiguation=''): + def add_title(self, text, lang, primary=False, replace_primary=False): """ :param text: Text of the title :param language: Language code of the title (e.g. "en" or "he") :param primary: Is this a primary title? :param replace_primary: must be true to replace an existing primary title - :param disambiguation: :return: the object """ - return self.title_group.add_title(text, lang, primary, replace_primary, disambiguation=disambiguation) + return self.title_group.add_title(text, lang, primary, replace_primary) def remove_title(self, text, lang): return self.title_group.remove_title(text, lang) diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 9f06b95ae3..b965dbcaf5 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -383,7 +383,7 @@ def get_primary_title(self, lang='en', with_disambiguation=True): title += ' [Ambiguous]' return title - def get_titles(self, lang=None, with_disambiguation=False): + def get_titles(self, lang=None, with_disambiguation=True): if with_disambiguation: titles = [] for title in self.get_titles_object(): From da7473bb1086460ca893f10e1b897301a6e805d0 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 15:51:03 +0300 Subject: [PATCH 173/210] refactor(topic editor): cleaner regex extraction --- static/js/TopicEditor.jsx | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index c1f284b3c2..376c757298 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -35,6 +35,8 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const [isChanged, setIsChanged] = useState(false); const [changedPicture, setChangedPicture] = useState(false); + const disambiguationExtractionRegex = /\((.+)\)$/; + const toggle = function() { setSavingStatus(savingStatus => !savingStatus); } @@ -115,15 +117,10 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { } const extractDisambiguationFromTitle = function(titleText){ - let regex = /\((.+)\)$/; - let matches = titleText.match(regex); - let insideBrackets = matches ? 
matches[1] : null; - return insideBrackets + return titleText.match(disambiguationExtractionRegex); } const removeDisambiguationFromTitle = function(titleText){ - let regex = /\((.+)\)$/; - let newTitle = titleText.replace(regex, ""); - return newTitle + return titleText.replace(disambiguationExtractionRegex, "").trimEnd(); } const createPrimaryTitleObj = function(data, lang){ From cec0b2d10fc9ed5c3ff263fa3a38a912af0004e9 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 16 Jul 2024 16:07:08 +0300 Subject: [PATCH 174/210] refactor(topic editor): cleaner client side prep of data --- static/js/TopicEditor.jsx | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index 376c757298..fb8d3db0ef 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -117,22 +117,19 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { } const extractDisambiguationFromTitle = function(titleText){ - return titleText.match(disambiguationExtractionRegex); + return titleText.match(disambiguationExtractionRegex)?.[1]; } const removeDisambiguationFromTitle = function(titleText){ return titleText.replace(disambiguationExtractionRegex, "").trimEnd(); } - const createPrimaryTitleObj = function(data, lang){ - const fieldName = lang == 'en' ? 'enTitle' : 'heTitle'; - let primaryTitleObj = {'text': removeDisambiguationFromTitle(data[fieldName]), "lang": lang, "primary": true}; - let disambiguation = extractDisambiguationFromTitle(data[fieldName]); + const createPrimaryTitleObj = function(rawTitle, lang){ + let primaryTitleObj = {'text': removeDisambiguationFromTitle(rawTitle), "lang": lang, "primary": true}; + let disambiguation = extractDisambiguationFromTitle(rawTitle); if (disambiguation) {primaryTitleObj["disambiguation"]=disambiguation}; return primaryTitleObj; }; - const createNonPrimaryTitleObjArray = function(data, lang){ - const fieldName = lang == 'en' ? 'enAltTitles' : 'heAltTitles'; - const altTitles = data[fieldName]; + const createNonPrimaryTitleObjArray = function(altTitles, lang){ const titleObjArray = [] if (Array.isArray(altTitles)) { altTitles.forEach((title, index) => { @@ -147,15 +144,15 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const prepData = () => { // always add category, title, heTitle, altTitles - let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}, titles: []}; - postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. - postData.altTitles.he = data.heAltTitles.map(x => x.name); + let postData = { category: data.catSlug, titles: []}; + // postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. 
+ // postData.altTitles.he = data.heAltTitles.map(x => x.name); //convert title and altTitles to the database format, including extraction of disambiguation from title string - postData['titles'].push(createPrimaryTitleObj(data, 'en')); - postData['titles'].push(createPrimaryTitleObj(data, 'he')); - postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'en')); - postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data, 'he')); + postData['titles'].push(createPrimaryTitleObj(data.enTitle, 'en')); + postData['titles'].push(createPrimaryTitleObj(data.heTitle, 'he')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.enAltTitles, 'en')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.heAltTitles, 'he')); // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; From c134853e38717b22a1ee69163bc650a01df16a63 Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Tue, 16 Jul 2024 17:44:58 -0400 Subject: [PATCH 175/210] chore(jobs page): Use Loading and LoadingRing components to allow a different message for other language users --- static/js/StaticPages.jsx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/static/js/StaticPages.jsx b/static/js/StaticPages.jsx index fbefb227e5..fe61a86595 100644 --- a/static/js/StaticPages.jsx +++ b/static/js/StaticPages.jsx @@ -4,6 +4,8 @@ import { TwoOrThreeBox, ResponsiveNBox, NBox, InterfaceText, + LoadingMessage, + LoadingRing, } from './Misc'; import {NewsletterSignUpForm} from "./NewsletterSignUpForm"; import palette from './sefaria/palette'; @@ -3117,7 +3119,10 @@ const JobsPage = memo(() => { {error ? (

{error}

) : loading ? ( -

Loading...

+ <> + + + ) : ( <> From a9a27887d5a48686986b068734d18811a0aac252 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 17 Jul 2024 14:16:43 +0300 Subject: [PATCH 176/210] chore(topic page editor): PR fixes --- sefaria/helper/topic.py | 4 ++-- static/js/TopicEditor.jsx | 20 ++++++++------------ 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 27867f97da..9b4b0569ba 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1102,8 +1102,8 @@ def update_topic(topic, **kwargs): """ Can update topic object's title, hebrew title, category, description, and categoryDescription fields :param topic: (Topic) The topic to update - :param **kwargs can be title, heTitle, category, description, categoryDescription, and rebuild_toc where `title`, `heTitle`, - and `category` are strings. `description` and `categoryDescription` are dictionaries where the fields are `en` and `he`. + :param **kwargs can be titles, category, description, categoryDescription, and rebuild_toc where `titles` is a list + of title objects as they are represented in the database, and `category` is a string. `description` and `categoryDescription` are dictionaries where the fields are `en` and `he`. The `category` parameter should be the slug of the new category. `rebuild_topic_toc` is a boolean and is assumed to be True :return: (model.Topic) The modified topic """ diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index fb8d3db0ef..9680617df1 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -131,28 +131,24 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { }; const createNonPrimaryTitleObjArray = function(altTitles, lang){ const titleObjArray = [] - if (Array.isArray(altTitles)) { - altTitles.forEach((title, index) => { - let titleObj = {'text': removeDisambiguationFromTitle(title['name']), "lang": lang}; - let disambiguation = extractDisambiguationFromTitle(title['name']); - if (disambiguation) {titleObj["disambiguation"]=disambiguation} - titleObjArray.push(titleObj) - }); - } + altTitles.forEach((title) => { + let titleObj = {'text': removeDisambiguationFromTitle(title), "lang": lang}; + let disambiguation = extractDisambiguationFromTitle(title); + if (disambiguation) {titleObj["disambiguation"]=disambiguation} + titleObjArray.push(titleObj) + }); return titleObjArray }; const prepData = () => { // always add category, title, heTitle, altTitles let postData = { category: data.catSlug, titles: []}; - // postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. 
- // postData.altTitles.he = data.heAltTitles.map(x => x.name); //convert title and altTitles to the database format, including extraction of disambiguation from title string postData['titles'].push(createPrimaryTitleObj(data.enTitle, 'en')); postData['titles'].push(createPrimaryTitleObj(data.heTitle, 'he')); - postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.enAltTitles, 'en')); - postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.heAltTitles, 'he')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.enAltTitles.map(x => x.name), 'en')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.heAltTitles.map(x => x.name), 'he')); // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; From 61a1d0605d3435b026256dd0cf0d17d65d43c3f6 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 17 Jul 2024 16:36:13 +0300 Subject: [PATCH 177/210] chore(topic page editor): simple test for update topic --- sefaria/model/tests/topic_test.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/sefaria/model/tests/topic_test.py b/sefaria/model/tests/topic_test.py index 59f49d205b..0594043cca 100644 --- a/sefaria/model/tests/topic_test.py +++ b/sefaria/model/tests/topic_test.py @@ -3,6 +3,7 @@ from sefaria.model.text import Ref from sefaria.system.database import db from sefaria.system.exceptions import SluggedMongoRecordMissingError +from sefaria.helper.topic import update_topic def make_topic(slug): @@ -164,6 +165,16 @@ def test_sanitize(self): assert " {% endif %} - - - From 084621f7e4de1634b54dfdf65c50bf411b26c521 Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Mon, 29 Jul 2024 23:03:38 -0400 Subject: [PATCH 198/210] chore(strapi-cms): Disable usage of Strapi for Banners, Modals, and Sidebar Ads during Unbounce trial --- static/js/Promotions.jsx | 3 ++- static/js/ReaderApp.jsx | 26 ++++++++++---------------- static/js/context.js | 3 ++- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/static/js/Promotions.jsx b/static/js/Promotions.jsx index 508fb24cf9..ac1a489110 100644 --- a/static/js/Promotions.jsx +++ b/static/js/Promotions.jsx @@ -11,7 +11,8 @@ const Promotions = () => { const context = useContext(AdContext); const strapi = useContext(StrapiDataContext); useEffect(() => { - if (strapi.dataFromStrapiHasBeenReceived) { + // Disable Strapi for Sidebar Ads during Unbounce trial + if (false && strapi.dataFromStrapiHasBeenReceived) { Sefaria._inAppAds = []; const sidebarAds = strapi.strapiData?.sidebarAds?.data; diff --git a/static/js/ReaderApp.jsx b/static/js/ReaderApp.jsx index be4dcd15fd..51707d9c6e 100644 --- a/static/js/ReaderApp.jsx +++ b/static/js/ReaderApp.jsx @@ -2248,23 +2248,17 @@ toggleSignUpModal(modalContentKind = SignUpModalKind.Default) { var classes = classNames(classDict); return ( - // The Strapi context is put at the highest level of scope so any component or children within ReaderApp can use the static content received - // InterruptingMessage modals and Banners will always render if available but stay hidden initially - - -
-            {header}
-            {panels}
-            {signUpModal}
-            {communityPagePreviewControls}
+          {header}
+          {panels}
+          {signUpModal}
+          {communityPagePreviewControls}
); } } diff --git a/static/js/context.js b/static/js/context.js index 8c9f1f03f4..9b9d0778bf 100644 --- a/static/js/context.js +++ b/static/js/context.js @@ -19,7 +19,8 @@ function StrapiDataProvider({ children }) { const [modal, setModal] = useState(null); const [banner, setBanner] = useState(null); useEffect(() => { - if (STRAPI_INSTANCE) { + // Disable Strapi API calls during Unbounce trial + if (false && typeof STRAPI_INSTANCE !== "undefined" && STRAPI_INSTANCE) { const getStrapiData = async () => { let getDateWithoutTime = (date) => date.toISOString().split("T")[0]; let getJSONDateStringInLocalTimeZone = (date) => { From 83cfe810d5ebe068e79fc9f5067358a4f652b988 Mon Sep 17 00:00:00 2001 From: Skyler Cohen Date: Mon, 29 Jul 2024 23:07:55 -0400 Subject: [PATCH 199/210] chore(unbounce): Add Unbounce embed code for trial --- templates/base.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/templates/base.html b/templates/base.html index 17e26ba1fd..9aba8e05ff 100644 --- a/templates/base.html +++ b/templates/base.html @@ -151,6 +151,8 @@ {% endif %} + + From 42ab12ae76a059fe67cf0931a07983150b3cf2cb Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 30 Jul 2024 11:16:33 +0300 Subject: [PATCH 200/210] refactor(topic tests): cleaner test remove list comprehension --- sefaria/helper/tests/topic_test.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/sefaria/helper/tests/topic_test.py b/sefaria/helper/tests/topic_test.py index 204089642e..e5e5477e0d 100644 --- a/sefaria/helper/tests/topic_test.py +++ b/sefaria/helper/tests/topic_test.py @@ -97,13 +97,18 @@ def actual_author(author_root): def test_title_and_desc(author_root, actual_author, root_with_self_link, child_of_root_with_self_link, grandchild_of_root_with_self_link): for count, t in enumerate([author_root, actual_author, root_with_self_link, child_of_root_with_self_link, grandchild_of_root_with_self_link]): - new_values = {"titles": [{"text": f"new title {count+1}", "primary": True, "lang": 'en'}, - {"lang": "en", "text": f"New Alt title {count+1}"}, {"lang": "he", "text": f"New He Alt Title {count+1}"}, - {"lang": "he", "text": f"new hebrew title {count+1}", "primary": True}], "description": {"en": f"new desc", "he": "new hebrew desc"}} + en_primary_title = {"text": f"new title {count+1}", "primary": True, "lang": 'en'} + he_primary_title = {"lang": "he", "text": f"new hebrew title {count+1}", "primary": True} + + en_alt_title = {"lang": "en", "text": f"New Alt title {count+1}"} + he_alt_title = {"lang": "he", "text": f"New He Alt Title {count+1}"} + + new_values = {"titles": [en_primary_title, en_alt_title, he_alt_title, he_primary_title], + "description": {"en": f"new desc", "he": "new hebrew desc"}} topic.update_topic(t["topic"], **new_values) assert t["topic"].description == new_values["description"] - assert t["topic"].get_primary_title('he') == [title['text'] for title in new_values["titles"] if title["lang"]=='he' and title.get('primary', None)][0] - assert t["topic"].get_titles('en') == [title["text"] for title in new_values['titles'] if title["lang"] == 'en'] + assert t["topic"].get_primary_title('he') == he_primary_title['text'] + assert t["topic"].get_titles('en') == [en_primary_title['text'], en_alt_title['text']] def test_author_root(author_root, actual_author): new_values = {"category": "authors", "titles": [ From 06236d248e9d359fd8f1e6e1d2a710be4952610e Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Tue, 30 Jul 2024 13:39:39 +0300 Subject: [PATCH 201/210] refactor(topic tests): 
cleaner getting of keys from kwargs --- sefaria/helper/topic.py | 4 ++-- sefaria/model/place.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 9f3d39aeca..3afdccc844 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1085,11 +1085,11 @@ def update_properties(topic_obj, dataSource, k, v): def update_author_era(topic_obj, dataSource='learning-team-editing-tool', **kwargs): for k in ["birthYear", "deathYear"]: - if k in kwargs.keys() and kwargs[k]: # only change property value if key exists, otherwise it indicates no change + if kwargs.get(k, False): # only change property value if key exists, otherwise it indicates no change year = kwargs[k] update_properties(topic_obj, dataSource, k, year) - if 'era' in kwargs.keys() and kwargs['era']: # only change property value if key is in data, otherwise it indicates no change + if kwargs.get('era', False): # only change property value if key is in data, otherwise it indicates no change prev_era = topic_obj.properties.get('era', {}).get('value') era = kwargs['era'] update_properties(topic_obj, dataSource, 'era', era) diff --git a/sefaria/model/place.py b/sefaria/model/place.py index 93a161a390..c5f24ac58c 100644 --- a/sefaria/model/place.py +++ b/sefaria/model/place.py @@ -112,7 +112,7 @@ def process_index_place_change(indx, **kwargs): def process_topic_place_change(topic_obj, **kwargs): keys = ["birthPlace", "deathPlace"] for key in keys: - if key in kwargs.keys() and kwargs[key]: # only change property value if key is in data, otherwise it indicates no change + if kwargs.get(key, False): # only change property value if key is in data, otherwise it indicates no change new_val = kwargs[key] if new_val != '': he_key = get_he_key(key) From bf2f35bd895965f9624279f3a0d4a4fa8303aaae Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 31 Jul 2024 12:06:31 +0300 Subject: [PATCH 202/210] fix(topic editing): have update_topic accept also arbitrary kwargs so as not fail when sent other arguments --- sefaria/helper/topic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index 3afdccc844..a49393de0b 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1100,7 +1100,7 @@ def update_author_era(topic_obj, dataSource='learning-team-editing-tool', **kwar def update_topic(topic, titles=None, category=None, origCategory=None, categoryDescritpion=None, description=None, birthPlace=None, deathPlace=None, birthYear=None, era=None, - rebuild_toc=True, manual=False, image=None): + rebuild_toc=True, manual=False, image=None, **kwargs): """ Can update topic object's titles, category, description, and categoryDescription fields :param topic: (Topic) The topic to update From 9a64468ac84455244f7c17d6858531850bb17c9d Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Wed, 31 Jul 2024 12:35:44 +0300 Subject: [PATCH 203/210] fix(Topic Pages): getBulkText bug that led it to generate URLs over 3800 characters caused by pipe character being 3 not 1 chars --- static/js/sefaria/sefaria.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 231e3e9e37..0e3b13e4d5 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -479,9 +479,10 @@ Sefaria = extend(Sefaria, { let refStrs = [""]; refs.map(ref => { let last = refStrs[refStrs.length-1]; - const encodedRef = encodeURIComponent(ref) - if 
(`${hostStr}${last}|${encodedRef}${paramStr}`.length > MAX_URL_LENGTH) { - refStrs.push(encodedRef) + const encodedRef = encodeURIComponent(ref); + const encodedFullURL = encodeURIComponent(`${hostStr}${last}|${ref}${paramStr}`); + if (encodedFullURL.length > MAX_URL_LENGTH) { + refStrs.push(encodedRef); } else { refStrs[refStrs.length-1] += last.length ? `|${encodedRef}` : encodedRef; } From 0fd573c50d9b8ee02348b4f74a2ca264ea495153 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 31 Jul 2024 13:12:10 +0300 Subject: [PATCH 204/210] fix(topic editing): add deathYear parameter --- sefaria/helper/topic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index a49393de0b..d6f8aa92bc 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -1099,7 +1099,7 @@ def update_author_era(topic_obj, dataSource='learning-team-editing-tool', **kwar def update_topic(topic, titles=None, category=None, origCategory=None, categoryDescritpion=None, description=None, - birthPlace=None, deathPlace=None, birthYear=None, era=None, + birthPlace=None, deathPlace=None, birthYear=None, deathYear=None, era=None, rebuild_toc=True, manual=False, image=None, **kwargs): """ Can update topic object's titles, category, description, and categoryDescription fields @@ -1115,7 +1115,7 @@ def update_topic(topic, titles=None, category=None, origCategory=None, categoryD if titles: topic.set_titles(titles) if category == 'authors': - topic = update_authors_place_and_time(topic, birthPlace=birthPlace, birthYear=birthYear, deathPlace=deathPlace, era=era) + topic = update_authors_place_and_time(topic, birthPlace=birthPlace, birthYear=birthYear, deathPlace=deathPlace, deathYear=deathYear, era=era) if category and origCategory and origCategory != category: orig_link = IntraTopicLink().load({"linkType": "displays-under", "fromTopic": topic.slug, "toTopic": {"$ne": topic.slug}}) From 4b613530e3a1f90eb38367befd71cebdb8b1a3f4 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Wed, 31 Jul 2024 15:51:11 +0300 Subject: [PATCH 205/210] Revert "fix(Topic Pages): getBulkText bug that led it to generate URLs over 3800 characters caused by pipe character being 3 not 1 chars" This reverts commit 9a64468ac84455244f7c17d6858531850bb17c9d. --- static/js/sefaria/sefaria.js | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 0e3b13e4d5..231e3e9e37 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -479,10 +479,9 @@ Sefaria = extend(Sefaria, { let refStrs = [""]; refs.map(ref => { let last = refStrs[refStrs.length-1]; - const encodedRef = encodeURIComponent(ref); - const encodedFullURL = encodeURIComponent(`${hostStr}${last}|${ref}${paramStr}`); - if (encodedFullURL.length > MAX_URL_LENGTH) { - refStrs.push(encodedRef); + const encodedRef = encodeURIComponent(ref) + if (`${hostStr}${last}|${encodedRef}${paramStr}`.length > MAX_URL_LENGTH) { + refStrs.push(encodedRef) } else { refStrs[refStrs.length-1] += last.length ? 
`|${encodedRef}` : encodedRef; } From 6a0c5d7cdff83cf257dca32a1f69d6dd09fb2751 Mon Sep 17 00:00:00 2001 From: stevekaplan123 Date: Wed, 31 Jul 2024 16:05:07 +0300 Subject: [PATCH 206/210] fix(Topic Pages): encode full URL so that pipe character also gets encoded --- static/js/sefaria/sefaria.js | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index 231e3e9e37..c9b99d7673 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -479,16 +479,16 @@ Sefaria = extend(Sefaria, { let refStrs = [""]; refs.map(ref => { let last = refStrs[refStrs.length-1]; - const encodedRef = encodeURIComponent(ref) - if (`${hostStr}${last}|${encodedRef}${paramStr}`.length > MAX_URL_LENGTH) { - refStrs.push(encodedRef) + const encodedFullURL = encodeURI(`${hostStr}${last}|${ref}${paramStr}`); + if (encodedFullURL.length > MAX_URL_LENGTH) { + refStrs.push(ref) } else { - refStrs[refStrs.length-1] += last.length ? `|${encodedRef}` : encodedRef; + refStrs[refStrs.length-1] += last.length ? `|${ref}` : ref; } }); let promises = refStrs.map(refStr => this._cachedApiPromise({ - url: `${hostStr}${refStr}${paramStr}`, + url: `${hostStr}${encodeURIComponent(refStr)}${paramStr}`, key: refStr + paramStr, store: this._bulkTexts })); From ca4097f27b87acf1358201548582f8a7f51bbcf9 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 31 Jul 2024 16:20:38 +0300 Subject: [PATCH 207/210] chore(image gen): simulate old bidi invoke --- sefaria/image_generator.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/sefaria/image_generator.py b/sefaria/image_generator.py index cbd430d329..e7ec9f0fba 100644 --- a/sefaria/image_generator.py +++ b/sefaria/image_generator.py @@ -1,10 +1,6 @@ from PIL import Image, ImageDraw, ImageFont import textwrap -try: - from bidi import get_display -except: - from bidi.algorithm import get_display - +from bidi.algorithm import get_display import re from django.http import HttpResponse import io From d1ce56e3d86f6852a7d9e60dc58ff6e44765de40 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Wed, 31 Jul 2024 16:22:57 +0300 Subject: [PATCH 208/210] chore(image gen): pin requirment python-bidi==0.4.2 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index d363cbaa8e..40736e93b3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ psycopg2==2.8.6 #for dev: psycopg2-binary==2.8.6 py2-py3-django-email-as-username==1.7.1 pymongo==3.12.* pytest==6.1.1 -python-bidi #for devs on intel devices:python-bidi==0.4.2 +python-bidi==0.4.2 pytz pyyaml==6.0.1 rauth==0.7.3 From 9cf38357cb73a0b0e838fab3ea13e570f6408282 Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Thu, 1 Aug 2024 10:21:32 +0300 Subject: [PATCH 209/210] chore(image gen): pin requirment python-bidi==0.6.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 40736e93b3..f98e40499a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ psycopg2==2.8.6 #for dev: psycopg2-binary==2.8.6 py2-py3-django-email-as-username==1.7.1 pymongo==3.12.* pytest==6.1.1 -python-bidi==0.4.2 +python-bidi==0.6.0 pytz pyyaml==6.0.1 rauth==0.7.3 From 4f5c3648a53fd881e020b1fdb66c221e047e8bcb Mon Sep 17 00:00:00 2001 From: yonadavGit Date: Thu, 1 Aug 2024 11:54:44 +0300 Subject: [PATCH 210/210] chore(image gen): unpin requirment python-bidi --- requirements.txt | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index f98e40499a..c1219d9635 100644 --- a/requirements.txt +++ b/requirements.txt @@ -53,7 +53,7 @@ psycopg2==2.8.6 #for dev: psycopg2-binary==2.8.6 py2-py3-django-email-as-username==1.7.1 pymongo==3.12.* pytest==6.1.1 -python-bidi==0.6.0 +python-bidi pytz pyyaml==6.0.1 rauth==0.7.3
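
Note on the python-bidi changes in patches 207-210: the try/except block removed in patch 207 existed because newer python-bidi releases export get_display from the package root, while the 0.4.x line only ships it under bidi.algorithm. A minimal sketch of that compatibility import, mirroring the removed block (version behavior assumed rather than verified against every release):

    # Compatibility shim mirroring the try/except removed in patch 207:
    # newer python-bidi exposes get_display at the package root, while
    # 0.4.x only provides it under bidi.algorithm.
    try:
        from bidi import get_display             # newer python-bidi API
    except ImportError:
        from bidi.algorithm import get_display   # python-bidi 0.4.x API

    # get_display reorders logical RTL text into visual order, e.g. before
    # drawing Hebrew strings onto an image with PIL in image_generator.py.
    print(get_display("hello עולם"))

Keeping only the bidi.algorithm import (as patch 207 does) works as long as the installed python-bidi version still provides that module, which is why patches 208-210 experiment with pinning and then unpinning the requirement.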
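Stepping back to patches 203-206, the getBulkText fix comes down to measuring URL length after encoding: a pipe separator percent-encodes to %7C, three characters instead of one, so a raw-string length check can let batches slip past the limit. A Python sketch of the batching logic under that rule (illustrative only, not the actual sefaria.js code; the 3800-character limit is the figure cited in the patch-203 commit message):

    from urllib.parse import quote

    MAX_URL_LENGTH = 3800  # limit cited in the patch-203 commit message

    def batch_refs(refs, host_str, param_str):
        """Split refs into pipe-joined batches, sized by *encoded* URL length."""
        batches = [""]
        for ref in refs:
            last = batches[-1]
            candidate = f"{last}|{ref}" if last else ref
            # quote('|') == '%7C', so the encoded URL is what must stay under the limit
            encoded_url = f"{host_str}{quote(candidate)}{param_str}"
            if len(encoded_url) > MAX_URL_LENGTH:
                batches.append(ref)        # start a new batch from the raw ref
            else:
                batches[-1] = candidate    # keep accumulating raw refs
        return [f"{host_str}{quote(b)}{param_str}" for b in batches if b]

Encoding once per finished batch, rather than measuring against the raw pipe-joined string, is the same trade-off patch 206 lands on with encodeURI/encodeURIComponent.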
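Finally, for the topic-editor work earlier in this series (patches 176, 200-202 and 204), update_topic now takes its payload as keyword arguments: titles is a list of title objects in the database format, category is a category slug, and description/categoryDescription are {en, he} dictionaries. A sketch of a call in that format, mirroring the fixtures from the patch-200 test (author_topic is an assumed, already-loaded Topic object):

    from sefaria.helper.topic import update_topic

    new_values = {
        "titles": [
            {"text": "new title 1", "lang": "en", "primary": True},
            {"text": "new hebrew title 1", "lang": "he", "primary": True},
            {"text": "New Alt title 1", "lang": "en"},   # non-primary (alt) title
        ],
        "category": "authors",                           # slug of the target category
        "description": {"en": "new desc", "he": "new hebrew desc"},
    }
    # Extra keys are tolerated since patch 202 added **kwargs to the signature.
    updated_topic = update_topic(author_topic, **new_values)

Because the category here is "authors", the helper also passes any birthPlace/birthYear/deathPlace/deathYear/era keywords through update_authors_place_and_time, as shown in patch 204.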