diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 0000000000..4701933c5f --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,8 @@ +## Description +_A brief description of the PR_ + +## Code Changes +_The following changes were made to the files below_ + +## Notes +_Any additional notes go here_ \ No newline at end of file diff --git a/.github/workflows/continuous.yaml b/.github/workflows/continuous.yaml index d9821422be..257f2ab995 100644 --- a/.github/workflows/continuous.yaml +++ b/.github/workflows/continuous.yaml @@ -1,6 +1,10 @@ name: Continuous on: pull_request: + merge_group: + push: + branches: + - master concurrency: group: ${{ github.ref }} @@ -8,12 +12,12 @@ concurrency: jobs: build-generic: + if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} name: "Continuous Image Build" permissions: contents: 'read' id-token: 'write' runs-on: ubuntu-latest - if: github.event.pull_request.draft == false strategy: matrix: app: [ web, node ] @@ -76,6 +80,7 @@ jobs: tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} build-derived: + if: ${{ github.event_name == 'pull_request' || github.event_name == 'push' }} name: "Continuous Image Build Stage 2" runs-on: ubuntu-latest permissions: @@ -86,7 +91,6 @@ jobs: strategy: matrix: app: [ asset, linker ] - if: github.event.pull_request.draft == false steps: - uses: actions/checkout@v4 - name: Set up QEMU diff --git a/build/ci/sandbox-values.yaml b/build/ci/sandbox-values.yaml index f729cd82f5..f6984b77f0 100644 --- a/build/ci/sandbox-values.yaml +++ b/build/ci/sandbox-values.yaml @@ -44,7 +44,7 @@ monitor: tag: secrets: localSettings: - ref: local-settings-secrets-dev + ref: local-settings-secrets backupManager: ref: backup-manager slackWebhook: diff --git a/docs/openAPI.json b/docs/openAPI.json index 7816c243bd..fe97c22686 100644 --- a/docs/openAPI.json +++ b/docs/openAPI.json @@ -59,7 +59,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -91,7 +90,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -121,9 +119,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -151,9 +148,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -181,7 +177,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -211,7 +206,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -241,7 +235,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -271,9 +264,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + 
"isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -301,9 +293,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -331,9 +322,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -361,9 +351,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -391,7 +380,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -421,7 +409,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -451,9 +438,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fr", "languageFamilyName": "french", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -481,7 +467,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -511,9 +496,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -541,9 +525,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "pl", "languageFamilyName": "polish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -571,9 +554,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "it", "languageFamilyName": "italian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -601,9 +583,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fa", "languageFamilyName": "persian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "rtl", "language": "he", "title": "Esther", @@ -631,9 +612,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fa", "languageFamilyName": "persian", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -661,9 +641,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "yi", "languageFamilyName": "yiddish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "rtl", "language": "he", "title": "Esther", @@ -691,9 +670,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "es", "languageFamilyName": "spanish", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -721,9 +699,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "fr", "languageFamilyName": "french", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": 
"ltr", "language": "en", "title": "Esther", @@ -751,9 +728,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "eo", "languageFamilyName": "esperanto", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -781,9 +757,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "de", "languageFamilyName": "german", - "isBaseText": false, "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -811,7 +786,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": "", "isSource": true, "isPrimary": true, "direction": "rtl", @@ -841,9 +815,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": "", "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Esther", @@ -1374,7 +1347,6 @@ "purchaseInformationURL": "https://www.israelbookshoppublications.com/store/pc/Metsudah-Five-Megilloth-w-Rashi-26p309.htm", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true, "firstSectionRef": "Rashi on Ecclesiastes 1:1" }, { @@ -1395,7 +1367,6 @@ "purchaseInformationURL": "https://www.israelbookshoppublications.com/store/pc/Metsudah-Five-Megilloth-w-Rashi-26p309.htm", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 1:1" }, { @@ -1416,7 +1387,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": true, "firstSectionRef": "Rashi on Ecclesiastes 1:1" }, { @@ -1437,7 +1407,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 3:3" }, { @@ -1458,7 +1427,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "Abraham Cohen, Soncino Press, 1946", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Rashi on Ecclesiastes 1:1" } ] @@ -5641,9 +5609,6 @@ "shortVersionTitleInHebrew": { "type": "string" }, - "isBaseText": { - "type": "boolean" - }, "firstSectionRef": { "type": "string" } @@ -5666,7 +5631,6 @@ "purchaseInformationURL": "https://jps.org/books/contemporary-torah/", "shortVersionTitle": "The Contemporary Torah, JPS, 2006", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Genesis 1" } }, @@ -6696,8 +6660,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": true + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6716,8 +6679,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": true + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6736,8 +6698,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "Dr. 
Joshua Kulp", - "shortVersionTitleInHebrew": "", - "isBaseText": false + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6756,8 +6717,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": false + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6776,8 +6736,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "Lazarus Goldschmidt, 1929 ", - "shortVersionTitleInHebrew": "", - "isBaseText": false + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6796,8 +6755,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": false + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6816,8 +6774,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": false + "shortVersionTitleInHebrew": "" }, { "title": "Mishnah Peah", @@ -6836,8 +6793,7 @@ "purchaseInformationImage": "", "purchaseInformationURL": "", "shortVersionTitle": "", - "shortVersionTitleInHebrew": "", - "isBaseText": "" + "shortVersionTitleInHebrew": "" } ], "textDepth": 2, @@ -7378,11 +7334,6 @@ "description": "A Hebrew short title for the version", "type": "string" }, - "isBaseText": { - "description": "Is this a text that has commentary. `true` indicates that there are commentaries on this text, while `false` indicates that the text does not have any commentary. ", - "type": "boolean", - "example": "True" - }, "firstSectionRef": { "description": "The first `ref` for this title on a section level (as opposed to the more granular segment level). For example, `Genesis 1` would be the first section level ref of Genesis (as opposed to the segment, `Genesis 1:1`), and `Rashi on Kohelet 1:1` is the first section level ref of `Rashi on Kohelet` (as opposed to the segment level of `Rashi on Kohelet 1:1:1`)", "type": "string", @@ -7407,7 +7358,6 @@ "purchaseInformationURL": "", "shortVersionTitle": "", "shortVersionTitleInHebrew": "", - "isBaseText": false, "firstSectionRef": "Shulchan Arukh, Orach Chayim 1" } }, @@ -9089,7 +9039,7 @@ "items": { "$ref": "#/components/schemas/v3TextVersionsJSON" }, - "example": "{\n\"status\": \"locked\",\n\"priority\": 2,\n\"license\": \"CC-BY-SA\",\n\"versionNotes\": \"Miqra According to the Masorah (MAM) is a digital Hebrew edition of the Tanakh based on the Aleppo Codex and related manuscripts. It is designed for readers, and as such it contains added elements to aid vocalization of the text. For instance: When an accent is marked in an unstressed syllable, an extra accent is added in the proper place (pashta, zarqa, segol, telisha). Legarmeih and paseq are visibly distinguished. Qamaz qatan is indicated by its designated Unicode character (alternatives are documented where traditions differ about its application).
The text of MAM is fully documented. The complete introduction to the edition (Hebrew) explains the types of editorial decisions that have been made and the reasons for them (English abstract). In addition, every word in the Bible about which there is some textual concern or ambiguity includes a documentation note; these notes can be viewed conveniently here. If an error is discovered, it may be reported to User:Dovi at Hebrew Wikisource. Please check the documentation notes before reporting an error.\",\n\"formatAsPoetry\": \"\",\n\"digitizedBySefaria\": \"\",\n\"method\": \"\",\n\"heversionSource\": \"\",\n\"versionUrl\": \"\",\n\"versionTitleInHebrew\": \"מקרא על פי המסורה\",\n\"versionNotesInHebrew\": \"\",\n\"shortVersionTitle\": \"\",\n\"shortVersionTitleInHebrew\": \"\",\n\"extendedNotes\": \"\",\n\"extendedNotesHebrew\": \"\",\n\"purchaseInformationImage\": \"\",\n\"purchaseInformationURL\": \"\",\n\"hasManuallyWrappedRefs\": \"\",\n\"actualLanguage\": \"he\",\n\"languageFamilyName\": \"hebrew\",\n\"isBaseText\": true,\n\"isSource\": true,\n\"isPrimary\": true,\n\"direction\": \"rtl\",\n\"language\": \"he\",\n\"versionSource\": \"https://he.wikisource.org/wiki/%D7%9E%D7%A9%D7%AA%D7%9E%D7%A9:Dovi/%D7%9E%D7%A7%D7%A8%D7%90_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%94%D7%9E%D7%A1%D7%95%D7%A8%D7%94\",\n\"versionTitle\": \"Miqra according to the Masorah\",\n\"text\": [\n[\n\"חֲז֖וֹן עֹֽבַדְיָ֑ה כֹּֽה־אָמַר֩ אֲדֹנָ֨י יֱהֹוִ֜ה לֶאֱד֗וֹם שְׁמוּעָ֨ה שָׁמַ֜עְנוּ מֵאֵ֤ת יְהֹוָה֙ וְצִיר֙ בַּגּוֹיִ֣ם שֻׁלָּ֔ח ק֛וּמוּ וְנָק֥וּמָה עָלֶ֖יהָ לַמִּלְחָמָֽה׃\",\n\"הִנֵּ֥ה קָטֹ֛ן נְתַתִּ֖יךָ בַּגּוֹיִ֑ם בָּז֥וּי אַתָּ֖ה מְאֹֽד׃\",\n\"זְד֤וֹן לִבְּךָ֙ הִשִּׁיאֶ֔ךָ שֹׁכְנִ֥י בְחַגְוֵי־סֶ֖לַע מְר֣וֹם שִׁבְתּ֑וֹ אֹמֵ֣ר בְּלִבּ֔וֹ מִ֥י יוֹרִדֵ֖נִי אָֽרֶץ׃\",\n\"אִם־תַּגְבִּ֣יהַּ כַּנֶּ֔שֶׁר וְאִם־בֵּ֥ין כּֽוֹכָבִ֖ים שִׂ֣ים קִנֶּ֑ךָ מִשָּׁ֥ם אוֹרִֽידְךָ֖ נְאֻם־יְהֹוָֽה׃\",\n\"אִם־גַּנָּבִ֤ים בָּאֽוּ־לְךָ֙ אִם־שׁ֣וֹדְדֵי לַ֔יְלָה אֵ֣יךְ נִדְמֵ֔יתָה הֲל֥וֹא יִגְנְב֖וּ דַּיָּ֑ם אִם־בֹּֽצְרִים֙ בָּ֣אוּ לָ֔ךְ הֲל֖וֹא יַשְׁאִ֥ירוּ עֹלֵלֽוֹת׃\",\n\"אֵ֚יךְ נֶחְפְּשׂ֣וּ עֵשָׂ֔ו נִבְע֖וּ מַצְפֻּנָֽיו׃\",\n\"עַֽד־הַגְּב֣וּל שִׁלְּח֗וּךָ כֹּ֚ל אַנְשֵׁ֣י בְרִיתֶ֔ךָ הִשִּׁיא֛וּךָ יָכְל֥וּ לְךָ֖ אַנְשֵׁ֣י שְׁלֹמֶ֑ךָ לַחְמְךָ֗ יָשִׂ֤ימוּ מָזוֹר֙ תַּחְתֶּ֔יךָ אֵ֥ין תְּבוּנָ֖ה בּֽוֹ׃\",\n\"הֲל֛וֹא בַּיּ֥וֹם הַה֖וּא נְאֻם־יְהֹוָ֑ה וְהַאֲבַדְתִּ֤י חֲכָמִים֙ מֵֽאֱד֔וֹם וּתְבוּנָ֖ה מֵהַ֥ר עֵשָֽׂו׃\",\n\"וְחַתּ֥וּ גִבּוֹרֶ֖יךָ תֵּימָ֑ן לְמַ֧עַן יִכָּֽרֶת־אִ֛ישׁ מֵהַ֥ר עֵשָׂ֖ו מִקָּֽטֶל׃\",\n\"מֵחֲמַ֛ס אָחִ֥יךָ יַעֲקֹ֖ב תְּכַסְּךָ֣ בוּשָׁ֑ה וְנִכְרַ֖תָּ לְעוֹלָֽם׃\",\n\"בְּיוֹם֙ עֲמׇֽדְךָ֣ מִנֶּ֔גֶד בְּי֛וֹם שְׁב֥וֹת זָרִ֖ים חֵיל֑וֹ וְנׇכְרִ֞ים בָּ֣אוּ שְׁעָרָ֗ו וְעַל־יְרוּשָׁלַ֙͏ִם֙ יַדּ֣וּ גוֹרָ֔ל גַּם־אַתָּ֖ה כְּאַחַ֥ד מֵהֶֽם׃\",\n\"וְאַל־תֵּ֤רֶא בְיוֹם־אָחִ֙יךָ֙ בְּי֣וֹם נׇכְר֔וֹ וְאַל־תִּשְׂמַ֥ח לִבְנֵֽי־יְהוּדָ֖ה בְּי֣וֹם אׇבְדָ֑ם וְאַל־תַּגְדֵּ֥ל פִּ֖יךָ בְּי֥וֹם צָרָֽה׃\",\n\"אַל־תָּב֤וֹא בְשַֽׁעַר־עַמִּי֙ בְּי֣וֹם אֵידָ֔ם אַל־תֵּ֧רֶא גַם־אַתָּ֛ה בְּרָעָת֖וֹ בְּי֣וֹם אֵיד֑וֹ וְאַל־תִּשְׁלַ֥חְנָה בְחֵיל֖וֹ בְּי֥וֹם אֵידֽוֹ׃\",\n\"וְאַֽל־תַּעֲמֹד֙ עַל־הַפֶּ֔רֶק לְהַכְרִ֖ית אֶת־פְּלִיטָ֑יו וְאַל־תַּסְגֵּ֥ר שְׂרִידָ֖יו בְּי֥וֹם צָרָֽה׃\",\n\"כִּֽי־קָר֥וֹב יוֹם־יְהֹוָ֖ה עַל־כׇּל־הַגּוֹיִ֑ם כַּאֲשֶׁ֤ר עָשִׂ֙יתָ֙ יֵעָ֣שֶׂה לָּ֔ךְ גְּמֻלְךָ֖ יָשׁ֥וּב בְּרֹאשֶֽׁךָ׃\",\n\"כִּ֗י כַּֽאֲשֶׁ֤ר שְׁתִיתֶם֙ עַל־הַ֣ר קׇדְשִׁ֔י יִשְׁתּ֥וּ כׇֽל־הַגּוֹיִ֖ם תָּמִ֑יד וְשָׁת֣וּ וְלָע֔וּ וְהָי֖וּ כְּל֥וֹא הָיֽוּ׃\",\n\"וּבְהַ֥ר צִיּ֛וֹן תִּהְיֶ֥ה פְלֵיטָ֖ה וְהָ֣יָה קֹ֑דֶשׁ וְיָֽרְשׁוּ֙ בֵּ֣ית יַֽעֲקֹ֔ב אֵ֖ת מוֹרָֽשֵׁיהֶֽם׃\",\n\"וְהָיָה֩ בֵית־יַעֲקֹ֨ב אֵ֜שׁ 
וּבֵ֧ית יוֹסֵ֣ף לֶהָבָ֗ה וּבֵ֤ית עֵשָׂו֙ לְקַ֔שׁ וְדָלְק֥וּ בָהֶ֖ם וַאֲכָל֑וּם וְלֹֽא־יִֽהְיֶ֤ה שָׂרִיד֙ לְבֵ֣ית עֵשָׂ֔ו כִּ֥י יְהֹוָ֖ה דִּבֵּֽר׃\",\n\"וְיָרְשׁ֨וּ הַנֶּ֜גֶב אֶת־הַ֣ר עֵשָׂ֗ו וְהַשְּׁפֵלָה֙ אֶת־פְּלִשְׁתִּ֔ים וְיָרְשׁוּ֙ אֶת־שְׂדֵ֣ה אֶפְרַ֔יִם וְאֵ֖ת שְׂדֵ֣ה שֹׁמְר֑וֹן וּבִנְיָמִ֖ן אֶת־הַגִּלְעָֽד׃\",\n\"וְגָלֻ֣ת הַֽחֵל־הַ֠זֶּ֠ה לִבְנֵ֨י יִשְׂרָאֵ֤ל אֲשֶֽׁר־כְּנַעֲנִים֙ עַד־צָ֣רְפַ֔ת וְגָלֻ֥ת יְרוּשָׁלַ֖͏ִם אֲשֶׁ֣ר בִּסְפָרַ֑ד יִֽרְשׁ֕וּ אֵ֖ת עָרֵ֥י הַנֶּֽגֶב׃\",\n\"וְעָל֤וּ מֽוֹשִׁעִים֙ בְּהַ֣ר צִיּ֔וֹן לִשְׁפֹּ֖ט אֶת־הַ֣ר עֵשָׂ֑ו וְהָיְתָ֥ה לַֽיהֹוָ֖ה הַמְּלוּכָֽה׃\"\n]\n],\n\"firstSectionRef\": \"Obadiah 1\"}\n" + "example": "{\n\"status\": \"locked\",\n\"priority\": 2,\n\"license\": \"CC-BY-SA\",\n\"versionNotes\": \"Miqra According to the Masorah (MAM) is a digital Hebrew edition of the Tanakh based on the Aleppo Codex and related manuscripts. It is designed for readers, and as such it contains added elements to aid vocalization of the text. For instance: When an accent is marked in an unstressed syllable, an extra accent is added in the proper place (pashta, zarqa, segol, telisha). Legarmeih and paseq are visibly distinguished. Qamaz qatan is indicated by its designated Unicode character (alternatives are documented where traditions differ about its application).
The text of MAM is fully documented. The complete introduction to the edition (Hebrew) explains the types of editorial decisions that have been made and the reasons for them (English abstract). In addition, every word in the Bible about which there is some textual concern or ambiguity includes a documentation note; these notes can be viewed conveniently here. If an error is discovered, it may be reported to User:Dovi at Hebrew Wikisource. Please check the documentation notes before reporting an error.\",\n\"formatAsPoetry\": \"\",\n\"digitizedBySefaria\": \"\",\n\"method\": \"\",\n\"heversionSource\": \"\",\n\"versionUrl\": \"\",\n\"versionTitleInHebrew\": \"מקרא על פי המסורה\",\n\"versionNotesInHebrew\": \"\",\n\"shortVersionTitle\": \"\",\n\"shortVersionTitleInHebrew\": \"\",\n\"extendedNotes\": \"\",\n\"extendedNotesHebrew\": \"\",\n\"purchaseInformationImage\": \"\",\n\"purchaseInformationURL\": \"\",\n\"hasManuallyWrappedRefs\": \"\",\n\"actualLanguage\": \"he\",\n\"languageFamilyName\": \"hebrew\",\n\"isSource\": true,\n\"isPrimary\": true,\n\"direction\": \"rtl\",\n\"language\": \"he\",\n\"versionSource\": \"https://he.wikisource.org/wiki/%D7%9E%D7%A9%D7%AA%D7%9E%D7%A9:Dovi/%D7%9E%D7%A7%D7%A8%D7%90_%D7%A2%D7%9C_%D7%A4%D7%99_%D7%94%D7%9E%D7%A1%D7%95%D7%A8%D7%94\",\n\"versionTitle\": \"Miqra according to the Masorah\",\n\"text\": [\n[\n\"חֲז֖וֹן עֹֽבַדְיָ֑ה כֹּֽה־אָמַר֩ אֲדֹנָ֨י יֱהֹוִ֜ה לֶאֱד֗וֹם שְׁמוּעָ֨ה שָׁמַ֜עְנוּ מֵאֵ֤ת יְהֹוָה֙ וְצִיר֙ בַּגּוֹיִ֣ם שֻׁלָּ֔ח ק֛וּמוּ וְנָק֥וּמָה עָלֶ֖יהָ לַמִּלְחָמָֽה׃\",\n\"הִנֵּ֥ה קָטֹ֛ן נְתַתִּ֖יךָ בַּגּוֹיִ֑ם בָּז֥וּי אַתָּ֖ה מְאֹֽד׃\",\n\"זְד֤וֹן לִבְּךָ֙ הִשִּׁיאֶ֔ךָ שֹׁכְנִ֥י בְחַגְוֵי־סֶ֖לַע מְר֣וֹם שִׁבְתּ֑וֹ אֹמֵ֣ר בְּלִבּ֔וֹ מִ֥י יוֹרִדֵ֖נִי אָֽרֶץ׃\",\n\"אִם־תַּגְבִּ֣יהַּ כַּנֶּ֔שֶׁר וְאִם־בֵּ֥ין כּֽוֹכָבִ֖ים שִׂ֣ים קִנֶּ֑ךָ מִשָּׁ֥ם אוֹרִֽידְךָ֖ נְאֻם־יְהֹוָֽה׃\",\n\"אִם־גַּנָּבִ֤ים בָּאֽוּ־לְךָ֙ אִם־שׁ֣וֹדְדֵי לַ֔יְלָה אֵ֣יךְ נִדְמֵ֔יתָה הֲל֥וֹא יִגְנְב֖וּ דַּיָּ֑ם אִם־בֹּֽצְרִים֙ בָּ֣אוּ לָ֔ךְ הֲל֖וֹא יַשְׁאִ֥ירוּ עֹלֵלֽוֹת׃\",\n\"אֵ֚יךְ נֶחְפְּשׂ֣וּ עֵשָׂ֔ו נִבְע֖וּ מַצְפֻּנָֽיו׃\",\n\"עַֽד־הַגְּב֣וּל שִׁלְּח֗וּךָ כֹּ֚ל אַנְשֵׁ֣י בְרִיתֶ֔ךָ הִשִּׁיא֛וּךָ יָכְל֥וּ לְךָ֖ אַנְשֵׁ֣י שְׁלֹמֶ֑ךָ לַחְמְךָ֗ יָשִׂ֤ימוּ מָזוֹר֙ תַּחְתֶּ֔יךָ אֵ֥ין תְּבוּנָ֖ה בּֽוֹ׃\",\n\"הֲל֛וֹא בַּיּ֥וֹם הַה֖וּא נְאֻם־יְהֹוָ֑ה וְהַאֲבַדְתִּ֤י חֲכָמִים֙ מֵֽאֱד֔וֹם וּתְבוּנָ֖ה מֵהַ֥ר עֵשָֽׂו׃\",\n\"וְחַתּ֥וּ גִבּוֹרֶ֖יךָ תֵּימָ֑ן לְמַ֧עַן יִכָּֽרֶת־אִ֛ישׁ מֵהַ֥ר עֵשָׂ֖ו מִקָּֽטֶל׃\",\n\"מֵחֲמַ֛ס אָחִ֥יךָ יַעֲקֹ֖ב תְּכַסְּךָ֣ בוּשָׁ֑ה וְנִכְרַ֖תָּ לְעוֹלָֽם׃\",\n\"בְּיוֹם֙ עֲמׇֽדְךָ֣ מִנֶּ֔גֶד בְּי֛וֹם שְׁב֥וֹת זָרִ֖ים חֵיל֑וֹ וְנׇכְרִ֞ים בָּ֣אוּ שְׁעָרָ֗ו וְעַל־יְרוּשָׁלַ֙͏ִם֙ יַדּ֣וּ גוֹרָ֔ל גַּם־אַתָּ֖ה כְּאַחַ֥ד מֵהֶֽם׃\",\n\"וְאַל־תֵּ֤רֶא בְיוֹם־אָחִ֙יךָ֙ בְּי֣וֹם נׇכְר֔וֹ וְאַל־תִּשְׂמַ֥ח לִבְנֵֽי־יְהוּדָ֖ה בְּי֣וֹם אׇבְדָ֑ם וְאַל־תַּגְדֵּ֥ל פִּ֖יךָ בְּי֥וֹם צָרָֽה׃\",\n\"אַל־תָּב֤וֹא בְשַֽׁעַר־עַמִּי֙ בְּי֣וֹם אֵידָ֔ם אַל־תֵּ֧רֶא גַם־אַתָּ֛ה בְּרָעָת֖וֹ בְּי֣וֹם אֵיד֑וֹ וְאַל־תִּשְׁלַ֥חְנָה בְחֵיל֖וֹ בְּי֥וֹם אֵידֽוֹ׃\",\n\"וְאַֽל־תַּעֲמֹד֙ עַל־הַפֶּ֔רֶק לְהַכְרִ֖ית אֶת־פְּלִיטָ֑יו וְאַל־תַּסְגֵּ֥ר שְׂרִידָ֖יו בְּי֥וֹם צָרָֽה׃\",\n\"כִּֽי־קָר֥וֹב יוֹם־יְהֹוָ֖ה עַל־כׇּל־הַגּוֹיִ֑ם כַּאֲשֶׁ֤ר עָשִׂ֙יתָ֙ יֵעָ֣שֶׂה לָּ֔ךְ גְּמֻלְךָ֖ יָשׁ֥וּב בְּרֹאשֶֽׁךָ׃\",\n\"כִּ֗י כַּֽאֲשֶׁ֤ר שְׁתִיתֶם֙ עַל־הַ֣ר קׇדְשִׁ֔י יִשְׁתּ֥וּ כׇֽל־הַגּוֹיִ֖ם תָּמִ֑יד וְשָׁת֣וּ וְלָע֔וּ וְהָי֖וּ כְּל֥וֹא הָיֽוּ׃\",\n\"וּבְהַ֥ר צִיּ֛וֹן תִּהְיֶ֥ה פְלֵיטָ֖ה וְהָ֣יָה קֹ֑דֶשׁ וְיָֽרְשׁוּ֙ בֵּ֣ית יַֽעֲקֹ֔ב אֵ֖ת מוֹרָֽשֵׁיהֶֽם׃\",\n\"וְהָיָה֩ בֵית־יַעֲקֹ֨ב אֵ֜שׁ וּבֵ֧ית יוֹסֵ֣ף לֶהָבָ֗ה 
וּבֵ֤ית עֵשָׂו֙ לְקַ֔שׁ וְדָלְק֥וּ בָהֶ֖ם וַאֲכָל֑וּם וְלֹֽא־יִֽהְיֶ֤ה שָׂרִיד֙ לְבֵ֣ית עֵשָׂ֔ו כִּ֥י יְהֹוָ֖ה דִּבֵּֽר׃\",\n\"וְיָרְשׁ֨וּ הַנֶּ֜גֶב אֶת־הַ֣ר עֵשָׂ֗ו וְהַשְּׁפֵלָה֙ אֶת־פְּלִשְׁתִּ֔ים וְיָרְשׁוּ֙ אֶת־שְׂדֵ֣ה אֶפְרַ֔יִם וְאֵ֖ת שְׂדֵ֣ה שֹׁמְר֑וֹן וּבִנְיָמִ֖ן אֶת־הַגִּלְעָֽד׃\",\n\"וְגָלֻ֣ת הַֽחֵל־הַ֠זֶּ֠ה לִבְנֵ֨י יִשְׂרָאֵ֤ל אֲשֶֽׁר־כְּנַעֲנִים֙ עַד־צָ֣רְפַ֔ת וְגָלֻ֥ת יְרוּשָׁלַ֖͏ִם אֲשֶׁ֣ר בִּסְפָרַ֑ד יִֽרְשׁ֕וּ אֵ֖ת עָרֵ֥י הַנֶּֽגֶב׃\",\n\"וְעָל֤וּ מֽוֹשִׁעִים֙ בְּהַ֣ר צִיּ֔וֹן לִשְׁפֹּ֖ט אֶת־הַ֣ר עֵשָׂ֑ו וְהָיְתָ֥ה לַֽיהֹוָ֖ה הַמְּלוּכָֽה׃\"\n]\n],\n\"firstSectionRef\": \"Obadiah 1\"}\n" }, "available_versions": { "type": "array", @@ -9274,7 +9224,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", @@ -9419,14 +9368,11 @@ "description": "The overarching family for the specific language detailed in `actualLanguage`. For example, `Arabic` would be the overarching `languageFamily` for `judeo-arabic`.", "type": "string" }, - "isBaseText": { - "type": "string" - }, "isSource": { "type": "boolean" }, "isPrimary": { - "type": "string" + "type": "boolean" }, "direction": { "type": "string" @@ -9472,9 +9418,8 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "en", "languageFamilyName": "english", - "isBaseText": "", "isSource": false, - "isPrimary": "", + "isPrimary": false, "direction": "ltr", "language": "en", "title": "Obadiah", @@ -9564,10 +9509,6 @@ "description": "The overarching family for the specific language detailed in `actualLanguage`. For example, `Arabic` would be the overarching family for `judeo-arabic`. ", "type": "string" }, - "isBaseText": { - "description": "Indicates whether or not this text is the base for a commentary (i.e. 
`Genesis` is a base text for `Ramban on Genesis`)", - "type": "boolean" - }, "isSource": { "type": "boolean" }, @@ -9629,7 +9570,6 @@ "hasManuallyWrappedRefs": "", "actualLanguage": "he", "languageFamilyName": "hebrew", - "isBaseText": true, "isSource": true, "isPrimary": true, "direction": "rtl", diff --git a/e2e-tests/tests/search.spec.ts b/e2e-tests/tests/search.spec.ts index 3ea65428d6..692e27fabf 100644 --- a/e2e-tests/tests/search.spec.ts +++ b/e2e-tests/tests/search.spec.ts @@ -5,7 +5,7 @@ test('Search auto complete', async ({ context }) => { const page = await goToPageWithLang(context, '/'); await page.getByPlaceholder('Search').fill('אהבה'); await page.waitForSelector('text=אהבה', { state: 'visible' }); - await page.getByRole('option', { name: 'אהבה', exact: true }).click(); + await page.getByText('אהבה', { exact: true }).click(); await expect(page).toHaveTitle(/Love/); }); diff --git a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml index 771117bc4b..d59daf235f 100644 --- a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml +++ b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml @@ -47,6 +47,7 @@ data: ADMINS = ( ('Sefaria Developers', 'dev@sefaria.org'), ) + ADMIN_PATH = os.getenv("SEFARIA_ADMIN_PATH") MANAGERS = ADMINS diff --git a/reader/views.py b/reader/views.py index e26dd0ee0a..4062f2fdbe 100644 --- a/reader/views.py +++ b/reader/views.py @@ -78,7 +78,7 @@ from sefaria.utils.user import delete_user_account from django.core.mail import EmailMultiAlternatives from babel import Locale -from sefaria.helper.topic import update_topic, update_topic_titles +from sefaria.helper.topic import update_topic from sefaria.helper.category import update_order_of_category_children, check_term if USE_VARNISH: @@ -114,7 +114,7 @@ if ENABLE_LINKER: logger.info("Initializing Linker") - library.build_ref_resolver() + library.build_linker('he') if server_coordinator: server_coordinator.connect() @@ -1242,6 +1242,7 @@ def edit_text(request, ref=None, lang=None, version=None): }) @ensure_csrf_cookie +@staff_member_required @sanitize_get_params def edit_text_info(request, title=None, new_title=None): """ @@ -2167,7 +2168,7 @@ def related_api(request, tref): "sheets": get_sheets_for_ref(tref), "notes": [], # get_notes(oref, public=True) # Hiding public notes for now "webpages": get_webpages_for_ref(tref), - "topics": get_topics_for_ref(tref, annotate=True), + "topics": get_topics_for_ref(tref, request.interfaceLang, annotate=True), "manuscripts": ManuscriptPageSet.load_set_for_client(tref), "media": get_media_for_ref(tref), "guides": GuideSet.load_set_for_client(tref) @@ -2437,9 +2438,9 @@ def _internal_do_post(request, update, cat, uid, **kwargs): else: return jsonResponse({"error": "Only Sefaria Moderators can add or delete categories."}) - j = request.POST.get("json") + j = request.body if not j: - return jsonResponse({"error": "Missing 'json' parameter in post data."}) + return jsonResponse({"error": "Missing data in POST request."}) j = json.loads(j) update = int(request.GET.get("update", False)) new_category = Category().load({"path": j["path"]}) @@ -3057,7 +3058,7 @@ def topic_page(request, topic, test_version=None): "en": topic_obj.get_primary_title('en'), "he": topic_obj.get_primary_title('he') }, - "topicData": _topic_page_data(topic), + "topicData": _topic_page_data(topic, request.interfaceLang), } if test_version is not None: @@ -3108,7 +3109,9 @@ 
def add_new_topic_api(request): data = json.loads(request.POST["json"]) isTopLevelDisplay = data["category"] == Topic.ROOT t = Topic({'slug': "", "isTopLevelDisplay": isTopLevelDisplay, "data_source": "sefaria", "numSources": 0}) - update_topic_titles(t, **data) + titles = data.get('titles') + if titles: + t.set_titles(titles) t.set_slug_to_primary_title() if not isTopLevelDisplay: # not Top Level so create an IntraTopicLink to category new_link = IntraTopicLink({"toTopic": data["category"], "fromTopic": t.slug, "linkType": "displays-under", "dataSource": "sefaria"}) @@ -3167,7 +3170,7 @@ def topics_api(request, topic, v2=False): annotate_time_period = bool(int(request.GET.get("annotate_time_period", False))) with_indexes = bool(int(request.GET.get("with_indexes", False))) ref_link_type_filters = set(filter(lambda x: len(x) > 0, request.GET.get("ref_link_type_filters", "").split("|"))) - response = get_topic(v2, topic, with_html=with_html, with_links=with_links, annotate_links=annotate_links, with_refs=with_refs, group_related=group_related, annotate_time_period=annotate_time_period, ref_link_type_filters=ref_link_type_filters, with_indexes=with_indexes) + response = get_topic(v2, topic, request.interfaceLang, with_html=with_html, with_links=with_links, annotate_links=annotate_links, with_refs=with_refs, group_related=group_related, annotate_time_period=annotate_time_period, ref_link_type_filters=ref_link_type_filters, with_indexes=with_indexes) return jsonResponse(response, callback=request.GET.get("callback", None)) elif request.method == "POST": if not request.user.is_staff: @@ -3253,7 +3256,7 @@ def topic_ref_api(request, tref): annotate = bool(int(data.get("annotate", False))) if request.method == "GET": - response = get_topics_for_ref(tref, annotate) + response = get_topics_for_ref(tref, request.interfaceLang, annotate) return jsonResponse(response, callback=request.GET.get("callback", None)) else: if not request.user.is_staff: @@ -3270,7 +3273,7 @@ def topic_ref_api(request, tref): @staff_member_required def reorder_sources(request): - sources = json.loads(request.POST["json"]).get("sources", []) + sources = json.loads(request.body).get("sources", []) slug = request.GET.get('topic') lang = 'en' if request.GET.get('lang') == 'english' else 'he' return jsonResponse(update_order_of_topic_sources(slug, sources, request.user.id, lang=lang)) @@ -3279,8 +3282,8 @@ def reorder_sources(request): 'authors': ['popular-writing-of'], } -def _topic_page_data(topic): - _topic_data(topic=topic, annotate_time_period=True) +def _topic_page_data(topic, lang): + _topic_data(topic=topic, lang=lang, annotate_time_period=True) def _topic_data(**kwargs): diff --git a/requirements.txt b/requirements.txt index 5bf10c0754..c1219d9635 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,18 @@ Appium-Python-Client==1.2.0 -apscheduler==3.6. +Cerberus +PyJWT==1.7.1 # pinned b/c current version 2.0.0 breaks simplejwt. 
waiting for 2.0.1 +apscheduler==3.6.* +babel bleach==1.4.2 boto3==1.16.6 bs4==0.0.1 +celery[redis] convertdate==2.2.2 cython==0.29.14 dateutils==0.6.12 datrie==0.8.2 deepdiff==3.3.0 diff_match_patch==20200713 -django_mobile==0.7.0 django-anymail==7.2.* django-debug-toolbar==2.2 # not used in prod django-recaptcha==2.0.6 @@ -18,18 +21,22 @@ django-structlog==1.6.2 django-user-agents==0.4.0 django-webpack-loader==1.4.1 django==1.11.* +django_mobile==0.7.0 djangorestframework @ https://github.com/encode/django-rest-framework/archive/3.11.1.tar.gz djangorestframework_simplejwt==3.3.0 -PyJWT==1.7.1 # pinned b/c current version 2.0.0 breaks simplejwt. waiting for 2.0.1 +dnspython~=2.5.0 elasticsearch==8.8.2 -git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl -git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface geojson==2.5.0 geopy==2.3.0 gevent==20.12.0; sys_platform != 'darwin' +git+https://github.com/Sefaria/LLM@v1.0.3#egg=sefaria_llm_interface&subdirectory=app/llm_interface +git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl google-api-python-client==1.12.5 +google-auth-oauthlib==0.4.2 +google-auth==1.24.0 google-cloud-logging==1.15.1 google-cloud-storage==1.32.0 +google-re2 gunicorn==20.0.4 html5lib==0.9999999 httplib2==0.18.1 @@ -38,37 +45,31 @@ jedi==0.18.1 # Ipython was previosuly pinned at 7.18 because Jedi 0.18 broke it. jsonpickle==1.4.1 lxml==4.6.1 mailchimp==2.0.9 -google-auth==1.24.0 -google-auth-oauthlib==0.4.2 p929==0.6.1 pathos==0.2.6 -pillow==8.0.1; sys_platform == 'linux' pillow==10.0.1; sys_platform != 'linux' -psycopg2==2.8.6 +pillow==8.0.1; sys_platform == 'linux' +psycopg2==2.8.6 #for dev: psycopg2-binary==2.8.6 py2-py3-django-email-as-username==1.7.1 pymongo==3.12.* pytest==6.1.1 +python-bidi pytz pyyaml==6.0.1 rauth==0.7.3 redis==3.5.3 regex==2020.10.23 +requests roman==3.3 selenium==3.141.0 +sentry-sdk==1.26.0 tqdm==4.51.0 ua-parser==0.10.0 undecorated==0.3.0 unicodecsv==0.14.1 unidecode==1.1.1 user-agents==2.2.0 -sentry-sdk==1.26.0 -babel -python-bidi -requests -Cerberus -celery[redis] -google-re2 -dnspython~=2.5.0 + #opentelemetry-distro #opentelemetry-exporter-otlp diff --git a/scripts/catch_refs_yerushalmi_translation.py b/scripts/catch_refs_yerushalmi_translation.py index da277a76da..a182aaaeac 100644 --- a/scripts/catch_refs_yerushalmi_translation.py +++ b/scripts/catch_refs_yerushalmi_translation.py @@ -150,7 +150,7 @@ def catch_refs_in_title(self, title: str): version = Version().load({"title": title, "language": self.lang, "versionTitle": self.vtitle}) version.walk_thru_contents(self.collect_resolver_input) context_refs, input_text = zip(*self.resolver_input) - all_resolved_refs = self.resolver.bulk_resolve_refs(self.lang, context_refs, input_text, with_failures=True, verbose=True) + all_resolved_refs = self.resolver.bulk_resolve(self.lang, context_refs, input_text, with_failures=True, verbose=True) self.resolved_refs_by_context = {} for context_ref, resolved_refs in zip(context_refs, all_resolved_refs): self.resolved_refs_by_context[context_ref.normal()] = resolved_refs @@ -168,7 +168,7 @@ def catch_refs_in_ref(self, st: str, en_tref: str, he_tref: str, version: Versio resolved_refs = self.post_process_resolved_refs(resolved_refs, context_ref) norm_indices = [r.raw_ref.char_indices for r in resolved_refs] mapping = self.normalizer.get_mapping_after_normalization(st) - orig_indices = 
self.normalizer.convert_normalized_indices_to_unnormalized_indices(norm_indices, mapping) + orig_indices = self.normalizer.norm_to_unnorm_indices_with_mapping(norm_indices, mapping, ) for resolved_ref, (start_char, end_char) in zip(resolved_refs, orig_indices): before_context, after_context = get_window_around_match(start_char, end_char, st) @@ -292,7 +292,7 @@ def post_process_resolved_refs(self, resolved_refs: List[ResolvedRef], context_r if span_end is not None: subspan_slice = slice(0, span_end) subspan = raw_ref.subspan(subspan_slice) - new_raw_ref = RawRef('en', raw_ref.raw_ref_parts[subspan_slice], subspan) + new_raw_ref = RawRef(subspan, 'en', raw_ref.raw_ref_parts[subspan_slice]) temp_resolved_refs = self.resolver.resolve_raw_ref('en', context_ref, new_raw_ref) for temp_resolved_ref in temp_resolved_refs: temp_ref = temp_resolved_ref.ref diff --git a/scripts/dicta_library_linker.py b/scripts/dicta_library_linker.py index e4470d4e83..0304c65c5b 100644 --- a/scripts/dicta_library_linker.py +++ b/scripts/dicta_library_linker.py @@ -17,7 +17,7 @@ def run_on_page(path, tref): text = get_text(jin) #text = """וכן כתב הרמב"ם ז"ל בהלכות טוען ונטען פ"ב""" ref_resolver = library.get_ref_resolver() - resolved = ref_resolver.bulk_resolve_refs("he", [None], [text], with_failures=True) + resolved = ref_resolver.bulk_resolve("he", [None], [text], with_failures=True) make_html([resolved], [[text]], f"../data/private/linker_results/{tref}.html") diff --git a/scripts/parse_refs_in_gilyon_hashas.py b/scripts/parse_refs_in_gilyon_hashas.py index 11af3d0da2..8113a89d82 100644 --- a/scripts/parse_refs_in_gilyon_hashas.py +++ b/scripts/parse_refs_in_gilyon_hashas.py @@ -22,7 +22,7 @@ def collect_input(s: str, en_tref: str, he_tref: str, v: Version) -> None: version = VersionSet({"title": title, "language": "he"}).array()[0] version.walk_thru_contents(collect_input) - resolved = resolver.bulk_resolve_refs('he', input_context_refs, input_text, with_failures=True, verbose=True) + resolved = resolver.bulk_resolve('he', input_context_refs, input_text, with_failures=True, verbose=True) return resolved, input_text, input_context_refs @@ -64,7 +64,7 @@ def parse_string(resolver): זהו ג"כ הוי כמו חזקה). וכן מצאתי להדיא שכ"כ בתורת השלמים ר"ס קפ"ה ע"ש. וראיתי להנו"ב (מ"ק ביו"ד סי' נ"ז) שהקשה להרשב"א דס"ל דבס"ס נמי היכא דאיכא לברורי מבררינן דא"כ אמאי סמכינן על סתם כלים של נכרים אינן ב"י דהוא מטעם ס"ס אף דאפשר לברר ע"י קפילא וכמ"ש הרא"ש בפ"ב דע"ז דמשום ס"ס לא הטריחוהו להטעימו לקפילא. והרשב"א גופי' פסק כן דסומכין ע"ז דסתם כלים של נכרים אינן ב"י, ותירץ הנו"ב דכיון דאין אנו דנים על הכלי דאפי' ודאי אינו ב"י אסור לבשל בו לכתחלה רק אנו דנים על התבשיל והתבשיל יש לו חזקת היתר ולכן בצירוף חזקת היתר עם הס"ס א"צ לעמוד על הבירור בזה עכ"ד הנו"ב. ואף שיש לפקפק על דבריו במה דפשיטא ליה שהתבשיל יש לו חזקת היתר. די"ל דאיתרע חזקתו כשבישלו בכלי שהוא ספק ב"י. 
ובכעין זה נחלקו הט"ז והנקה"כ ביו"ד ר"ס ק"ה לענין ספק כבוש, דהט"ז ס"ל שם דיש לאוקמי ההיתר בחזקת כשרות, והנה"כ ושאר """ context_refs = [Ref("Job 1")] - resolved = resolver.bulk_resolve_refs('he', context_refs, [s], with_failures=True) + resolved = resolver.bulk_resolve('he', context_refs, [s], with_failures=True) make_html(resolved, "../data/toratchesed-018__output.html", 'he') save_resolved_refs(zip(context_refs, resolved), 'toratchesed-018__output.csv') diff --git a/scripts/scheduled/parse_rambi_webpages.py b/scripts/scheduled/parse_rambi_webpages.py index 63f9c5dab0..d01ca88203 100644 --- a/scripts/scheduled/parse_rambi_webpages.py +++ b/scripts/scheduled/parse_rambi_webpages.py @@ -60,10 +60,10 @@ def get_refs_from_string(string): lang = 'he' if len(re.findall('[א-ת]', string)) > len(string) / 2 else 'en' if lang == 'en': string = translliterate_russian_to_latin(string) - ref_resolver = library.get_ref_resolver() + linker = library.get_linker('he') if lang == 'he': # remove this line when linker v3 is availabe in English - refs = ref_resolver.bulk_resolve_refs(lang, [None], [string]) - refs = {y.ref for x in refs for y in x if type(y) != AmbiguousResolvedRef} + doc = linker.link(string, type_filter='citation') + refs = {y.ref for y in doc.resolved_refs if not y.is_ambiguous} else: # remove else statement (with its content) when linker v3 is availabe in English refs = set() library.apply_action_for_all_refs_in_string(re.sub('[\(\)]', '', string), lambda x, y: refs.add(x), 'en', citing_only=True) diff --git a/sefaria/helper/linker.py b/sefaria/helper/linker.py index ceb5fc15a0..0d0d57d57f 100644 --- a/sefaria/helper/linker.py +++ b/sefaria/helper/linker.py @@ -3,7 +3,7 @@ import spacy import structlog from sefaria.model.linker.ref_part import TermContext, RefPartType -from sefaria.model.linker.ref_resolver import ResolvedRef, AmbiguousResolvedRef +from sefaria.model.linker.ref_resolver import PossiblyAmbigResolvedRef from sefaria.model import text, library from sefaria.model.webpage import WebPage from sefaria.system.cache import django_cache @@ -119,14 +119,16 @@ def _make_find_refs_response_with_cache(request_text: _FindRefsText, options: _F def _make_find_refs_response_linker_v3(request_text: _FindRefsText, options: _FindRefsTextOptions) -> dict: - resolver = library.get_ref_resolver() - resolved_title = resolver.bulk_resolve_refs(request_text.lang, [None], [request_text.title]) - context_ref = resolved_title[0][0].ref if (len(resolved_title[0]) == 1 and not resolved_title[0][0].is_ambiguous) else None - resolved_body = resolver.bulk_resolve_refs(request_text.lang, [context_ref], [request_text.body], with_failures=True) + linker = library.get_linker(request_text.lang) + title_doc = linker.link(request_text.title, type_filter='citation') + context_ref = None + if len(title_doc.resolved_refs) == 1 and not title_doc.resolved_refs[0].is_ambiguous: + context_ref = title_doc.resolved_refs[0].ref + body_doc = linker.link_by_paragraph(request_text.body, context_ref, with_failures=True, type_filter='citation') response = { - "title": _make_find_refs_response_inner(resolved_title, options), - "body": _make_find_refs_response_inner(resolved_body, options), + "title": _make_find_refs_response_inner(title_doc.resolved_refs, options), + "body": _make_find_refs_response_inner(body_doc.resolved_refs, options), } return response @@ -177,14 +179,13 @@ def _get_trefs_from_response(response): return trefs -def _make_find_refs_response_inner(resolved: List[List[Union[AmbiguousResolvedRef, ResolvedRef]]], 
options: _FindRefsTextOptions): +def _make_find_refs_response_inner(resolved_ref_list: List[PossiblyAmbigResolvedRef], options: _FindRefsTextOptions): ref_results = [] ref_data = {} debug_data = [] - resolved_ref_list = [resolved_ref for inner_resolved in resolved for resolved_ref in inner_resolved] for resolved_ref in resolved_ref_list: resolved_refs = resolved_ref.resolved_raw_refs if resolved_ref.is_ambiguous else [resolved_ref] - start_char, end_char = resolved_ref.raw_ref.char_indices + start_char, end_char = resolved_ref.raw_entity.char_indices text = resolved_ref.pretty_text link_failed = resolved_refs[0].ref is None if not link_failed and resolved_refs[0].ref.is_book_level(): continue @@ -249,12 +250,12 @@ def _get_ref_text_by_lang_for_linker(oref: text.Ref, lang: str, options: _FindRe return as_array[:options.max_segments or None], was_truncated -def _make_debug_response_for_linker(resolved_ref: ResolvedRef) -> dict: +def _make_debug_response_for_linker(resolved_ref: PossiblyAmbigResolvedRef) -> dict: debug_data = { - "orig_part_strs": [p.text for p in resolved_ref.raw_ref.raw_ref_parts], - "orig_part_types": [p.type.name for p in resolved_ref.raw_ref.raw_ref_parts], - "final_part_strs": [p.text for p in resolved_ref.raw_ref.parts_to_match], - "final_part_types": [p.type.name for p in resolved_ref.raw_ref.parts_to_match], + "orig_part_strs": [p.text for p in resolved_ref.raw_entity.raw_ref_parts], + "orig_part_types": [p.type.name for p in resolved_ref.raw_entity.raw_ref_parts], + "final_part_strs": [p.text for p in resolved_ref.raw_entity.parts_to_match], + "final_part_types": [p.type.name for p in resolved_ref.raw_entity.parts_to_match], "resolved_part_strs": [p.term.slug if isinstance(p, TermContext) else p.text for p in resolved_ref.resolved_parts], "resolved_part_types": [p.type.name for p in resolved_ref.resolved_parts], "resolved_part_classes": [p.__class__.__name__ for p in resolved_ref.resolved_parts], @@ -262,7 +263,7 @@ def _make_debug_response_for_linker(resolved_ref: ResolvedRef) -> dict: "context_type": resolved_ref.context_type.name if resolved_ref.context_type else None, } if RefPartType.RANGE.name in debug_data['final_part_types']: - range_part = next((p for p in resolved_ref.raw_ref.parts_to_match if p.type == RefPartType.RANGE), None) + range_part = next((p for p in resolved_ref.raw_entity.parts_to_match if p.type == RefPartType.RANGE), None) debug_data.update({ 'input_range_sections': [p.text for p in range_part.sections], 'input_range_to_sections': [p.text for p in range_part.toSections] diff --git a/sefaria/helper/linker_index_converter.py b/sefaria/helper/linker_index_converter.py index 3a795b45c2..3eddae3232 100644 --- a/sefaria/helper/linker_index_converter.py +++ b/sefaria/helper/linker_index_converter.py @@ -55,10 +55,27 @@ def create_term(self, **kwargs): term.title_group.add_title(kwargs.get(lang), lang, primary=True) for title in kwargs.get(f"alt_{lang}", []): term.title_group.add_title(title, lang) + + if kwargs.get('delete_if_existing'): + slug = NonUniqueTerm.normalize_slug(term.slug) + existing_term = NonUniqueTerm.init(slug) + if existing_term: + existing_term.delete() term.save() self.context_and_primary_title_to_term[(kwargs.get('context'), term.get_primary_title('en'))] = term return term + def get_or_create_term_for_titled_obj(self, obj, context=None, new_alt_titles=None, title_modifier=None, title_adder=None): + term = self.get_existing_term_for_titled_obj(obj, new_alt_titles, title_modifier, title_adder) + if not term: + return 
self.create_term_from_titled_obj(obj, context, new_alt_titles, title_modifier, title_adder) + return term + + def get_existing_term_for_titled_obj(self, obj, new_alt_titles=None, title_modifier=None, title_adder=None): + en_title, he_title, alt_en_titles, alt_he_titles = self._make_titles_for_term(obj, new_alt_titles, + title_modifier, title_adder) + return NonUniqueTerm().load({"titles.text": {"$all": [en_title, he_title] + alt_en_titles + alt_he_titles}}) + def create_term_from_titled_obj(self, obj, context=None, new_alt_titles=None, title_modifier=None, title_adder=None): """ Create a NonUniqueTerm from 'titled object' (see explanation of `obj` param) @@ -97,6 +114,15 @@ def title_adder(lang, title): ... """ + en_title, he_title, alt_en_titles, alt_he_titles = self._make_titles_for_term(obj, new_alt_titles, + title_modifier, title_adder) + term = self.create_term(en=en_title, he=he_title, context=context, alt_en=alt_en_titles, alt_he=alt_he_titles) + if isinstance(obj, Term): + self.old_term_map[obj.name] = term + return term + + @staticmethod + def _make_titles_for_term(obj, new_alt_titles=None, title_modifier=None, title_adder=None): new_alt_titles = new_alt_titles or [] title_group = obj if isinstance(obj, TitleGroup) else obj.title_group en_title = title_group.primary_title('en') @@ -122,10 +148,7 @@ def title_adder(lang, title): # make unique alt_en_titles = list(set(alt_en_titles)) alt_he_titles = list(set(alt_he_titles)) - term = self.create_term(en=en_title, he=he_title, context=context, alt_en=alt_en_titles, alt_he=alt_he_titles) - if isinstance(obj, Term): - self.old_term_map[obj.name] = term - return term + return en_title, he_title, alt_en_titles, alt_he_titles class LinkerCategoryConverter: @@ -369,6 +392,18 @@ def _update_lengths(self): outer_shape = base_outer_shape self.index.nodes.lengths = [outer_shape] + ac[1:] + @staticmethod + def get_all_alt_struct_nodes(index): + def alt_struct_nodes_helper(node, nodes): + nodes.append(node) + for child in node.children: + alt_struct_nodes_helper(child, nodes) + + nodes = [] + for node in index.get_alt_struct_roots(): + alt_struct_nodes_helper(node, nodes) + return nodes + def convert(self): if self.get_alt_structs: alt_struct_dict = self.get_alt_structs(self.index) @@ -376,7 +411,7 @@ def convert(self): for name, root in alt_struct_dict.items(): self.index.set_alt_structure(name, root) self._traverse_nodes(self.index.nodes, self.node_visitor, is_alt_node=False) - alt_nodes = self.index.get_alt_struct_leaves() + alt_nodes = self.get_all_alt_struct_nodes(self.index) for inode, node in enumerate(alt_nodes): self.node_visitor(node, 1, inode, len(alt_nodes), True) self._update_lengths() # update lengths for good measure @@ -419,4 +454,7 @@ def node_visitor(self, node, depth, isibling, num_siblings, is_alt_node): if other_fields_dict is not None: for key, val in other_fields_dict.items(): if val is None: continue - setattr(node, key, val) + if val == "DELETE!": + delattr(node, key) + else: + setattr(node, key, val) diff --git a/sefaria/helper/normalization.py b/sefaria/helper/normalization.py index 9969022838..c69bc8487a 100644 --- a/sefaria/helper/normalization.py +++ b/sefaria/helper/normalization.py @@ -85,6 +85,9 @@ def remove_subsets_reducer(curr_text_to_remove: list, next: tuple) -> list: def get_mapping_after_normalization(self, text, removal_list=None, reverse=False, **kwargs): """ + Prefer norm_to_unnorm_indices() over this function since the former is simpler. 
+ Use this function when you need more control over the mapping outputs. + It also can be useful to store the mapping and reuse it as an optimization. text - unnormalized text removal_list - instead of passing `find_text_to_remove`, you can pass an already calculated list of tuples. should be in same format as return value of find_text_to_remove reverse - bool. If True, then will return mapping from unnormalized string to normalized string @@ -97,28 +100,38 @@ def get_mapping_after_normalization(self, text, removal_list=None, reverse=False meaning by the 2nd index, 5 chars have been removed then if you have a range (0,3) in the normalized string "abc" you will know that maps to (0, 8) in the original string """ - if removal_list is None: - removal_list = self.find_text_to_remove(text, **kwargs) + removal_list = removal_list or self.find_text_to_remove(text, **kwargs) total_removed = 0 removal_map = {} - for removal, subst in removal_list: - try: - start, end = removal - except TypeError: - # must be match object - start, end = removal.start(), removal.end() + subst_end_indexes = set() + for (start, end), subst in removal_list: normalized_text_index = start if reverse else (start + min(len(subst), end-start) - total_removed) curr_removed = end - start - len(subst) - if curr_removed > 0: + if curr_removed != 0: total_removed += curr_removed removal_map[normalized_text_index] = total_removed - return removal_map + if len(subst) > 0: + subst_end_indexes.add(normalized_text_index + 1) + return removal_map, subst_end_indexes + + def norm_to_unnorm_indices(self, text, normalized_indices, removal_list=None, reverse=False, **kwargs): + """ + text - unnormalized text + normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 + reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() + """ + removal_map, subst_end_indices = self.get_mapping_after_normalization(text, removal_list, reverse, **kwargs) + return self.norm_to_unnorm_indices_with_mapping(normalized_indices, removal_map, subst_end_indices, reverse) @staticmethod - def convert_normalized_indices_to_unnormalized_indices(normalized_indices, removal_map, reverse=False): + def norm_to_unnorm_indices_with_mapping(normalized_indices, removal_map, subst_end_indices, reverse=False): """ + Prefer norm_to_unnorm_indices() over this function since the former is simpler. + Use this function when you need more control over the mapping inputs. + It also can be useful to store the mapping and reuse it as an optimization. normalized_indices - list of tuples where each tuple is (x, y) x being start index, y is end index + 1 - removal_map - return value of get_mapping_after_normalization() + removal_map - first return value of get_mapping_after_normalization() + subst_end_indices - second return value from get_mapping_after_normalization() reverse - if True, normalized_indices are actually unnormalized indices and removal_map was calculated using reverse=True in get_mapping_after_normalization() """ removal_keys = sorted(removal_map.keys()) @@ -126,8 +139,8 @@ def convert_normalized_indices_to_unnormalized_indices(normalized_indices, remov sign = -1 if reverse else 1 for start, end in normalized_indices: unnorm_start_index = bisect_right(removal_keys, start) - 1 - # special case if range is zero-length. treat end as literal and not off-by-one. 
- bisect_end_index = end if end == start else (end - 1) + + bisect_end_index = end if (start == end or end in subst_end_indices) else end - 1 unnorm_end_index = bisect_right(removal_keys, bisect_end_index) - 1 unnorm_start = start if unnorm_start_index < 0 else start + (sign * removal_map[removal_keys[unnorm_start_index]]) @@ -267,8 +280,9 @@ def find_text_to_remove(self, s, **kwargs): text_to_remove_inds, text_to_remove_repls = [], [] else: text_to_remove_inds, text_to_remove_repls = zip(*curr_text_to_remove) - for mapping in reversed(mappings): - text_to_remove_inds = step.convert_normalized_indices_to_unnormalized_indices(text_to_remove_inds, mapping) + for mapping, subst_end_indices in reversed(mappings): + text_to_remove_inds = step.norm_to_unnorm_indices_with_mapping(text_to_remove_inds, mapping, + subst_end_indices) curr_text_to_remove = list(zip(text_to_remove_inds, text_to_remove_repls)) # merge any overlapping ranges @@ -295,7 +309,14 @@ def merge_removal_inds(*all_removal_inds): else: # some sort of overlap curr_merged_inds = (last_inds[0], max(last_inds[1], curr_inds[1])) - curr_merged_repl = last_repl[:curr_inds[0]-last_inds[0]] + curr_repl + last_repl[(curr_inds[1]+1)-last_inds[0]:] + if curr_inds[0] == last_inds[0] and last_inds[1] <= curr_inds[1]: + # last is subset. use curr_repl + curr_merged_repl = curr_repl + elif curr_inds[0] >= last_inds[0] and curr_inds[1] <= last_inds[1]: + # curr is subset. use last_repl + curr_merged_repl = last_repl + else: + raise Exception(f"partial overlap. not sure how to reconcile. curr_inds: {curr_inds}. last_inds: {last_inds}") merged_removal_inds[-1] = (curr_merged_inds, curr_merged_repl) return merged_removal_inds @@ -431,7 +452,6 @@ def char_indices_from_word_indices(input_string, word_ranges, split_regex=None): count += len(word) end = count word_indices.append((start, end)) - removal_map = regex_normalizer.get_mapping_after_normalization(input_string) normalized_char_indices = [] for i, words in enumerate(word_ranges): first_word, last_word = [w if w < len(word_indices) else -1 for w in words] @@ -441,7 +461,7 @@ def char_indices_from_word_indices(input_string, word_ranges, split_regex=None): word_indices[last_word][1] if last_word >= 0 else -1 ) ) - return regex_normalizer.convert_normalized_indices_to_unnormalized_indices(normalized_char_indices, removal_map) + return regex_normalizer.norm_to_unnorm_indices(input_string, normalized_char_indices) @lru_cache(maxsize=32) @@ -459,9 +479,8 @@ def word_index_from_char_index(full_string, char_index, split_regex=r'\s+'): def sanitized_words_to_unsanitized_words(input_string, sanitized_string, sanitization_method, sanitized_word_ranges): normalizer = FunctionNormalizer(sanitization_method) - removal_map = normalizer.get_mapping_after_normalization(input_string) sanitized_char_ranges = char_indices_from_word_indices(sanitized_string, sanitized_word_ranges) - unsanitzied_char_ranges = normalizer.convert_normalized_indices_to_unnormalized_indices(sanitized_char_ranges, removal_map) + unsanitzied_char_ranges = normalizer.norm_to_unnorm_indices(input_string, sanitized_char_ranges) # for char_range in unsanitied_char_ranges: # word_range = tuple(word_index_from_char_index(input_string, i) for i in char_range) # stuff.append(word_range) diff --git a/sefaria/helper/tests/linker_test.py b/sefaria/helper/tests/linker_test.py index 94835d2a09..c999d86a32 100644 --- a/sefaria/helper/tests/linker_test.py +++ b/sefaria/helper/tests/linker_test.py @@ -124,10 +124,8 @@ def mock_webpage() -> WebPage: class 
TestFindRefsHelperClasses: - @patch('sefaria.utils.hebrew.is_hebrew', return_value=False) - def test_find_refs_text(self, mock_is_hebrew: Mock): - find_refs_text = linker._FindRefsText('title', 'body') - mock_is_hebrew.assert_called_once_with('body') + def test_find_refs_text(self): + find_refs_text = linker._FindRefsText('title', 'body', 'en') assert find_refs_text.lang == 'en' def test_find_refs_text_options(self): @@ -194,16 +192,16 @@ def test_add_webpage_hit_for_url_no_url(self, mock_webpage: Mock): class TestFindRefsResponseLinkerV3: @pytest.fixture - def mock_get_ref_resolver(self, spacy_model: spacy.Language): + def mock_get_linker(self, spacy_model: spacy.Language): from sefaria.model.text import library - with patch.object(library, 'get_ref_resolver') as mock_get_ref_resolver: - mock_ref_resolver = Mock() - mock_ref_resolver._raw_ref_model_by_lang = {"en": spacy_model} - mock_get_ref_resolver.return_value = mock_ref_resolver - mock_ref_resolver.bulk_resolve_refs.return_value = [[]] - yield mock_get_ref_resolver - - def test_make_find_refs_response_linker_v3(self, mock_get_ref_resolver: WSGIRequest, + from sefaria.model.linker.linker import LinkedDoc + with patch.object(library, 'get_linker') as mock_get_linker: + mock_linker = Mock() + mock_get_linker.return_value = mock_linker + mock_linker.link.return_value = LinkedDoc('', [], []) + yield mock_get_linker + + def test_make_find_refs_response_linker_v3(self, mock_get_linker: WSGIRequest, mock_find_refs_text: linker._FindRefsText, mock_find_refs_options: linker._FindRefsTextOptions): response = linker._make_find_refs_response_linker_v3(mock_find_refs_text, mock_find_refs_options) @@ -214,7 +212,7 @@ def test_make_find_refs_response_linker_v3(self, mock_get_ref_resolver: WSGIRequ class TestFindRefsResponseInner: @pytest.fixture def mock_resolved(self): - return [[]] + return [] def test_make_find_refs_response_inner(self, mock_resolved: Mock, mock_find_refs_options: linker._FindRefsTextOptions): response = linker._make_find_refs_response_inner(mock_resolved, mock_find_refs_options) diff --git a/sefaria/helper/tests/normalization_tests.py b/sefaria/helper/tests/normalization_tests.py index eaff8ff116..9fc495a3f0 100644 --- a/sefaria/helper/tests/normalization_tests.py +++ b/sefaria/helper/tests/normalization_tests.py @@ -1,3 +1,4 @@ +import pytest import django django.setup() from sefaria.helper.normalization import * @@ -73,6 +74,19 @@ def test_complicated_normalizer_composer(): assert repl0 == ' ' +@pytest.mark.parametrize(('unnorm', 'norm', 'normalizer_steps', 'test_word'), [ + [" test", " test", ['html', 'double-space'], 'test'], + ["\n\n\nThe rest of Chapter 1.\n \n", " The rest of Chapter 1. ", ['unidecode', 'html', 'double-space'], 'Chapter 1'], +]) +def test_mapping(unnorm, norm, normalizer_steps, test_word): + nsc = NormalizerComposer(normalizer_steps) + assert nsc.normalize(unnorm) == norm + start_norm_ind = norm.index(test_word) + norm_inds = (start_norm_ind, start_norm_ind+len(test_word)) + unnorm_inds = nsc.norm_to_unnorm_indices(unnorm, [norm_inds])[0] + assert unnorm[slice(*unnorm_inds)] == norm[slice(*norm_inds)] + + def test_html_normalizer_for_empty_prefix(): text = """It is written241K. 17:1. 
Elijah the Tisbite""" normalizer = NormalizerComposer(['html']) @@ -82,8 +96,7 @@ def test_html_normalizer_for_empty_prefix(): ne_start = norm_text.index(ne) ne_norm_prefix_inds = (ne_start, ne_start) assert norm_text[ne_norm_prefix_inds[0]:ne_norm_prefix_inds[0]+len(ne)] == ne - mapping = normalizer.get_mapping_after_normalization(text) - ne_inds = normalizer.convert_normalized_indices_to_unnormalized_indices([ne_norm_prefix_inds], mapping)[0] + ne_inds = normalizer.norm_to_unnorm_indices(text, [ne_norm_prefix_inds])[0] # actual test assert ne_inds[0] == ne_inds[1] assert text[ne_inds[0]:ne_inds[0]+len(ne)] == ne @@ -102,6 +115,7 @@ def test_nested_itag(): assert text[s:e] == """bullnestedThe.""" +@pytest.mark.xfail(reason="not clear we want to support char_indices_from_word_indices as it's unused") def test_two_steps_normalization(): test_string = ' This is a {{test}}' diff --git a/sefaria/helper/tests/topic_test.py b/sefaria/helper/tests/topic_test.py index 8d20d4f049..e5e5477e0d 100644 --- a/sefaria/helper/tests/topic_test.py +++ b/sefaria/helper/tests/topic_test.py @@ -67,6 +67,17 @@ def author_root(): yield {"topic": t, "link": l} t.delete() +@pytest.fixture(autouse=True, scope='module') +def some_topic(): + t = Topic({'slug': "abcd_test", "data_source": "sefaria", "numSources": 0}) + title = "title in English" + he_title = "כותרת בעברית" + t.add_primary_titles(title, he_title) + t.set_slug_to_primary_title() + t.save() + yield t + t.delete() + @pytest.fixture(autouse=True, scope='module') def actual_author(author_root): @@ -86,18 +97,24 @@ def actual_author(author_root): def test_title_and_desc(author_root, actual_author, root_with_self_link, child_of_root_with_self_link, grandchild_of_root_with_self_link): for count, t in enumerate([author_root, actual_author, root_with_self_link, child_of_root_with_self_link, grandchild_of_root_with_self_link]): - new_values = {"title": f"new title {count+1}", - "altTitles": {"en": [f"New Alt title {count+1}"], "he": [f"New He Alt Title {count+1}"]}, - "heTitle": f"new hebrew title {count+1}", "description": {"en": f"new desc", "he": "new hebrew desc"}} + en_primary_title = {"text": f"new title {count+1}", "primary": True, "lang": 'en'} + he_primary_title = {"lang": "he", "text": f"new hebrew title {count+1}", "primary": True} + + en_alt_title = {"lang": "en", "text": f"New Alt title {count+1}"} + he_alt_title = {"lang": "he", "text": f"New He Alt Title {count+1}"} + + new_values = {"titles": [en_primary_title, en_alt_title, he_alt_title, he_primary_title], + "description": {"en": f"new desc", "he": "new hebrew desc"}} topic.update_topic(t["topic"], **new_values) assert t["topic"].description == new_values["description"] - assert t["topic"].get_primary_title('he') == new_values['heTitle'] - assert t["topic"].get_titles('en') == [new_values["title"]]+new_values["altTitles"]['en'] + assert t["topic"].get_primary_title('he') == he_primary_title['text'] + assert t["topic"].get_titles('en') == [en_primary_title['text'], en_alt_title['text']] def test_author_root(author_root, actual_author): - new_values = {"category": "authors", "title": actual_author["topic"].get_primary_title('en'), - "heTitle": actual_author["topic"].get_primary_title('he'), - "birthPlace": "Kyoto, Japan", "birthYear": 1300} + new_values = {"category": "authors", "titles": [ + {'text': actual_author["topic"].get_primary_title('en'), "lang": 'en', 'primary': True}, + {"text": actual_author["topic"].get_primary_title('he'), "lang": 'he', 'primary': True}], + "birthPlace": "Kyoto, 
Japan", "birthYear": 1300} assert Place().load({'key': new_values["birthPlace"]}) is None topic.update_topic(actual_author["topic"], **new_values) assert Place().load({'key': new_values["birthPlace"]}) @@ -111,18 +128,19 @@ def test_change_categories_and_titles(author_root, root_with_self_link): orig_tree_from_root_with_self_link = library.get_topic_toc_json_recursive(root_with_self_link["topic"]) orig_trees = [orig_tree_from_normal_root, orig_tree_from_root_with_self_link] roots = [author_root["topic"], root_with_self_link["topic"]] - orig_titles = [roots[0].get_primary_title('en'), roots[1].get_primary_title('en')] - orig_he_titles = [roots[0].get_primary_title('he'), roots[1].get_primary_title('he')] + orig_titles = [{'text': roots[0].get_primary_title('en'), 'lang':'en', 'primary': True}, {'text': roots[1].get_primary_title('en'), 'lang':'en', 'primary': True}] + orig_he_titles = [{'text': roots[0].get_primary_title('he'), 'lang':'he', 'primary': True}, {'text': roots[1].get_primary_title('he'), 'lang':'he', 'primary': True}] for i, root in enumerate(roots): other_root = roots[1 - i] - topic.update_topic(root, title=f"fake new title {i+1}", heTitle=f"fake new he title {i+1}", category=other_root.slug) # move root to be child of other root + topic.update_topic(root, titles=[{'text': f"fake new title {i+1}", 'lang': 'he', 'primary': True}, + {'text': f"fake new he title {i+1}", 'lang': 'he', 'primary': True}], category=other_root.slug) # move root to be child of other root new_tree = library.get_topic_toc_json_recursive(other_root) assert new_tree != orig_trees[i] # assert that the changes in the tree have occurred - assert root.get_titles('en') != [orig_titles[i]] - assert root.get_titles('he') != [orig_he_titles[i]] - topic.update_topic(root, title=orig_titles[i], heTitle=orig_he_titles[i], category=Topic.ROOT) # move it back to the main menu - assert root.get_titles('en') == [orig_titles[i]] - assert root.get_titles('he') == [orig_he_titles[i]] + assert root.get_titles('en') != [orig_titles[i]['text']] + assert root.get_titles('he') != [orig_he_titles[i]['text']] + topic.update_topic(root, titles=[orig_titles[i], orig_he_titles[i]], category=Topic.ROOT) # move it back to the main menu + assert root.get_titles('en') == [orig_titles[i]['text']] + assert root.get_titles('he') == [orig_he_titles[i]['text']] final_tree_from_normal_root = library.get_topic_toc_json_recursive(roots[0]) @@ -178,3 +196,19 @@ def test_calculate_approved_review_state(current, requested, was_ai_generated, m ]) def test_get_merged_descriptions(current, requested, merged): assert topic._get_merged_descriptions(current, requested) == merged + + +def test_update_topic(some_topic): + topic.update_topic(some_topic, titles=[{"text": "Tamar", "lang": "en", "primary": True}, + {"text": "תמר", "lang": "he", "primary": True, "disambiguation": "יהודה"}]) + assert some_topic.titles == [{"text": "Tamar", "lang": "en", "primary": True}, + {"text": "תמר", "lang": "he", "primary": True, "disambiguation": "יהודה"}] + + topic.update_topic(some_topic, description={"en": "abcdefg"}) + assert some_topic.description == {"en": "abcdefg"} + + with pytest.raises(Exception): + topic.update_topic(some_topic, titles=[{"a": "Tamar", "b": "en"}, + {"c": "תמר", "lang": "d", "disambiguation": "יהודה"}]) + with pytest.raises(Exception): + topic.update_topic(some_topic, slug='abc') diff --git a/sefaria/helper/topic.py b/sefaria/helper/topic.py index e721a01072..d6f8aa92bc 100644 --- a/sefaria/helper/topic.py +++ b/sefaria/helper/topic.py @@ -3,7 
+3,7 @@ from pymongo import UpdateOne, InsertOne from typing import Optional, Union from collections import defaultdict -from functools import cmp_to_key +from functools import cmp_to_key, partial from sefaria.model import * from sefaria.model.place import process_topic_place_change from sefaria.system.exceptions import InputError @@ -16,7 +16,23 @@ from sefaria.helper.descriptions import create_era_link logger = structlog.get_logger(__name__) -def get_topic(v2, topic, with_html=True, with_links=True, annotate_links=True, with_refs=True, group_related=True, annotate_time_period=False, ref_link_type_filters=None, with_indexes=True): +def get_topic(v2, topic, lang, with_html=True, with_links=True, annotate_links=True, with_refs=True, group_related=True, annotate_time_period=False, ref_link_type_filters=None, with_indexes=True): + """ + Helper function for api/topics/ + TODO fill in rest of parameters + @param v2: + @param topic: slug of topic to get data for + @param lang: the language of the user to sort the ref links by + @param with_html: True if description should be returned with HTML. If false, HTML is stripped. + @param with_links: Should intra-topic links be returned. If true, return dict has a `links` key + @param annotate_links: + @param with_refs: + @param group_related: + @param annotate_time_period: + @param ref_link_type_filters: + @param with_indexes: + @return: + """ topic_obj = Topic.init(topic) if topic_obj is None: return {} @@ -45,7 +61,7 @@ def get_topic(v2, topic, with_html=True, with_links=True, annotate_links=True, w if with_links: response['links'] = group_links_by_type('intraTopic', intra_links, annotate_links, group_related) if with_refs: - ref_links = sort_and_group_similar_refs(ref_links) + ref_links = sort_and_group_similar_refs(ref_links, lang) if v2: ref_links = group_links_by_type('refTopic', ref_links, False, False) response['refs'] = ref_links @@ -169,8 +185,8 @@ def iterate_and_merge(new_ref_links, new_link, subset_ref_map, temp_subset_refs) new_ref_links[index] = merge_props_for_similar_refs(new_ref_links[index], new_link) return new_ref_links -def sort_and_group_similar_refs(ref_links): - ref_links.sort(key=cmp_to_key(sort_refs_by_relevance)) +def sort_and_group_similar_refs(ref_links, lang): + ref_links.sort(key=cmp_to_key(partial(sort_refs_by_relevance, lang=lang))) subset_ref_map = defaultdict(list) new_ref_links = [] for link in ref_links: @@ -233,15 +249,32 @@ def get_topic_by_parasha(parasha:str) -> Topic: return Topic().load({"parasha": parasha}) -def sort_refs_by_relevance(a, b): +def sort_refs_by_relevance(a, b, lang="english"): + """ + This function should mimic behavior of `refSort` in TopicPage.jsx. + It is a comparison function that takes two items from the list and returns the corresponding integer to indicate which should go first. To be used with `cmp_to_key`. + It considers the following criteria in order: + - If one object has an `order` key and another doesn't, the one with the `order` key comes first + - curatedPrimacy, higher comes first + - pagerank, higher comes first + - numDatasource (how many distinct links have this ref/topic pair) multiplied by tfidf (a bit complex, in short how "central" to this topic is the vocab used in this ref), higher comes first + @param lang: language to sort by. Defaults to "english". 
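As a rough illustration of how the new language-aware comparator is expected to order two links (the `order` dicts below are hypothetical and their values are chosen only for this example):

    from functools import cmp_to_key, partial
    from sefaria.helper.topic import sort_refs_by_relevance

    a = {"order": {"curatedPrimacy": {"en": 2}, "pr": 0.9}}
    b = {"order": {"curatedPrimacy": {"en": 5}, "pr": 0.1}}
    # "english"[:2] == "en", so English curatedPrimacy (5 vs. 2) decides before pagerank is consulted
    links = sorted([a, b], key=cmp_to_key(partial(sort_refs_by_relevance, lang="english")))
    assert links[0] is b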
+ @return: + """ aord = a.get('order', {}) bord = b.get('order', {}) + def curated_primacy(order_dict, lang): + return order_dict.get("curatedPrimacy", {}).get(lang, 0) + if not aord and not bord: return 0 if bool(aord) != bool(bord): - return len(bord) - len(aord) - if aord.get("curatedPrimacy") or bord.get("curatedPrimacy"): - return len(bord.get("curatedPrimacy", {})) - len(aord.get("curatedPrimacy", {})) + return int(bool(bord)) - int(bool(aord)) + short_lang = lang[:2] + aprimacy = curated_primacy(aord, short_lang) + bprimacy = curated_primacy(bord, short_lang) + if aprimacy > 0 or bprimacy > 0: + return bprimacy - aprimacy if aord.get('pr', 0) != bord.get('pr', 0): return bord.get('pr', 0) - aord.get('pr', 0) return (bord.get('numDatasource', 0) * bord.get('tfidf', 0)) - (aord.get('numDatasource', 0) * aord.get('tfidf', 0)) @@ -318,7 +351,7 @@ def ref_topic_link_prep(link): link['dataSource']['slug'] = data_source_slug return link -def get_topics_for_ref(tref, annotate=False): +def get_topics_for_ref(tref, lang="english", annotate=False): serialized = [l.contents() for l in Ref(tref).topiclinkset()] if annotate: if len(serialized) > 0: @@ -329,7 +362,7 @@ def get_topics_for_ref(tref, annotate=False): for link in serialized: ref_topic_link_prep(link) - serialized.sort(key=cmp_to_key(sort_refs_by_relevance)) + serialized.sort(key=cmp_to_key(partial(sort_refs_by_relevance, lang=lang))) return serialized @@ -682,7 +715,7 @@ def calculate_other_ref_scores(ref_topic_map): try: tp = oref.index.best_time_period() year = int(tp.start) if tp else 3000 - except ValueError: + except (ValueError, AttributeError): year = 3000 comp_date_map[(topic, tref)] = year order_id_map[(topic, tref)] = oref.order_id() @@ -1037,18 +1070,6 @@ def topic_change_category(topic_obj, new_category, old_category="", rebuild=Fals rebuild_topic_toc(topic_obj, category_changed=True) return topic_obj -def update_topic_titles(topic, title="", heTitle="", **kwargs): - new_primary = {"en": title, "he": heTitle} - for lang in ['en', 'he']: # first remove all titles and add new primary and then alt titles - for title in topic.get_titles(lang): - topic.remove_title(title, lang) - topic.add_title(new_primary[lang], lang, True, False) - if 'altTitles' in kwargs: - for title in kwargs['altTitles'][lang]: - topic.add_title(title, lang) - return topic - - def update_authors_place_and_time(topic, dataSource='learning-team-editing-tool', **kwargs): # update place info added to author, then update year and era info if not hasattr(topic, 'properties'): @@ -1064,11 +1085,11 @@ def update_properties(topic_obj, dataSource, k, v): def update_author_era(topic_obj, dataSource='learning-team-editing-tool', **kwargs): for k in ["birthYear", "deathYear"]: - if k in kwargs.keys(): # only change property value if key exists, otherwise it indicates no change + if kwargs.get(k, False): # only change property value if key exists, otherwise it indicates no change year = kwargs[k] update_properties(topic_obj, dataSource, k, year) - if 'era' in kwargs.keys(): # only change property value if key is in data, otherwise it indicates no change + if kwargs.get('era', False): # only change property value if key is in data, otherwise it indicates no change prev_era = topic_obj.properties.get('era', {}).get('value') era = kwargs['era'] update_properties(topic_obj, dataSource, 'era', era) @@ -1077,46 +1098,49 @@ def update_author_era(topic_obj, dataSource='learning-team-editing-tool', **kwar return topic_obj -def update_topic(topic, **kwargs): +def 
update_topic(topic, titles=None, category=None, origCategory=None, categoryDescritpion=None, description=None, + birthPlace=None, deathPlace=None, birthYear=None, deathYear=None, era=None, + rebuild_toc=True, manual=False, image=None, **kwargs): """ - Can update topic object's title, hebrew title, category, description, and categoryDescription fields + Can update topic object's titles, category, description, and categoryDescription fields :param topic: (Topic) The topic to update - :param **kwargs can be title, heTitle, category, description, categoryDescription, and rebuild_toc where `title`, `heTitle`, - and `category` are strings. `description` and `categoryDescription` are dictionaries where the fields are `en` and `he`. + :param **kwargs can be titles, category, description, categoryDescription, and rebuild_toc where `titles` is a list + of title objects as they are represented in the database, and `category` is a string. `description` and `categoryDescription` are dictionaries where the fields are `en` and `he`. The `category` parameter should be the slug of the new category. `rebuild_topic_toc` is a boolean and is assumed to be True :return: (model.Topic) The modified topic """ old_category = "" orig_slug = topic.slug - update_topic_titles(topic, **kwargs) - if kwargs.get('category') == 'authors': - topic = update_authors_place_and_time(topic, **kwargs) - if 'category' in kwargs and kwargs['category'] != kwargs.get('origCategory', kwargs['category']): + if titles: + topic.set_titles(titles) + if category == 'authors': + topic = update_authors_place_and_time(topic, birthPlace=birthPlace, birthYear=birthYear, deathPlace=deathPlace, deathYear=deathYear, era=era) + + if category and origCategory and origCategory != category: orig_link = IntraTopicLink().load({"linkType": "displays-under", "fromTopic": topic.slug, "toTopic": {"$ne": topic.slug}}) old_category = orig_link.toTopic if orig_link else Topic.ROOT - if old_category != kwargs['category']: - topic = topic_change_category(topic, kwargs["category"], old_category=old_category) # can change topic and intratopiclinks + if old_category != category: + topic = topic_change_category(topic, category, old_category=old_category) # can change topic and intratopiclinks - if kwargs.get('manual', False): + if manual: topic.data_source = "sefaria" # any topic edited manually should display automatically in the TOC and this flag ensures this topic.description_published = True - if "description" in kwargs or "categoryDescription" in kwargs: - topic.change_description(kwargs.get("description", None), kwargs.get("categoryDescription", None)) + if description or categoryDescritpion: + topic.change_description(description, categoryDescritpion) - if "image" in kwargs: - image_dict = kwargs["image"] - if image_dict["image_uri"] != "": - topic.image = kwargs["image"] + if image: + if image["image_uri"] != "": + topic.image = image elif hasattr(topic, 'image'): # we don't want captions without image_uris, so if the image_uri is blank, get rid of the caption too del topic.image topic.save() - if kwargs.get('rebuild_topic_toc', True): - rebuild_topic_toc(topic, orig_slug=orig_slug, category_changed=(old_category != kwargs.get('category', ""))) + if rebuild_toc: + rebuild_topic_toc(topic, orig_slug=orig_slug, category_changed=(old_category != category)) return topic @@ -1183,7 +1207,7 @@ def edit_topic_source(slug, orig_tref, new_tref="", creating_new_link=True, topic_obj = Topic.init(slug) if topic_obj is None: return {"error": "Topic does not exist."} - 
ref_topic_dict = {"toTopic": slug, "linkType": linkType, "ref": orig_tref} + ref_topic_dict = {"toTopic": slug, "linkType": linkType, "ref": orig_tref, "dataSource": "learning-team"} link = RefTopicLink().load(ref_topic_dict) link_already_existed = link is not None if not link_already_existed: @@ -1249,17 +1273,19 @@ def update_order_of_topic_sources(topic, sources, uid, lang='en'): Ref(s['ref']).normal() except InputError as e: return {"error": f"Invalid ref {s['ref']}"} - link = RefTopicLink().load({"toTopic": topic, "linkType": "about", "ref": s['ref']}) + link = RefTopicLink().load({"toTopic": topic, "linkType": "about", "ref": s['ref'], "dataSource": "learning-team"}) if link is None: - return {"error": f"Link between {topic} and {s['ref']} doesn't exist."} + # for now, we are focusing on learning team links and the lack of existence isn't considered an error + continue + # return {"error": f"Link between {topic} and {s['ref']} doesn't exist."} order = getattr(link, 'order', {}) - if lang not in order.get('availableLangs', []) : - return {"error": f"Link between {topic} and {s['ref']} does not exist in '{lang}'."} ref_to_link[s['ref']] = link # now update curatedPrimacy data for display_order, s in enumerate(sources[::-1]): - link = ref_to_link[s['ref']] + link = ref_to_link.get(s['ref']) + if not link: + continue order = getattr(link, 'order', {}) curatedPrimacy = order.get('curatedPrimacy', {}) curatedPrimacy[lang] = display_order @@ -1280,17 +1306,18 @@ def delete_ref_topic_link(tref, to_topic, link_type, lang): if Topic.init(to_topic) is None: return {"error": f"Topic {to_topic} doesn't exist."} - topic_link = {"toTopic": to_topic, "linkType": link_type, 'ref': tref} + topic_link = {"toTopic": to_topic, "linkType": link_type, 'ref': tref, "dataSource": "learning-team"} link = RefTopicLink().load(topic_link) if link is None: - return {"error": f"Link between {tref} and {to_topic} doesn't exist."} + return {"error": f"A learning-team link between {tref} and {to_topic} doesn't exist. If you are trying to delete a non-learning-team link, reach out to the engineering team."} - if lang in link.order.get('availableLangs', []): - link.order['availableLangs'].remove(lang) if lang in link.order.get('curatedPrimacy', []): link.order['curatedPrimacy'].pop(lang) + if lang in getattr(link, 'descriptions', {}): + link.descriptions.pop(lang) - if len(link.order.get('availableLangs', [])) > 0: + # Note, using curatedPrimacy as a proxy here since we are currently only allowing deletion of learning-team links. + if len(link.order.get('curatedPrimacy', [])) > 0: link.save() return {"status": "ok"} else: # deleted in both hebrew and english so delete link object diff --git a/sefaria/local_settings_example.py b/sefaria/local_settings_example.py index 1d022c2fa9..3517e77eb9 100644 --- a/sefaria/local_settings_example.py +++ b/sefaria/local_settings_example.py @@ -53,6 +53,8 @@ ADMINS = ( ('Your Name', 'you@example.com'), ) +ADMIN_PATH = 'somethingsomething' #This will be the path to the admin site, locally it can also be 'admin' + PINNED_IPCOUNTRY = "IL" #change if you want parashat hashavua to be diaspora. MONGO_REPLICASET_NAME = None # If the below is a list, this should be set to something other than None. 
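A rough usage sketch of the reworked `update_topic` keyword interface; the slug and all title strings below are hypothetical:

    from sefaria.model import Topic
    from sefaria.helper import topic as topic_helper

    t = Topic.init("some-existing-slug")  # hypothetical slug
    topic_helper.update_topic(
        t,
        titles=[
            {"text": "New English Title", "lang": "en", "primary": True},
            {"text": "כותרת חדשה", "lang": "he", "primary": True},
            {"text": "An Alternate English Title", "lang": "en"},
        ],
        description={"en": "new description", "he": "תיאור חדש"},
        manual=True,       # manually edited topics are marked so they display in the TOC
        rebuild_toc=True,
    )

This is the same call shape the updated tests in topic_test.py exercise: primary and alternate titles travel together in a single `titles` list instead of separate `title`/`heTitle`/`altTitles` arguments.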
diff --git a/sefaria/model/__init__.py b/sefaria/model/__init__.py index a6995be1fc..13232def14 100644 --- a/sefaria/model/__init__.py +++ b/sefaria/model/__init__.py @@ -45,7 +45,7 @@ from .portal import Portal from .manuscript import Manuscript, ManuscriptSet, ManuscriptPage, ManuscriptPageSet from .linker.ref_part import RawRef -from .linker.ref_resolver import RefResolver +from .linker.linker import Linker from . import dependencies library._build_index_maps() diff --git a/sefaria/model/linker/linker.py b/sefaria/model/linker/linker.py new file mode 100644 index 0000000000..f496780dd0 --- /dev/null +++ b/sefaria/model/linker/linker.py @@ -0,0 +1,140 @@ +import dataclasses +from typing import List, Optional, Union, Iterable, Tuple +from tqdm import tqdm +from sefaria.model.text import Ref +from sefaria.model.linker.ref_part import RawRef, RawNamedEntity, span_inds +from sefaria.model.linker.ref_resolver import RefResolver, ResolutionThoroughness, PossiblyAmbigResolvedRef +from sefaria.model.linker.named_entity_resolver import NamedEntityResolver, ResolvedNamedEntity +from sefaria.model.linker.named_entity_recognizer import NamedEntityRecognizer + + +@dataclasses.dataclass +class LinkedDoc: + text: str + resolved_refs: List[PossiblyAmbigResolvedRef] + resolved_named_entities: List[ResolvedNamedEntity] + + @property + def all_resolved(self) -> List[Union[PossiblyAmbigResolvedRef, ResolvedNamedEntity]]: + return self.resolved_refs + self.resolved_named_entities + + +class Linker: + + def __init__(self, ner: NamedEntityRecognizer, ref_resolver: RefResolver, ne_resolver: NamedEntityResolver, ): + self._ner = ner + self._ref_resolver = ref_resolver + self._ne_resolver = ne_resolver + + def bulk_link(self, inputs: List[str], book_context_refs: Optional[List[Optional[Ref]]] = None, with_failures=False, + verbose=False, thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> List[LinkedDoc]: + """ + Bulk operation to link every string in `inputs` with citations and named entities + `bulk_link()` is faster than running `link()` in a loop because it can pass all strings to the relevant models + at once. + @param inputs: String inputs. Each input is processed independently. + @param book_context_refs: Additional context references that represents the source book that the input came from. + @param with_failures: True to return all recognized entities, even if they weren't linked. + @param verbose: True to print progress to the console + @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. 
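A minimal sketch of how the new Linker is expected to be driven. Here `linker` is assumed to be an already-constructed Linker instance (for example the one returned by `library.get_linker(...)`, whose exact signature is not shown in this diff), and the input strings are hypothetical:

    docs = linker.bulk_link(["First passage citing Genesis 1:1.", "Second passage."], type_filter='citation')
    for doc in docs:
        for resolved in doc.resolved_refs:
            if not resolved.is_ambiguous and resolved.ref is not None:
                print(resolved.ref.normal(), '<-', resolved.raw_entity.text)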
+ @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' + @return: list of LinkedDocs + """ + self._ref_resolver.reset_ibid_history() + all_named_entities = self._ner.bulk_recognize(inputs) + docs = [] + book_context_refs = book_context_refs or [None]*len(all_named_entities) + iterable = self._get_bulk_link_iterable(inputs, all_named_entities, book_context_refs, verbose) + for input_str, book_context_ref, inner_named_entities in iterable: + raw_refs, named_entities = self._partition_raw_refs_and_named_entities(inner_named_entities) + resolved_refs, resolved_named_entities = [], [] + if type_filter in {'all', 'citation'}: + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness, reset_ibids=False) + if type_filter in {'all', 'named entity'}: + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + docs += [LinkedDoc(input_str, resolved_refs, resolved_named_entities)] + + named_entity_list_list = [[rr.raw_entity for rr in doc.all_resolved] for doc in docs] + self._ner.bulk_map_normal_output_to_original_input(inputs, named_entity_list_list) + return docs + + def link(self, input_str: str, book_context_ref: Optional[Ref] = None, with_failures=False, + thoroughness=ResolutionThoroughness.NORMAL, type_filter='all') -> LinkedDoc: + """ + Link `input_str` with citations and named entities + @param input_str: + @param book_context_ref: Additional context reference that represents the source book that the input came from. + @param with_failures: True to return all recognized entities, even if they weren't linked. + @param thoroughness: How thorough the search to link entities should be. HIGH increases the processing time. + @param type_filter: Type of entities to return, either 'all', 'citation' or 'named entity' + @return: + """ + raw_refs, named_entities = self._ner.recognize(input_str) + resolved_refs, resolved_named_entities = [], [] + if type_filter in {'all', 'citation'}: + resolved_refs = self._ref_resolver.bulk_resolve(raw_refs, book_context_ref, with_failures, thoroughness) + if type_filter in {'all', 'named entity'}: + resolved_named_entities = self._ne_resolver.bulk_resolve(named_entities, with_failures) + doc = LinkedDoc(input_str, resolved_refs, resolved_named_entities) + self._ner.map_normal_output_to_original_input(input_str, [x.raw_entity for x in doc.all_resolved]) + return doc + + def link_by_paragraph(self, input_str: str, book_context_ref: Ref, *link_args, **link_kwargs) -> LinkedDoc: + """ + Similar to `link()` except model is run on each paragraph individually (via a bulk operation) + This better mimics the way the underlying ML models were trained and tends to lead to better results + Paragraphs are delineated by new line characters + @param input_str: + @param book_context_ref: + @param link_args: *args to be passed to link() + @param link_kwargs: **kwargs to be passed to link() + @return: + """ + import re + + inputs = re.split(r'\s*\n+\s*', input_str) + linked_docs = self.bulk_link(inputs, [book_context_ref]*len(inputs), *link_args, **link_kwargs) + resolved_refs = [] + resolved_named_entities = [] + full_spacy_doc = self._ner.named_entity_model.make_doc(input_str) + offset = 0 + for curr_input, linked_doc in zip(inputs, linked_docs): + resolved_refs += linked_doc.resolved_refs + resolved_named_entities += linked_doc.resolved_named_entities + + for resolved in linked_doc.all_resolved: + named_entity = resolved.raw_entity + 
named_entity.align_to_new_doc(full_spacy_doc, offset) + if isinstance(named_entity, RawRef): + # named_entity's current start has already been offset so it's the offset we need for each part + raw_ref_offset, _ = span_inds(named_entity.span) + named_entity.align_parts_to_new_doc(full_spacy_doc, raw_ref_offset) + curr_token_count = len(self._ner.named_entity_model.make_doc(curr_input)) + offset += curr_token_count+1 # +1 for newline token + return LinkedDoc(input_str, resolved_refs, resolved_named_entities) + + def get_ner(self) -> NamedEntityRecognizer: + return self._ner + + def reset_ibid_history(self) -> None: + """ + Reflecting this function out + @return: + """ + self._ref_resolver.reset_ibid_history() + + @staticmethod + def _partition_raw_refs_and_named_entities(raw_refs_and_named_entities: List[RawNamedEntity]) \ + -> Tuple[List[RawRef], List[RawNamedEntity]]: + raw_refs = [ne for ne in raw_refs_and_named_entities if isinstance(ne, RawRef)] + named_entities = [ne for ne in raw_refs_and_named_entities if not isinstance(ne, RawRef)] + return raw_refs, named_entities + + @staticmethod + def _get_bulk_link_iterable(inputs: List[str], all_named_entities: List[List[RawNamedEntity]], + book_context_refs: Optional[List[Optional[Ref]]] = None, verbose=False + ) -> Iterable[Tuple[Ref, List[RawNamedEntity]]]: + iterable = zip(inputs, book_context_refs, all_named_entities) + if verbose: + iterable = tqdm(iterable, total=len(book_context_refs)) + return iterable diff --git a/sefaria/model/linker/named_entity_recognizer.py b/sefaria/model/linker/named_entity_recognizer.py new file mode 100644 index 0000000000..dabf190cd2 --- /dev/null +++ b/sefaria/model/linker/named_entity_recognizer.py @@ -0,0 +1,249 @@ +from typing import List, Generator, Optional, Tuple +from functools import reduce +from collections import defaultdict +from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, RawNamedEntity, NamedEntityType +from sefaria.helper.normalization import NormalizerComposer +try: + import spacy + from spacy.tokens import Span + from spacy.language import Language +except ImportError: + spacy = Doc = Span = Token = Language = None + + +class NamedEntityRecognizer: + """ + Given models, runs them and returns named entity results + Currently, named entities include: + - refs + - people + - groups of people + """ + + def __init__(self, lang: str, named_entity_model: Language, raw_ref_part_model: Language): + """ + + @param lang: language that the Recognizer understands (based on how the models were trained) + @param named_entity_model: spaCy model which takes a string and recognizes where entities are + @param raw_ref_part_model: spaCy model which takes a string raw ref and recognizes the parts of the ref + """ + self._lang = lang + self._named_entity_model = named_entity_model + self._raw_ref_part_model = raw_ref_part_model + self._normalizer = self.__init_normalizer() + + def __init_normalizer(self) -> NormalizerComposer: + # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization + # important that normalization is equivalent to normalization done at training time + normalizer_steps = ['unidecode', 'html', 'double-space'] + if self._lang == 'he': + normalizer_steps += ['maqaf', 'cantillation'] + return NormalizerComposer(normalizer_steps) + + def bulk_recognize(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Return all RawNamedEntity's in `inputs`. 
If the entity is a citation, parse out the inner RawRefParts and create + RawRefs. + @param inputs: List of strings to search for named entities in. + @return: 2D list of RawNamedEntity's. Includes RawRefs which are a subtype of RawNamedEntity + """ + all_raw_named_entities = self._bulk_get_raw_named_entities_wo_raw_refs(inputs) + all_citations, all_non_citations = self._bulk_partition_named_entities_by_citation_type(all_raw_named_entities) + all_raw_refs = self._bulk_parse_raw_refs(all_citations) + merged_entities = [] + for inner_non_citations, inner_citations in zip(all_non_citations, all_raw_refs): + merged_entities += [inner_non_citations + inner_citations] + return merged_entities + + def recognize(self, input_str: str) -> Tuple[List[RawRef], List[RawNamedEntity]]: + raw_named_entities = self._get_raw_named_entities_wo_raw_refs(input_str) + citations, non_citations = self._partition_named_entities_by_citation_type(raw_named_entities) + raw_refs = self._parse_raw_refs(citations) + return raw_refs, non_citations + + def _bulk_get_raw_named_entities_wo_raw_refs(self, inputs: List[str]) -> List[List[RawNamedEntity]]: + """ + Finds RawNamedEntities in `inputs` but doesn't parse citations into RawRefs with RawRefParts + @param inputs: + @return: + """ + normalized_inputs = self._normalize_input(inputs) + all_raw_named_entity_spans = list(self._bulk_get_raw_named_entity_spans(normalized_inputs)) + all_raw_named_entities = [] + for raw_named_entity_spans in all_raw_named_entity_spans: + temp_raw_named_entities = [] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + temp_raw_named_entities += [RawNamedEntity(span, ne_type)] + all_raw_named_entities += [temp_raw_named_entities] + return all_raw_named_entities + + def _get_raw_named_entities_wo_raw_refs(self, input_str: str) -> List[RawNamedEntity]: + """ + Finds RawNamedEntities in `input_str` but doesn't parse citations into RawRefs with RawRefParts + @param input_str: + @return: + """ + normalized_input = self._normalize_input([input_str])[0] + raw_named_entity_spans = self._get_raw_named_entity_spans(normalized_input) + raw_named_entities = [] + for span in raw_named_entity_spans: + ne_type = NamedEntityType.span_label_to_enum(span.label_) + raw_named_entities += [RawNamedEntity(span, ne_type)] + return raw_named_entities + + @staticmethod + def _bulk_partition_named_entities_by_citation_type( + all_raw_named_entities: List[List[RawNamedEntity]] + ) -> Tuple[List[List[RawNamedEntity]], List[List[RawNamedEntity]]]: + """ + Given named entities, partition them into two lists; list of entities that are citations and those that aren't. 
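A brief sketch of the recognizer's single-string entry point; the spaCy pipelines and the input string are hypothetical, since in practice the pre-trained linker models are loaded elsewhere:

    ner = NamedEntityRecognizer('en', named_entity_model, raw_ref_part_model)  # hypothetical pre-loaded spaCy pipelines
    raw_refs, other_entities = ner.recognize("Rabbi Akiva is quoted in Berakhot 2a.")
    # raw_refs: citations broken into RawRefParts; other_entities: people/groups kept as plain RawNamedEntity objects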
+ @param all_raw_named_entities: + @return: + """ + citations, non_citations = [], [] + for sublist in all_raw_named_entities: + inner_citations, inner_non_citations = NamedEntityRecognizer._partition_named_entities_by_citation_type(sublist) + citations += [inner_citations] + non_citations += [inner_non_citations] + return citations, non_citations + + @staticmethod + def _partition_named_entities_by_citation_type( + raw_named_entities: List[RawNamedEntity] + ) -> Tuple[List[RawNamedEntity], List[RawNamedEntity]]: + citations, non_citations = [], [] + for named_entity in raw_named_entities: + curr_list = citations if named_entity.type == NamedEntityType.CITATION else non_citations + curr_list += [named_entity] + return citations, non_citations + + def _bulk_parse_raw_refs(self, all_citation_entities: List[List[RawNamedEntity]]) -> List[List[RawRef]]: + """ + Runs models on inputs to locate all refs and ref parts + Note: takes advantage of bulk spaCy operations. It is more efficient to pass multiple strings in input than to + run this function multiple times + @param inputs: List of strings to search for refs in. + @return: 2D list of RawRefs. Each inner list corresponds to the refs found in a string of the input. + """ + ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_citation_entities), []) + all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(ref_part_input, as_tuples=True)) + all_raw_ref_part_span_map = defaultdict(list) + for ref_part_span, input_idx in all_raw_ref_part_spans: + all_raw_ref_part_span_map[input_idx] += [ref_part_span] + + all_raw_refs = [] + for input_idx, named_entities in enumerate(all_citation_entities): + raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] + all_raw_refs += [self._bulk_make_raw_refs(named_entities, raw_ref_part_spans)] + return all_raw_refs + + def _parse_raw_refs(self, citation_entities: List[RawNamedEntity]) -> List[RawRef]: + raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans([e.text for e in citation_entities])) + return self._bulk_make_raw_refs(citation_entities, raw_ref_part_spans) + + def bulk_map_normal_output_to_original_input(self, input: List[str], raw_ref_list_list: List[List[RawRef]]): + for temp_input, raw_ref_list in zip(input, raw_ref_list_list): + self.map_normal_output_to_original_input(temp_input, raw_ref_list) + + def map_normal_output_to_original_input(self, input: str, named_entities: List[RawNamedEntity]) -> None: + """ + Ref resolution ran on normalized input. 
Remap raw refs to original (non-normalized) input + """ + unnorm_doc = self._named_entity_model.make_doc(input) + mapping, subst_end_indices = self._normalizer.get_mapping_after_normalization(input) + conv = self._normalizer.norm_to_unnorm_indices_with_mapping + norm_inds = [named_entity.char_indices for named_entity in named_entities] + unnorm_inds = conv(norm_inds, mapping, subst_end_indices) + unnorm_part_inds = [] + for (named_entity, (norm_raw_ref_start, _)) in zip(named_entities, norm_inds): + raw_ref_parts = named_entity.raw_ref_parts if isinstance(named_entity, RawRef) else [] + unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] + for part in raw_ref_parts], mapping, subst_end_indices)] + for named_entity, temp_unnorm_inds, temp_unnorm_part_inds in zip(named_entities, unnorm_inds, unnorm_part_inds): + named_entity.map_new_char_indices(unnorm_doc, temp_unnorm_inds) + if isinstance(named_entity, RawRef): + named_entity.map_new_part_char_indices(temp_unnorm_part_inds) + + @property + def named_entity_model(self): + return self._named_entity_model + + @property + def raw_ref_part_model(self): + return self._raw_ref_part_model + + def _normalize_input(self, input: List[str]): + """ + Normalize input text to match normalization that happened at training time + """ + return [self._normalizer.normalize(s) for s in input] + + def _get_raw_named_entity_spans(self, st: str) -> List[SpanOrToken]: + doc = self._named_entity_model(st) + return doc.ents + + def _get_raw_ref_part_spans(self, st: str) -> List[SpanOrToken]: + doc = self._raw_ref_part_model(st) + return doc.ents + + def _bulk_get_raw_named_entity_spans(self, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: + for doc in self._named_entity_model.pipe(input, batch_size=batch_size, **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def _bulk_get_raw_ref_part_spans(self, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: + for doc in self._raw_ref_part_model.pipe(input, batch_size=batch_size or len(input), **kwargs): + if kwargs.get('as_tuples', False): + doc, context = doc + yield doc.ents, context + else: + yield doc.ents + + def _bulk_make_raw_refs(self, named_entities: List[RawNamedEntity], raw_ref_part_spans: List[List[SpanOrToken]]) -> List[RawRef]: + raw_refs = [] + dh_continuations = self._bulk_make_dh_continuations(named_entities, raw_ref_part_spans) + for named_entity, part_span_list, temp_dh_continuations in zip(named_entities, raw_ref_part_spans, dh_continuations): + raw_refs += [self._make_raw_ref(named_entity.span, part_span_list, temp_dh_continuations)] + return raw_refs + + def _make_raw_ref(self, span: SpanOrToken, part_span_list: List[SpanOrToken], dh_continuations: List[SpanOrToken]) -> RawRef: + raw_ref_parts = [] + for part_span, dh_continuation in zip(part_span_list, dh_continuations): + part_type = RefPartType.span_label_to_enum(part_span.label_) + raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] + return RawRef(span, self._lang, raw_ref_parts) + + def _bulk_make_dh_continuations(self, named_entities: List[RawNamedEntity], raw_ref_part_spans) -> List[List[SpanOrToken]]: + dh_continuations = [] + for ispan, (named_entity, part_span_list) in enumerate(zip(named_entities, raw_ref_part_spans)): + temp_dh_continuations = [] + for ipart, part_span in enumerate(part_span_list): + part_type = RefPartType.span_label_to_enum(part_span.label_) 
+ dh_continuation = None + if part_type == RefPartType.DH: + dh_continuation = self._get_dh_continuation(ispan, ipart, named_entities, part_span_list, + named_entity.span, part_span) + temp_dh_continuations += [dh_continuation] + dh_continuations += [temp_dh_continuations] + return dh_continuations + + @staticmethod + def _get_dh_continuation(ispan: int, ipart: int, named_entities: List[RawNamedEntity], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: + if ipart == len(part_span_list) - 1: + curr_doc = span.doc + _, span_end = span_inds(span) + if ispan == len(named_entities) - 1: + dh_cont = curr_doc[span_end:] + else: + next_span_start, _ = span_inds(named_entities[ispan + 1].span) + dh_cont = curr_doc[span_end:next_span_start] + else: + _, part_span_end = span_inds(part_span) + next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) + dh_cont = part_span.doc[part_span_end:next_part_span_start] + + return dh_cont diff --git a/sefaria/model/linker/named_entity_resolver.py b/sefaria/model/linker/named_entity_resolver.py new file mode 100644 index 0000000000..4e40b128cd --- /dev/null +++ b/sefaria/model/linker/named_entity_resolver.py @@ -0,0 +1,146 @@ +import dataclasses +from typing import List, Dict, Type, Set +import re2 as re +from functools import reduce +from collections import defaultdict +from sefaria.model.linker.ref_part import RawNamedEntity +from sefaria.model.topic import Topic +from sefaria.utils.hebrew import strip_cantillation +from sefaria.system.exceptions import InputError + + +class ResolvedNamedEntity: + + def __init__(self, raw_named_entity: RawNamedEntity, topics: List[Topic]): + self.raw_entity = raw_named_entity + self.topics = topics + + @property + def topic(self): + if len(self.topics) != 1: + raise InputError(f"ResolvedNamedEntity is ambiguous and has {len(self.topics)} topics so you can't access " + ".topic.") + return self.topics[0] + + @property + def is_ambiguous(self): + return len(self.topics) != 1 + + +class TitleGenerator: + + expansions = {} + + @classmethod + def generate(cls, title: str) -> List[str]: + expansions = [title] + for reg, reg_expansions in cls.expansions.items(): + for reg_expansion in reg_expansions: + potential_expansion = re.sub(reg, reg_expansion, title) + if potential_expansion == title: continue + expansions += [potential_expansion] + expansions = [strip_cantillation(t, strip_vowels=True) for t in expansions] + return expansions + + +class PersonTitleGenerator(TitleGenerator): + + expansions = { + r' b\. ': [' ben ', ' bar ', ', son of ', ', the son of ', ' son of ', ' the son of ', ' Bar ', ' Ben '], + r'^Ben ': ['ben '], + r'^Bar ': ['bar '], + r'^Rabbi ': ['R. '], + r'^Rebbi ': ['R. 
'], + } + + +class FallbackTitleGenerator(TitleGenerator): + + expansions = { + '^The ': ['the '], + } + + +@dataclasses.dataclass +class NamedEntityTitleExpanderRoute: + type_slug: str + generator: Type[TitleGenerator] + + +class NamedEntityTitleExpander: + type_generator_router = [ + NamedEntityTitleExpanderRoute('people', PersonTitleGenerator), + NamedEntityTitleExpanderRoute('entity', FallbackTitleGenerator), + ] + + def __init__(self, lang: str): + self._lang = lang + + def expand(self, topic: Topic) -> List[str]: + for route in self.type_generator_router: + if topic.has_types({route.type_slug}): + return self._expand_titles_with_generator(topic, route.generator) + return self._get_topic_titles(topic) + + def _get_topic_titles(self, topic: Topic) -> List[str]: + return topic.get_titles(lang=self._lang, with_disambiguation=False) + + def _expand_titles_with_generator(self, topic: Topic, generator: Type[TitleGenerator]) -> List[str]: + expansions = [] + for title in self._get_topic_titles(topic): + expansions += generator.generate(title) + return expansions + + +class TopicMatcher: + + def __init__(self, lang: str, named_entity_types_to_topics: Dict[str, Dict[str, List[str]]]): + self._lang = lang + self._title_expander = NamedEntityTitleExpander(lang) + topics_by_type = { + named_entity_type: self.__generate_topic_list_from_spec(topic_spec) + for named_entity_type, topic_spec in named_entity_types_to_topics.items() + } + all_topics = reduce(lambda a, b: a + b, topics_by_type.values(), []) + self._slug_topic_map = {t.slug: t for t in all_topics} + self._title_slug_map_by_type = { + named_entity_type: self.__get_title_map_for_topics(topics_by_type[named_entity_type]) + for named_entity_type, topic_spec in named_entity_types_to_topics.items() + } + + def __get_title_map_for_topics(self, topics: List[Topic]) -> Dict[str, Set[str]]: + title_slug_map = defaultdict(set) + unique_topics = {t.slug: t for t in topics}.values() + for topic in unique_topics: + for title in self._title_expander.expand(topic): + title_slug_map[title].add(topic.slug) + return title_slug_map + + @staticmethod + def __generate_topic_list_from_spec(topic_spec: Dict[str, List[str]]) -> List[Topic]: + topics = [] + for root in topic_spec.get('ontology_roots', []): + topics += Topic.init(root).topics_by_link_type_recursively() + topics += [Topic.init(slug) for slug in topic_spec.get('single_slugs', [])] + return topics + + def match(self, named_entity: RawNamedEntity) -> List[Topic]: + slugs = self._title_slug_map_by_type.get(named_entity.type.name, {}).get(named_entity.text, []) + return [self._slug_topic_map[slug] for slug in slugs] + + +class NamedEntityResolver: + + def __init__(self, topic_matcher: TopicMatcher): + self._topic_matcher = topic_matcher + + def bulk_resolve(self, raw_named_entities: List[RawNamedEntity], with_failures=False) -> List[ResolvedNamedEntity]: + resolved = [] + for named_entity in raw_named_entities: + matched_topics = self._topic_matcher.match(named_entity) + if len(matched_topics) > 0 or with_failures: + resolved += [ResolvedNamedEntity(named_entity, matched_topics)] + return resolved + + + diff --git a/sefaria/model/linker/ref_part.py b/sefaria/model/linker/ref_part.py index 20e590b781..19d10e8dfc 100644 --- a/sefaria/model/linker/ref_part.py +++ b/sefaria/model/linker/ref_part.py @@ -34,6 +34,18 @@ "non-cts": "NON_CTS", } + +# keys correspond named entity labels in spacy models +# values are properties in NamedEntityType +LABEL_TO_NAMED_ENTITY_TYPE_ATTR = { + # HE + "מקור": "CITATION", + # 
EN + "Person": "PERSON", + "Group": "GROUP", + "Citation": "CITATION", +} + SpanOrToken = Union[Span, Token] # convenience type since Spans and Tokens are very similar @@ -76,6 +88,19 @@ def span_char_inds(span: SpanOrToken) -> Tuple[int, int]: return idx, idx + len(span) +class NamedEntityType(Enum): + PERSON = "person" + GROUP = "group" + CITATION = "citation" + + @classmethod + def span_label_to_enum(cls, span_label: str) -> 'NamedEntityType': + """ + Convert span label from spacy named entity to NamedEntityType + """ + return getattr(cls, LABEL_TO_NAMED_ENTITY_TYPE_ATTR[span_label]) + + class RefPartType(Enum): NAMED = "named" NUMBERED = "numbered" @@ -282,12 +307,55 @@ def _get_full_span(sections, toSections): return start_span.doc[start_token_i:end_token_i] -class RawRef(abst.Cloneable): +class RawNamedEntity(abst.Cloneable): + """ + Span of text which represents a named entity before it has been identified with an object in Sefaria's DB + """ + + def __init__(self, span: SpanOrToken, type: NamedEntityType, **cloneable_kwargs) -> None: + self.span = span + self.type = type + + def map_new_char_indices(self, new_doc: Doc, new_char_indices: Tuple[int, int]) -> None: + """ + Remap self.span to new indices + """ + self.span = new_doc.char_span(*new_char_indices, alignment_mode='expand') + if self.span is None: raise InputError(f"${new_char_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") + + def align_to_new_doc(self, new_doc: Doc, offset: int) -> None: + """ + Aligns underlying span to `new_doc`'s tokens. Assumption is `new_doc` has some token offset from the original + doc of `self.span` + + @param new_doc: new Doc to align to + @param offset: token offset that aligns tokens in `self.span` to `new_doc + """ + curr_start, curr_end = span_inds(self.span) + new_start, new_end = curr_start+offset, curr_end+offset + self.span = new_doc[new_start:new_end] + + @property + def text(self): + """ + Return text of underlying span + """ + return self.span.text + + @property + def char_indices(self) -> Tuple[int, int]: + """ + Return start and end char indices of underlying text + """ + return span_char_inds(self.span) + + +class RawRef(RawNamedEntity): """ Span of text which may represent one or more Refs Contains RawRefParts """ - def __init__(self, lang: str, raw_ref_parts: list, span: SpanOrToken, **clonable_kwargs) -> None: + def __init__(self, span: SpanOrToken, lang: str, raw_ref_parts: list, **clonable_kwargs) -> None: """ @param lang: @@ -295,6 +363,7 @@ def __init__(self, lang: str, raw_ref_parts: list, span: SpanOrToken, **clonable @param span: @param clonable_kwargs: kwargs when running Clonable.clone() """ + super().__init__(span, NamedEntityType.CITATION) self.lang = lang self.raw_ref_parts = self._group_ranged_parts(raw_ref_parts) self.parts_to_match = self.raw_ref_parts # actual parts that will be matched. 
different when their are context swaps @@ -374,7 +443,7 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R """ start_char, end_char = span_char_inds(part.span) pivot = len(part.text) - len(str_end) + start_char - aspan = part.span.doc.char_span(0, pivot, alignment_mode='contract') + aspan = part.span.doc.char_span(start_char, pivot, alignment_mode='contract') bspan = part.span.doc.char_span(pivot, end_char, alignment_mode='contract') if aspan is None or bspan is None: raise InputError(f"Couldn't break on token boundaries for strings '{self.text[0:pivot]}' and '{self.text[pivot:end_char]}'") @@ -397,27 +466,25 @@ def split_part(self, part: RawRefPart, str_end) -> Tuple['RawRef', RawRefPart, R new_parts_to_match = self.parts_to_match return self.clone(raw_ref_parts=new_parts, parts_to_match=new_parts_to_match), apart, bpart - @property - def text(self): + def map_new_part_char_indices(self, new_part_char_indices: List[Tuple[int, int]]) -> None: """ - Return text of underlying span - """ - return self.span.text - - @property - def char_indices(self) -> Tuple[int, int]: - """ - Return start and end char indices of underlying text + Remap self.span and all spans of parts to new indices """ - return span_char_inds(self.span) + start_char, _ = self.char_indices + doc_span = self.span.as_doc() + for part, temp_part_indices in zip(self.raw_ref_parts, new_part_char_indices): + part.span = doc_span.char_span(*[i-start_char for i in temp_part_indices], alignment_mode='expand') + if part.span is None: + raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}.") - def map_new_indices(self, new_doc: Doc, new_indices: Tuple[int, int], new_part_indices: List[Tuple[int, int]]) -> None: + def align_parts_to_new_doc(self, new_doc: Doc, offset: int) -> None: """ - Remap self.span and all spans of parts to new indices + See `RawNamedEntity.align_to_new_doc` + @param new_doc: + @param offset: + @return: """ - self.span = new_doc.char_span(*new_indices) - if self.span is None: raise InputError(f"${new_indices} don't match token boundaries. Using 'expand' alignment mode text is '{new_doc.char_span(*new_indices, alignment_mode='expand')}'") - doc_span = self.span.as_doc() - for part, temp_part_indices in zip(self.raw_ref_parts, new_part_indices): - part.span = doc_span.char_span(*[i-new_indices[0] for i in temp_part_indices]) - if part.span is None: raise InputError(f"{temp_part_indices} doesn't match token boundaries for part {part}. 
Using 'expand' alignment mode text is '{new_doc.char_span(*temp_part_indices, alignment_mode='expand')}'") + for part in self.raw_ref_parts: + curr_start, curr_end = span_inds(part.span) + new_start, new_end = curr_start+offset, curr_end+offset + part.span = new_doc[new_start:new_end] diff --git a/sefaria/model/linker/ref_resolver.py b/sefaria/model/linker/ref_resolver.py index e0721e0d95..1ae7f54637 100644 --- a/sefaria/model/linker/ref_resolver.py +++ b/sefaria/model/linker/ref_resolver.py @@ -1,14 +1,13 @@ from collections import defaultdict -from typing import List, Union, Dict, Optional, Tuple, Generator, Iterable, Set -from enum import IntEnum, Enum +from typing import List, Union, Dict, Optional, Tuple, Iterable, Set from functools import reduce -from tqdm import tqdm +from enum import IntEnum, Enum from sefaria.system.exceptions import InputError from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema from sefaria.model.linker.ref_part import RawRef, RawRefPart, SpanOrToken, span_inds, RefPartType, SectionContext, ContextPart, TermContext -from sefaria.model.linker.referenceable_book_node import NamedReferenceableBookNode, ReferenceableBookNode +from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode from sefaria.model.linker.match_template import MatchTemplateTrie, LEAF_TRIE_ENTRY from sefaria.model.linker.resolved_ref_refiner_factory import resolved_ref_refiner_factory import structlog @@ -48,8 +47,8 @@ class ResolvedRef(abst.Cloneable): """ is_ambiguous = False - def __init__(self, raw_ref: RawRef, resolved_parts: List[RawRefPart], node, ref: text.Ref, context_ref: text.Ref = None, context_type: ContextType = None, context_parts: List[ContextPart] = None, _thoroughness=ResolutionThoroughness.NORMAL, _matched_dh_map=None) -> None: - self.raw_ref = raw_ref + def __init__(self, raw_entity: RawRef, resolved_parts: List[RawRefPart], node, ref: text.Ref, context_ref: text.Ref = None, context_type: ContextType = None, context_parts: List[ContextPart] = None, _thoroughness=ResolutionThoroughness.NORMAL, _matched_dh_map=None) -> None: + self.raw_entity = raw_entity self.resolved_parts = resolved_parts self.node: ReferenceableBookNode = node self.ref = ref @@ -71,7 +70,7 @@ def pretty_text(self) -> str: - adds extra DH words that were matched but aren't in span @return: """ - new_raw_ref_span = self._get_pretty_dh_span(self.raw_ref.span) + new_raw_ref_span = self._get_pretty_dh_span(self.raw_entity.span) new_raw_ref_span = self._get_pretty_end_paren_span(new_raw_ref_span) return new_raw_ref_span.text @@ -164,7 +163,7 @@ def __init__(self, resolved_refs: List[ResolvedRef]): if len(resolved_refs) == 0: raise InputError("Length of `resolved_refs` must be at least 1") self.resolved_raw_refs = resolved_refs - self.raw_ref = resolved_refs[0].raw_ref # assumption is all resolved_refs share same raw_ref. expose at top level + self.raw_entity = resolved_refs[0].raw_entity # assumption is all resolved_refs share same raw_ref. expose at top level @property def pretty_text(self): @@ -172,6 +171,9 @@ def pretty_text(self): return self.resolved_raw_refs[0].pretty_text +PossiblyAmbigResolvedRef = Union[ResolvedRef, AmbiguousResolvedRef] + + class TermMatcher: """ Used to match raw ref parts to non-unique terms naively. 
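A minimal consumer-side sketch of the new `PossiblyAmbigResolvedRef` union, assuming `resolved` came from `RefResolver.bulk_resolve` and relying on the imports already present in ref_resolver.py:

    def collect_refs(resolved: List[PossiblyAmbigResolvedRef]) -> List[text.Ref]:
        refs = []
        for r in resolved:
            if r.is_ambiguous:
                # AmbiguousResolvedRef wraps several candidate ResolvedRefs sharing one raw_entity
                refs += [option.ref for option in r.resolved_raw_refs if option.ref is not None]
            elif r.ref is not None:
                refs.append(r.ref)
        return refs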
@@ -207,12 +209,23 @@ def match_terms(self, ref_parts: List[RawRefPart]) -> List[schema.NonUniqueTerm] class IbidHistory: + ignored_term_slugs = ['torah', 'talmud', 'gemara', 'mishnah', 'midrash'] + def __init__(self, last_n_titles: int = 3, last_n_refs: int = 3): self.last_n_titles = last_n_titles self.last_n_refs = last_n_refs self._last_refs: List[text.Ref] = [] self._last_titles: List[str] = [] self._title_ref_map: Dict[str, text.Ref] = {} + self._ignored_titles: Set[str] = self._get_ignored_titles() + + @classmethod + def _get_ignored_titles(cls) -> Set[str]: + terms = [schema.NonUniqueTerm.init(slug) for slug in cls.ignored_term_slugs] + return reduce(lambda a, b: a | set(b), [term.get_titles() for term in terms], set()) + + def should_ignore_text(self, text) -> bool: + return text in self._ignored_titles def _get_last_refs(self) -> List[text.Ref]: return self._last_refs @@ -241,177 +254,63 @@ def get_ref_by_title(self, title: str) -> Optional[text.Ref]: class RefResolver: - def __init__(self, raw_ref_model_by_lang: Dict[str, Language], raw_ref_part_model_by_lang: Dict[str, Language], - ref_part_title_trie_by_lang: Dict[str, MatchTemplateTrie], - term_matcher_by_lang: Dict[str, TermMatcher]) -> None: - from sefaria.helper.normalization import NormalizerByLang, NormalizerComposer + def __init__(self, lang: str, ref_part_title_trie: MatchTemplateTrie, term_matcher: TermMatcher) -> None: - self._raw_ref_model_by_lang = raw_ref_model_by_lang - self._raw_ref_part_model_by_lang = raw_ref_part_model_by_lang - self._ref_part_title_trie_by_lang = ref_part_title_trie_by_lang - self._term_matcher_by_lang = term_matcher_by_lang + self._lang = lang + self._ref_part_title_trie = ref_part_title_trie + self._term_matcher = term_matcher self._ibid_history = IbidHistory() self._thoroughness = ResolutionThoroughness.NORMAL - # see ML Repo library_exporter.py:TextWalker.__init__() which uses same normalization - # important that normalization is equivalent to normalization done at training time - base_normalizer_steps = ['unidecode', 'html', 'double-space'] - self._normalizer = NormalizerByLang({ - 'en': NormalizerComposer(base_normalizer_steps), - 'he': NormalizerComposer(base_normalizer_steps + ['maqaf', 'cantillation']), - }) - def reset_ibid_history(self): self._ibid_history = IbidHistory() - def _normalize_input(self, lang: str, input: List[str]): - """ - Normalize input text to match normalization that happened at training time - """ - return [self._normalizer.normalize(s, lang=lang) for s in input] - - def _map_normal_output_to_original_input(self, lang: str, input: List[str], resolved: List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]) -> None: - """ - Ref resolution ran on normalized input. 
Remap resolved refs to original (non-normalized) input + def bulk_resolve(self, raw_refs: List[RawRef], book_context_ref: Optional[text.Ref] = None, + with_failures=False, thoroughness=ResolutionThoroughness.NORMAL, reset_ibids=True) -> List[PossiblyAmbigResolvedRef]: """ - for temp_input, temp_resolved in zip(input, resolved): - unnorm_doc = self.get_raw_ref_model(lang).make_doc(temp_input) - mapping = self._normalizer.get_mapping_after_normalization(temp_input, lang=lang) - conv = self._normalizer.convert_normalized_indices_to_unnormalized_indices # this function name is waaay too long - norm_inds = [rr.raw_ref.char_indices for rr in temp_resolved] - unnorm_inds = conv(norm_inds, mapping) - unnorm_part_inds = [] - for (rr, (norm_raw_ref_start, _)) in zip(temp_resolved, norm_inds): - unnorm_part_inds += [conv([[norm_raw_ref_start + i for i in part.char_indices] for part in rr.raw_ref.raw_ref_parts], mapping)] - for resolved_ref, temp_unnorm_inds, temp_unnorm_part_inds in zip(temp_resolved, unnorm_inds, unnorm_part_inds): - resolved_ref.raw_ref.map_new_indices(unnorm_doc, temp_unnorm_inds, temp_unnorm_part_inds) - - def bulk_resolve_refs(self, lang: str, book_context_refs: List[Optional[text.Ref]], input: List[str], with_failures=False, verbose=False, reset_ibids_every_context_ref=True, thoroughness=ResolutionThoroughness.NORMAL) -> List[List[Union[ResolvedRef, AmbiguousResolvedRef]]]: - """ - Main function for resolving refs in text. Given a list of texts, returns ResolvedRefs for each - @param lang: - @param book_context_refs: - @param input: + Main function for resolving refs in text. Given a list of RawRefs, returns ResolvedRefs for each + @param raw_refs: + @param book_context_ref: @param with_failures: - @param verbose: - @param reset_ibids_every_context_ref: @param thoroughness: how thorough should the search be. More thorough == slower. 
Currently "normal" will avoid searching for DH matches at book level and avoid filtering empty refs + @param reset_ibids: If true, reset ibid history before resolving @return: """ self._thoroughness = thoroughness - self.reset_ibid_history() - normalized_input = self._normalize_input(lang, input) - all_raw_refs = self._bulk_get_raw_refs(lang, normalized_input) + if reset_ibids: + self.reset_ibid_history() resolved = [] - iter = zip(book_context_refs, all_raw_refs) - if verbose: - iter = tqdm(iter, total=len(book_context_refs)) - for book_context_ref, raw_refs in iter: - if reset_ibids_every_context_ref: - self.reset_ibid_history() - inner_resolved = [] - for raw_ref in raw_refs: - temp_resolved = self.resolve_raw_ref(lang, book_context_ref, raw_ref) - if len(temp_resolved) == 0: - self.reset_ibid_history() - if with_failures: - inner_resolved += [ResolvedRef(raw_ref, [], None, None, context_ref=book_context_ref)] - elif any(r.is_ambiguous for r in temp_resolved): - # can't be sure about future ibid inferences - # TODO can probably salvage parts of history if matches are ambiguous within one book - self.reset_ibid_history() - else: - self._ibid_history.last_refs = temp_resolved[-1].ref - inner_resolved += temp_resolved - resolved += [inner_resolved] - self._map_normal_output_to_original_input(lang, input, resolved) + for raw_ref in raw_refs: + temp_resolved = self._resolve_raw_ref_and_update_ibid_history(raw_ref, book_context_ref, with_failures) + resolved += temp_resolved return resolved - def _bulk_get_raw_refs(self, lang: str, input: List[str]) -> List[List[RawRef]]: - all_raw_ref_spans = list(self._bulk_get_raw_ref_spans(lang, input)) - ref_part_input = reduce(lambda a, b: a + [(sub_b.text, b[0]) for sub_b in b[1]], enumerate(all_raw_ref_spans), []) - all_raw_ref_part_spans = list(self._bulk_get_raw_ref_part_spans(lang, ref_part_input, as_tuples=True)) - all_raw_ref_part_span_map = defaultdict(list) - for ref_part_span, input_idx in all_raw_ref_part_spans: - all_raw_ref_part_span_map[input_idx] += [ref_part_span] - - all_raw_refs = [] - for input_idx, raw_ref_spans in enumerate(all_raw_ref_spans): - raw_ref_part_spans = all_raw_ref_part_span_map[input_idx] - raw_refs = [] - for ispan, (span, part_span_list) in enumerate(zip(raw_ref_spans, raw_ref_part_spans)): - raw_ref_parts = [] - for ipart, part_span in enumerate(part_span_list): - part_type = RefPartType.span_label_to_enum(part_span.label_) - dh_continuation = None - if part_type == RefPartType.DH: - dh_continuation = self._get_dh_continuation(ispan, ipart, raw_ref_spans, part_span_list, span, part_span) - raw_ref_parts += [RawRefPart(part_type, part_span, dh_continuation)] - raw_refs += [RawRef(lang, raw_ref_parts, span)] - all_raw_refs += [raw_refs] - return all_raw_refs - - @staticmethod - def _get_dh_continuation(ispan: int, ipart: int, raw_ref_spans: List[SpanOrToken], part_span_list: List[SpanOrToken], span: SpanOrToken, part_span: SpanOrToken) -> Optional[SpanOrToken]: - if ipart == len(part_span_list) - 1: - curr_doc = span.doc - _, span_end = span_inds(span) - if ispan == len(raw_ref_spans) - 1: - dh_cont = curr_doc[span_end:] - else: - next_span_start, _ = span_inds(raw_ref_spans[ispan + 1]) - dh_cont = curr_doc[span_end:next_span_start] + def _resolve_raw_ref_and_update_ibid_history(self, raw_ref: RawRef, book_context_ref: text.Ref, with_failures=False) -> List[PossiblyAmbigResolvedRef]: + temp_resolved = self.resolve_raw_ref(book_context_ref, raw_ref) + self._update_ibid_history(raw_ref, temp_resolved) + if 
len(temp_resolved) == 0 and with_failures: + return [ResolvedRef(raw_ref, [], None, None, context_ref=book_context_ref)] + return temp_resolved + + def _update_ibid_history(self, raw_ref: RawRef, temp_resolved: List[PossiblyAmbigResolvedRef]): + if self._ibid_history.should_ignore_text(raw_ref.text): + return + if len(temp_resolved) == 0: + self.reset_ibid_history() + elif any(r.is_ambiguous for r in temp_resolved): + # can't be sure about future ibid inferences + # TODO can probably salvage parts of history if matches are ambiguous within one book + self.reset_ibid_history() else: - _, part_span_end = span_inds(part_span) - next_part_span_start, _ = span_inds(part_span_list[ipart + 1]) - dh_cont = part_span.doc[part_span_end:next_part_span_start] - - return dh_cont - - def __get_attr_by_lang(self, lang: str, by_lang_attr: dict, error_msg: str): - try: - return by_lang_attr[lang] - except KeyError as e: - raise KeyError(f"{error_msg} for lang `{lang}`") + self._ibid_history.last_refs = temp_resolved[-1].ref - def get_raw_ref_model(self, lang: str) -> Language: - return self.__get_attr_by_lang(lang, self._raw_ref_model_by_lang, 'No Raw Ref Model') + def get_ref_part_title_trie(self) -> MatchTemplateTrie: + return self._ref_part_title_trie - def get_raw_ref_part_model(self, lang: str) -> Language: - return self.__get_attr_by_lang(lang, self._raw_ref_part_model_by_lang, 'No Raw Ref Model') - - def get_ref_part_title_trie(self, lang: str) -> MatchTemplateTrie: - return self.__get_attr_by_lang(lang, self._ref_part_title_trie_by_lang, 'No Raw Ref Part Title Trie') - - def get_term_matcher(self, lang: str) -> TermMatcher: - return self.__get_attr_by_lang(lang, self._term_matcher_by_lang, 'No Term Matcher') - - def _get_raw_ref_spans_in_string(self, lang: str, st: str) -> List[Span]: - doc = self.get_raw_ref_model(lang)(st) - return doc.ents - - def _bulk_get_raw_ref_spans(self, lang: str, input: List[str], batch_size=150, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_model(lang).pipe(input, batch_size=batch_size, **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents + def get_term_matcher(self) -> TermMatcher: + return self._term_matcher - def _get_raw_ref_part_spans_in_string(self, lang: str, st: str) -> List[Span]: - doc = self.get_raw_ref_part_model(lang)(st) - return doc.ents - - def _bulk_get_raw_ref_part_spans(self, lang: str, input: List[str], batch_size=None, **kwargs) -> Generator[List[Span], None, None]: - for doc in self.get_raw_ref_part_model(lang).pipe(input, batch_size=batch_size or len(input), **kwargs): - if kwargs.get('as_tuples', False): - doc, context = doc - yield doc.ents, context - else: - yield doc.ents - - @staticmethod - def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: + def split_non_cts_parts(self, raw_ref: RawRef) -> List[RawRef]: if not any(part.type == RefPartType.NON_CTS for part in raw_ref.raw_ref_parts): return [raw_ref] split_raw_refs = [] curr_parts = [] @@ -426,7 +325,7 @@ def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: try: raw_ref_span = raw_ref.subspan(slice(curr_part_start, curr_part_end)) curr_parts = [p.realign_to_new_raw_ref(raw_ref.span, raw_ref_span) for p in curr_parts] - split_raw_refs += [RawRef(lang, curr_parts, raw_ref_span)] + split_raw_refs += [RawRef(raw_ref_span, self._lang, curr_parts)] except AssertionError: pass curr_parts = [] @@ -436,18 +335,18 @@ def split_non_cts_parts(lang, raw_ref: RawRef) -> List[RawRef]: 
def set_thoroughness(self, thoroughness: ResolutionThoroughness) -> None: self._thoroughness = thoroughness - def resolve_raw_ref(self, lang: str, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[Union[ResolvedRef, AmbiguousResolvedRef]]: - split_raw_refs = self.split_non_cts_parts(lang, raw_ref) + def resolve_raw_ref(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[PossiblyAmbigResolvedRef]: + split_raw_refs = self.split_non_cts_parts(raw_ref) resolved_list = [] for i, temp_raw_ref in enumerate(split_raw_refs): is_non_cts = i > 0 and len(resolved_list) > 0 if is_non_cts: # TODO assumes context is only first resolved ref - book_context_ref = resolved_list[0].ref + book_context_ref = None if resolved_list[0].is_ambiguous else resolved_list[0].ref context_swap_map = None if book_context_ref is None else getattr(book_context_ref.index.nodes, 'ref_resolver_context_swaps', None) - self._apply_context_swaps(lang, raw_ref, context_swap_map) - unrefined_matches = self.get_unrefined_ref_part_matches(lang, book_context_ref, temp_raw_ref) + self._apply_context_swaps(raw_ref, context_swap_map) + unrefined_matches = self.get_unrefined_ref_part_matches(book_context_ref, temp_raw_ref) if is_non_cts: # filter unrefined matches to matches that resolved previously resolved_titles = {r.ref.index.title for r in resolved_list} @@ -458,7 +357,7 @@ def resolve_raw_ref(self, lang: str, book_context_ref: Optional[text.Ref], raw_r match.ref = match.ref.subref(book_context_ref.sections[:-len(temp_raw_ref.raw_ref_parts)]) except (InputError, AttributeError): continue - temp_resolved_list = self.refine_ref_part_matches(lang, book_context_ref, unrefined_matches) + temp_resolved_list = self.refine_ref_part_matches(book_context_ref, unrefined_matches) if len(temp_resolved_list) > 1: resolved_list += [AmbiguousResolvedRef(temp_resolved_list)] else: @@ -477,25 +376,25 @@ def resolve_raw_ref_using_ref_instantiation(raw_ref: RawRef) -> List[ResolvedRef except: return [] - def get_unrefined_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[ + def get_unrefined_ref_part_matches(self, book_context_ref: Optional[text.Ref], raw_ref: RawRef) -> List[ 'ResolvedRef']: - context_free_matches = self._get_unrefined_ref_part_matches_recursive(lang, raw_ref, ref_parts=raw_ref.parts_to_match) + context_free_matches = self._get_unrefined_ref_part_matches_recursive(raw_ref, ref_parts=raw_ref.parts_to_match) contexts = [(book_context_ref, ContextType.CURRENT_BOOK)] + [(ibid_ref, ContextType.IBID) for ibid_ref in self._ibid_history.last_refs] matches = context_free_matches if len(matches) == 0: context_full_matches = [] for context_ref, context_type in contexts: - context_full_matches += self._get_unrefined_ref_part_matches_for_title_context(lang, context_ref, raw_ref, context_type) + context_full_matches += self._get_unrefined_ref_part_matches_for_title_context(context_ref, raw_ref, context_type) matches = context_full_matches + context_free_matches return matches - def _get_unrefined_ref_part_matches_for_title_context(self, lang: str, context_ref: Optional[text.Ref], raw_ref: RawRef, context_type: ContextType) -> List[ResolvedRef]: + def _get_unrefined_ref_part_matches_for_title_context(self, context_ref: Optional[text.Ref], raw_ref: RawRef, context_type: ContextType) -> List[ResolvedRef]: matches = [] if context_ref is None: return matches term_contexts = self._get_term_contexts(context_ref.index.nodes) if len(term_contexts) == 0: return matches temp_ref_parts 
= raw_ref.parts_to_match + term_contexts - temp_matches = self._get_unrefined_ref_part_matches_recursive(lang, raw_ref, ref_parts=temp_ref_parts) + temp_matches = self._get_unrefined_ref_part_matches_recursive(raw_ref, ref_parts=temp_ref_parts) for match in temp_matches: if match.num_resolved(include={TermContext}) == 0: continue match.context_ref = context_ref @@ -504,7 +403,7 @@ def _get_unrefined_ref_part_matches_for_title_context(self, lang: str, context_r matches += [match] return matches - def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dict[str, str]=None): + def _apply_context_swaps(self, raw_ref: RawRef, context_swap_map: Dict[str, str]=None): """ Use `context_swap_map` to swap matching element of `ref_parts` Allows us to redefine how a ref part is interpreted depending on the context @@ -513,7 +412,7 @@ def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dic Modifies `raw_ref` with updated ref_parts """ swapped_ref_parts = [] - term_matcher = self.get_term_matcher(lang) + term_matcher = self.get_term_matcher() if context_swap_map is None: return for part in raw_ref.raw_ref_parts: # TODO assumes only one match in term_matches @@ -527,25 +426,29 @@ def _apply_context_swaps(self, lang: str, raw_ref: RawRef, context_swap_map: Dic if not found_match: swapped_ref_parts += [part] raw_ref.parts_to_match = swapped_ref_parts - def _get_unrefined_ref_part_matches_recursive(self, lang: str, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]: - title_trie = title_trie or self.get_ref_part_title_trie(lang) + def _get_unrefined_ref_part_matches_recursive(self, raw_ref: RawRef, title_trie: MatchTemplateTrie = None, ref_parts: list = None, prev_ref_parts: list = None) -> List[ResolvedRef]: + """ + We are now considering all types for trie lookups (not just NAMED) since there seem to be no cases of false positives when we consider all part types + In addition, sometimes the raw ref part type model misclassifies a part type and relaxing the type requirement here allows it to recover. + The exception is we only will split NAMED parts since this causes some odd parts to split. e.g. 
משנה א can be considered part of the title of book when א is removed + """ + title_trie = title_trie or self.get_ref_part_title_trie() prev_ref_parts = prev_ref_parts or [] matches = [] for part in ref_parts: temp_raw_ref = raw_ref - # no need to consider other types at root level - if part.type != RefPartType.NAMED: continue - temp_title_trie, partial_key_end = title_trie.get_continuations(part.key(), allow_partial=True) if temp_title_trie is None: continue if partial_key_end is None: matched_part = part - else: + elif part.type == RefPartType.NAMED: try: temp_raw_ref, apart, bpart = raw_ref.split_part(part, partial_key_end) matched_part = apart except InputError: matched_part = part # fallback on original part + else: + continue temp_prev_ref_parts = prev_ref_parts + [matched_part] if LEAF_TRIE_ENTRY in temp_title_trie: for node in temp_title_trie[LEAF_TRIE_ENTRY]: @@ -555,16 +458,16 @@ def _get_unrefined_ref_part_matches_recursive(self, lang: str, raw_ref: RawRef, continue matches += [ResolvedRef(temp_raw_ref, temp_prev_ref_parts, node, ref, _thoroughness=self._thoroughness)] temp_ref_parts = [temp_part for temp_part in ref_parts if temp_part != part] - matches += self._get_unrefined_ref_part_matches_recursive(lang, temp_raw_ref, temp_title_trie, ref_parts=temp_ref_parts, prev_ref_parts=temp_prev_ref_parts) + matches += self._get_unrefined_ref_part_matches_recursive(temp_raw_ref, temp_title_trie, ref_parts=temp_ref_parts, prev_ref_parts=temp_prev_ref_parts) return ResolvedRefPruner.prune_unrefined_ref_part_matches(matches) - def refine_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref], matches: List[ResolvedRef]) -> List[ResolvedRef]: + def refine_ref_part_matches(self, book_context_ref: Optional[text.Ref], matches: List[ResolvedRef]) -> List[ResolvedRef]: temp_matches = [] refs_matched = {match.ref.normal() for match in matches} for unrefined_match in matches: - unused_parts = list(set(unrefined_match.raw_ref.parts_to_match) - set(unrefined_match.resolved_parts)) - context_free_matches = self._get_refined_ref_part_matches_recursive(lang, unrefined_match, unused_parts) + unused_parts = list(set(unrefined_match.raw_entity.parts_to_match) - set(unrefined_match.resolved_parts)) + context_free_matches = self._get_refined_ref_part_matches_recursive(unrefined_match, unused_parts) # context # if unrefined_match already used context, make sure it continues to use it @@ -573,7 +476,7 @@ def refine_ref_part_matches(self, lang: str, book_context_ref: Optional[text.Ref context_type_list = [ContextType.CURRENT_BOOK, ContextType.IBID] if unrefined_match.context_ref is None else [unrefined_match.context_type] context_full_matches = [] for context_ref, context_type in zip(context_ref_list, context_type_list): - context_full_matches += self._get_refined_ref_part_matches_for_section_context(lang, context_ref, context_type, unrefined_match, unused_parts) + context_full_matches += self._get_refined_ref_part_matches_for_section_context(context_ref, context_type, unrefined_match, unused_parts) # combine if len(context_full_matches) > 0: @@ -605,6 +508,9 @@ def get_section_set(index: text.Index) -> Set[Tuple[str, str, bool]]: return [] context_node = context_ref.index_node + if not hasattr(context_node, 'addressTypes'): + # complex text + return [] referenceable_sections = getattr(context_node, 'referenceableSections', [True]*len(context_node.addressTypes)) context_sec_list = list(zip(context_node.addressTypes, context_node.sectionNames, referenceable_sections)) match_sec_set = 
get_section_set(match_index) @@ -641,8 +547,7 @@ def _get_term_contexts(node: schema.SchemaNode) -> List[TermContext]: longest_template = min(match_templates, key=lambda x: len(list(x.terms))) return [TermContext(term) for term in longest_template.terms] - @staticmethod - def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Optional[text.Ref], context_type: ContextType, ref_part_match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: + def _get_refined_ref_part_matches_for_section_context(self, context_ref: Optional[text.Ref], context_type: ContextType, ref_part_match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: """ Tries to infer sections from context ref and uses them to refine `ref_part_match` """ @@ -655,7 +560,7 @@ def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Op sec_contexts = RefResolver._get_section_contexts(context_ref, ref_part_match.ref.index, common_index) term_contexts = RefResolver._get_all_term_contexts(context_ref.index_node, include_root=False) context_to_consider = sec_contexts + term_contexts - temp_matches = RefResolver._get_refined_ref_part_matches_recursive(lang, ref_part_match, ref_parts + context_to_consider) + temp_matches = self._get_refined_ref_part_matches_recursive(ref_part_match, ref_parts + context_to_consider) # remove matches which don't use context temp_matches = list(filter(lambda x: len(set(x.get_resolved_parts(include={ContextPart})) & set(context_to_consider)) > 0, temp_matches)) @@ -667,17 +572,16 @@ def _get_refined_ref_part_matches_for_section_context(lang: str, context_ref: Op matches += temp_matches return matches - @staticmethod - def _get_refined_ref_part_matches_recursive(lang: str, match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: + def _get_refined_ref_part_matches_recursive(self, match: ResolvedRef, ref_parts: List[RawRefPart]) -> List[ResolvedRef]: fully_refined = [] children = match.get_node_children() for part in ref_parts: for child in children: resolved_ref_refiner = resolved_ref_refiner_factory.create(part, child, match) - temp_matches = resolved_ref_refiner.refine(lang) + temp_matches = resolved_ref_refiner.refine(self._lang) for temp_match in temp_matches: temp_ref_parts = list(set(ref_parts) - set(temp_match.resolved_parts)) - fully_refined += RefResolver._get_refined_ref_part_matches_recursive(lang, temp_match, temp_ref_parts) + fully_refined += self._get_refined_ref_part_matches_recursive(temp_match, temp_ref_parts) if len(fully_refined) == 0: # original match is better than no matches return [match] @@ -697,9 +601,27 @@ def prune_unrefined_ref_part_matches(ref_part_matches: List[ResolvedRef]) -> Lis index_match_map[key] += [match] pruned_matches = [] for match_list in index_match_map.values(): - pruned_matches += [max(match_list, key=lambda m: m.num_resolved())] + pruned_matches += ResolvedRefPruner.remove_subset_sets(match_list, key=lambda match: set(part.char_indices for part in match.get_resolved_parts())) return pruned_matches + @staticmethod + def remove_subset_sets(items, key=None): + if key: + sets_to_filter = [key(x) for x in items] + else: + sets_to_filter = items + items, sets_to_filter = zip(*sorted((zip(items, sets_to_filter)), key=lambda x: len(x[1]), reverse=True)) + result = [] + for i in range(len(sets_to_filter)): + for j in range(i): + if sets_to_filter[i].issubset(sets_to_filter[j]): + # Break the loop as the sublist is a subset of a previous sublist + break + else: + # If the sublist is not a subset 
of any previous sublist, add it to the result + result.append(items[i]) + return result + @staticmethod def do_explicit_sections_match_before_context_sections(match: ResolvedRef) -> bool: first_explicit_section = None @@ -713,7 +635,7 @@ def do_explicit_sections_match_before_context_sections(match: ResolvedRef) -> bo @staticmethod def matched_all_explicit_sections(match: ResolvedRef) -> bool: resolved_explicit = set(match.get_resolved_parts(exclude={ContextPart})) - to_match_explicit = {part for part in match.raw_ref.parts_to_match if not part.is_context} + to_match_explicit = {part for part in match.raw_entity.parts_to_match if not part.is_context} if match.context_type in CONTEXT_TO_REF_PART_TYPE.keys(): # remove an equivalent number of context parts that were resolved from to_match_explicit to approximate @@ -766,6 +688,13 @@ def is_match_correct(match: ResolvedRef) -> bool: @staticmethod def remove_superfluous_matches(thoroughness: ResolutionThoroughness, resolved_refs: List[ResolvedRef]) -> List[ResolvedRef]: + # make matches with refs that are essentially equivalent (i.e. refs cover same span) actually equivalent + resolved_refs.sort(key=lambda x: x.ref and x.ref.order_id()) + for i, r in enumerate(resolved_refs[:-1]): + next_r = resolved_refs[i+1] + if r.ref.contains(next_r.ref) and next_r.ref.contains(r.ref): + next_r.ref = r.ref + # make unique resolved_refs = list({r.ref: r for r in resolved_refs}.values()) if thoroughness >= ResolutionThoroughness.HIGH or len(resolved_refs) > 1: @@ -784,7 +713,7 @@ def remove_incorrect_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedR @staticmethod def get_context_free_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedRef]: def match_is_context_free(match: ResolvedRef) -> bool: - return match.context_ref is None and set(match.get_resolved_parts()) == set(match.raw_ref.parts_to_match) + return match.context_ref is None and set(match.get_resolved_parts()) == set(match.raw_entity.parts_to_match) return list(filter(match_is_context_free, resolved_refs)) @staticmethod @@ -822,15 +751,15 @@ def _merge_subset_matches(resolved_refs: List[ResolvedRef]) -> List[ResolvedRef] Merge matches where one ref is contained in another ref E.g. 
if matchA.ref == Ref("Genesis 1") and matchB.ref == Ref("Genesis 1:1"), matchA will be deleted and its parts will be appended to matchB's parts """ - resolved_refs.sort(key=lambda x: x.ref and x.ref.order_id()) + resolved_refs.sort(key=lambda x: "N/A" if x.ref is None else x.ref.order_id()) merged_resolved_refs = [] next_merged = False for imatch, match in enumerate(resolved_refs[:-1]): - if match.is_ambiguous or match.ref is None or next_merged: + next_match = resolved_refs[imatch+1] + if match.is_ambiguous or match.ref is None or next_match.ref is None or next_merged: merged_resolved_refs += [match] next_merged = False continue - next_match = resolved_refs[imatch+1] if match.ref.index.title != next_match.ref.index.title: # optimization, the easiest cases to check for merged_resolved_refs += [match] diff --git a/sefaria/model/linker/referenceable_book_node.py b/sefaria/model/linker/referenceable_book_node.py index ad0e10bf6e..bb09aa0cff 100644 --- a/sefaria/model/linker/referenceable_book_node.py +++ b/sefaria/model/linker/referenceable_book_node.py @@ -1,8 +1,50 @@ import dataclasses -from typing import List, Union, Optional +from typing import List, Union, Optional, Tuple, Dict from sefaria.model import abstract as abst from sefaria.model import text from sefaria.model import schema +from sefaria.system.exceptions import InputError +from bisect import bisect_right + + +def subref(ref: text.Ref, section: int): + if ref.index_node.addressTypes[len(ref.sections)-1] == "Talmud": + return _talmud_subref(ref, section) + elif ref.index.categories == ['Tanakh', 'Torah']: + return _parsha_subref(ref, section) + else: + return ref.subref(section) + + +def _talmud_subref(ref: text.Ref, section: int): + d = ref._core_dict() + d['sections'][-1] += (section-1) + d['toSections'] = d['sections'][:] + return text.Ref(_obj=d) + + +def _parsha_subref(ref: text.Ref, section: int): + parsha_trefs = {n.wholeRef for n in ref.index.get_alt_struct_leaves()} + if ref.normal() in parsha_trefs: + book_subref = text.Ref(ref.index.title).subref(section) + if ref.contains(book_subref): + return book_subref + else: + # section doesn't fall within parsha + # Note, only validates that perek is in parsha range, doesn't check segment level. + # Edge case is Parshat Noach 6:3 + raise InputError + else: + return ref.subref(section) + + +def truncate_serialized_node_to_depth(serial_node: dict, depth: int) -> dict: + truncated_serial_node = serial_node.copy() + for list_attr in ('addressTypes', 'sectionNames', 'lengths', 'referenceableSections'): + if list_attr not in serial_node: + continue + truncated_serial_node[list_attr] = serial_node[list_attr][depth:] + return truncated_serial_node class ReferenceableBookNode: @@ -20,6 +62,10 @@ def get_children(self, *args, **kwargs) -> List['ReferenceableBookNode']: def is_default(self) -> bool: return False + @property + def referenceable(self) -> bool: + return True + class NamedReferenceableBookNode(ReferenceableBookNode): @@ -29,6 +75,10 @@ def __init__(self, titled_tree_node_or_index: Union[schema.TitledTreeNode, text. 
if isinstance(titled_tree_node_or_index, text.Index): self._titled_tree_node = titled_tree_node_or_index.nodes + @property + def referenceable(self): + return getattr(self._titled_tree_node, 'referenceable', not self.is_default()) + def is_default(self): return self._titled_tree_node.is_default() @@ -38,22 +88,42 @@ def get_numeric_equivalent(self): def ref(self) -> text.Ref: return self._titled_tree_node.ref() + @staticmethod + def _is_array_map_referenceable(node: schema.ArrayMapNode) -> bool: + if not getattr(node, "isMapReferenceable", True): + return False + if getattr(node, "refs", None): + return True + if getattr(node, "wholeRef", None) and getattr(node, "includeSections", None): + return True + return False + def _get_all_children(self) -> List[ReferenceableBookNode]: thingy = self._titled_tree_node_or_index - #the schema node for this referenceable node has a dibur hamatchil child + # the schema node for this referenceable node has a dibur hamatchil child if isinstance(thingy, schema.NumberedTitledTreeNode) and thingy.is_segment_level_dibur_hamatchil(): return [DiburHamatchilNodeSet({"container_refs": self.ref().normal()})] - #the schema node for this referenceable is a JAN. JANs act as both named and numbered nodes + # the schema node for this referenceable is a JAN. JANs act as both named and numbered nodes if isinstance(thingy, schema.JaggedArrayNode) and len(thingy.children) == 0: return [NumberedReferenceableBookNode(thingy)] if isinstance(thingy, text.Index): children = thingy.referenceable_children() + elif isinstance(thingy, schema.ArrayMapNode): + if self._is_array_map_referenceable(thingy): + return [MapReferenceableBookNode(thingy)] + else: + index = thingy.ref().index + yo = NamedReferenceableBookNode(index) + return yo.get_children() else: # Any other type of TitledTreeNode children = self._titled_tree_node.children children = [self._transform_schema_node_to_referenceable(x) for x in children] return children + def _get_children_from_array_map_node(self, node: schema.ArrayMapNode) -> List[ReferenceableBookNode]: + pass + @staticmethod def _transform_schema_node_to_referenceable(schema_node: schema.TitledTreeNode) -> ReferenceableBookNode: if isinstance(schema_node, schema.JaggedArrayNode) and (schema_node.is_default() or schema_node.parent is None): @@ -84,27 +154,43 @@ class NumberedReferenceableBookNode(ReferenceableBookNode): def __init__(self, ja_node: schema.NumberedTitledTreeNode): self._ja_node = ja_node + @property + def referenceable(self): + return getattr(self._ja_node, 'referenceable', True) + def is_default(self): return self._ja_node.is_default() and self._ja_node.parent is not None def ref(self): return self._ja_node.ref() + def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, fromSections=None) -> Tuple[List[text.Ref], List[bool]]: + try: + possible_sections, possible_to_sections, addr_classes = self._address_class.get_all_possible_sections_from_string(lang, section_str, fromSections, strip_prefixes=True) + except (IndexError, TypeError, KeyError): + return [], [] + possible_subrefs = [] + can_match_out_of_order_list = [] + for sec, toSec, addr_class in zip(possible_sections, possible_to_sections, addr_classes): + try: + refined_ref = subref(initial_ref, sec) + if toSec != sec: + to_ref = subref(initial_ref, toSec) + refined_ref = refined_ref.to(to_ref) + possible_subrefs += [refined_ref] + can_match_out_of_order_list += [addr_class.can_match_out_of_order(lang, section_str)] + except (InputError, IndexError, AssertionError, 
AttributeError): + continue + return possible_subrefs, can_match_out_of_order_list + @property - def address_class(self) -> schema.AddressType: + def _address_class(self) -> schema.AddressType: return self._ja_node.address_class(0) @property - def section_name(self) -> str: + def _section_name(self) -> str: return self._ja_node.sectionNames[0] - def get_all_possible_sections_from_string(self, *args, **kwargs): - """ - wraps AddressType function with same name - @return: - """ - return self.address_class.get_all_possible_sections_from_string(*args, **kwargs) - def _get_next_referenceable_depth(self): if self.is_default(): return 0 @@ -126,7 +212,7 @@ def get_children(self, context_ref=None, **kwargs) -> [ReferenceableBookNode]: if serial['depth'] <= 1 and self._ja_node.is_segment_level_dibur_hamatchil(): return [DiburHamatchilNodeSet({"container_refs": context_ref.normal()})] if (self._ja_node.depth - next_referenceable_depth) == 0: - if isinstance(self.address_class, schema.AddressTalmud): + if isinstance(self._address_class, schema.AddressTalmud): serial['addressTypes'] = ["Amud"] serial['sectionNames'] = ["Amud"] serial['lengths'] = [1] @@ -134,10 +220,7 @@ def get_children(self, context_ref=None, **kwargs) -> [ReferenceableBookNode]: else: return [] else: - for list_attr in ('addressTypes', 'sectionNames', 'lengths', 'referenceableSections'): - # truncate every list attribute by `next_referenceable_depth` - if list_attr not in serial: continue - serial[list_attr] = serial[list_attr][next_referenceable_depth:] + serial = truncate_serialized_node_to_depth(serial, next_referenceable_depth) new_ja = schema.JaggedArrayNode(serial=serial, index=getattr(self, 'index', None), **kwargs) return [NumberedReferenceableBookNode(new_ja)] @@ -145,11 +228,96 @@ def matches_section_context(self, section_context: 'SectionContext') -> bool: """ Does the address in `self` match the address in `section_context`? 
""" - if self.address_class.__class__ != section_context.addr_type.__class__: return False - if self.section_name != section_context.section_name: return False + if self._address_class.__class__ != section_context.addr_type.__class__: return False + if self._section_name != section_context.section_name: return False return True +class MapReferenceableBookNode(NumberedReferenceableBookNode): + """ + Node that can only be referenced by refs in a mapping + """ + + def __init__(self, node: schema.ArrayMapNode): + ja_node = self.__make_ja_from_array_map(node) + super().__init__(ja_node) + self._section_ref_map = self.__make_section_ref_map(node) + + @staticmethod + def __make_ja_from_array_map(node: schema.ArrayMapNode): + return MapReferenceableBookNode.__make_ja(**MapReferenceableBookNode.__get_ja_attributes_from_array_map(node)) + + @staticmethod + def __make_ja(addressTypes: List[str], sectionNames: List[str], **ja_node_attrs): + return schema.JaggedArrayNode(serial={ + "addressTypes": addressTypes, + "sectionNames": sectionNames, + **ja_node_attrs, + "depth": len(addressTypes), + }) + + @staticmethod + def __get_ja_attributes_from_array_map(node: schema.ArrayMapNode) -> dict: + if getattr(node, 'refs', None): + address_types = node.addressTypes + section_names = node.sectionNames + return {"addressTypes": address_types, "sectionNames": section_names} + elif getattr(node, 'wholeRef', None) and getattr(node, 'includeSections', False): + whole_ref = text.Ref(node.wholeRef) + schema_node = whole_ref.index_node.serialize() + return truncate_serialized_node_to_depth(schema_node, -2) + else: + return {} + + def __make_section_ref_map(self, node: schema.ArrayMapNode) -> Dict[int, text.Ref]: + if getattr(node, 'refs', None): + section_ref_map = { + self.__get_section_with_offset(ichild, node): text.Ref(tref) + for ichild, tref in enumerate(node.refs) + } + elif getattr(node, 'wholeRef', None) and getattr(node, 'includeSections', False): + whole_ref = text.Ref(node.wholeRef) + refs = whole_ref.split_spanning_ref() + section_ref_map = {} + for oref in refs: + section = oref.section_ref().sections[0] + section_ref_map[section] = oref + else: + raise Exception("ArrayMapNode doesn't have expected attributes 'refs' or 'wholeRef'.") + return section_ref_map + + def __get_section_with_offset(self, i: int, node: schema.ArrayMapNode) -> int: + addresses = getattr(node, "addresses", None) + if addresses: + return addresses[i] + section = i + 1 + starting_address = getattr(node, "startingAddress", None) + if starting_address: + section = i + self._address_class.toNumber("en", starting_address) + skipped_addresses = getattr(node, "skipped_addresses", None) + if skipped_addresses: + skipped_addresses.sort() + section += bisect_right(skipped_addresses, section) + return section + + def ref(self): + return self._ref + + def possible_subrefs(self, lang: str, initial_ref: text.Ref, section_str: str, fromSections=None) -> Tuple[List[text.Ref], List[bool]]: + try: + possible_sections, possible_to_sections, addr_classes = self._address_class.\ + get_all_possible_sections_from_string(lang, section_str, fromSections, strip_prefixes=True) + except (IndexError, TypeError, KeyError): + return [], [] + # map sections to equivalent refs in section_ref_map + mapped_refs = [] + for sec, to_sec in zip(possible_sections, possible_to_sections): + mapped_ref = self._section_ref_map.get(sec) + if mapped_ref and sec == to_sec: + mapped_refs += [mapped_ref] + return mapped_refs, [True]*len(mapped_refs) + + @dataclasses.dataclass class 
DiburHamatchilMatch: score: float diff --git a/sefaria/model/linker/resolved_ref_refiner.py b/sefaria/model/linker/resolved_ref_refiner.py index 5219d168ce..f71d40cb27 100644 --- a/sefaria/model/linker/resolved_ref_refiner.py +++ b/sefaria/model/linker/resolved_ref_refiner.py @@ -8,16 +8,6 @@ from sefaria.model.text import Ref -def subref(ref: Ref, section: int): - if ref.index_node.addressTypes[len(ref.sections)-1] == "Talmud": - d = ref._core_dict() - d['sections'][-1] += (section-1) - d['toSections'] = d['sections'][:] - return Ref(_obj=d) - else: - return ref.subref(section) - - class ResolvedRefRefiner(ABC): def __init__(self, part_to_match: RawRefPart, node: ReferenceableBookNode, resolved_ref: 'ResolvedRef'): @@ -36,7 +26,7 @@ def _has_prev_unused_numbered_ref_part(self) -> bool: Helper function to avoid matching AddressInteger sections out of order Returns True if there is a RawRefPart which immediately precedes `raw_ref_part` and is not yet included in this match """ - prev_part = self.resolved_ref.raw_ref.prev_num_parts_map.get(self.part_to_match, None) + prev_part = self.resolved_ref.raw_entity.prev_num_parts_map.get(self.part_to_match, None) if prev_part is None: return False return prev_part not in set(self.resolved_ref.resolved_parts) @@ -88,14 +78,10 @@ def __refine_context_full(self) -> List['ResolvedRef']: def __refine_context_free(self, lang: str, fromSections=None) -> List['ResolvedRef']: if self.node is None: return [] - try: - possible_sections, possible_to_sections, addr_classes = self.node.get_all_possible_sections_from_string(lang, self.part_to_match.text, fromSections, strip_prefixes=True) - except (IndexError, TypeError, KeyError): - return [] + possible_subrefs, can_match_out_of_order_list = self.node.possible_subrefs(lang, self.resolved_ref.ref, self.part_to_match.text, fromSections) refined_refs = [] - addr_classes_used = [] - for sec, toSec, addr_class in zip(possible_sections, possible_to_sections, addr_classes): - if self._has_prev_unused_numbered_ref_part() and not addr_class.can_match_out_of_order(lang, self.part_to_match.text): + for refined_ref, can_match_out_of_order in zip(possible_subrefs, can_match_out_of_order_list): + if self._has_prev_unused_numbered_ref_part() and not can_match_out_of_order: """ If raw_ref has NUMBERED parts [a, b] and part b matches before part a @@ -103,16 +89,7 @@ def __refine_context_free(self, lang: str, fromSections=None) -> List['ResolvedR discard match because AddressInteger parts need to match in order """ continue - try: - refined_ref = subref(self.resolved_ref.ref, sec) - if toSec != sec: - to_ref = subref(self.resolved_ref.ref, toSec) - refined_ref = refined_ref.to(to_ref) - refined_refs += [refined_ref] - addr_classes_used += [addr_class] - except (InputError, IndexError, AssertionError, AttributeError): - continue - + refined_refs += [refined_ref] return [self._clone_resolved_ref(resolved_parts=self._get_resolved_parts(), node=self.node, ref=refined_ref) for refined_ref in refined_refs] diff --git a/sefaria/model/linker/resolved_ref_refiner_factory.py b/sefaria/model/linker/resolved_ref_refiner_factory.py index 864d1b01d8..633b74705b 100644 --- a/sefaria/model/linker/resolved_ref_refiner_factory.py +++ b/sefaria/model/linker/resolved_ref_refiner_factory.py @@ -1,5 +1,5 @@ from sefaria.model.linker.ref_part import RawRefPart, RefPartType -from sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode +from 
sefaria.model.linker.referenceable_book_node import ReferenceableBookNode, NamedReferenceableBookNode, NumberedReferenceableBookNode, MapReferenceableBookNode from sefaria.model.linker.resolved_ref_refiner import ResolvedRefRefinerForDefaultNode, ResolvedRefRefinerForNumberedPart, ResolvedRefRefinerForDiburHamatchilPart, ResolvedRefRefinerForRangedPart, ResolvedRefRefinerForNamedNode, ResolvedRefRefiner, ResolvedRefRefinerCatchAll @@ -48,6 +48,7 @@ def initialize_resolved_ref_refiner_factory() -> ResolvedRefRefinerFactory: refiners_to_register = [ (key(is_default=True), ResolvedRefRefinerForDefaultNode), (key(RefPartType.NUMBERED, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForNumberedPart), + (key(RefPartType.NUMBERED, node_class=MapReferenceableBookNode), ResolvedRefRefinerForNumberedPart), (key(RefPartType.RANGE, node_class=NumberedReferenceableBookNode), ResolvedRefRefinerForRangedPart), (key(RefPartType.NAMED, node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), (key(RefPartType.NUMBERED, node_class=NamedReferenceableBookNode), ResolvedRefRefinerForNamedNode), diff --git a/sefaria/model/linker/tests/linker_test.py b/sefaria/model/linker/tests/linker_test.py index 75397f1280..165593cbb9 100644 --- a/sefaria/model/linker/tests/linker_test.py +++ b/sefaria/model/linker/tests/linker_test.py @@ -8,8 +8,6 @@ if not ENABLE_LINKER: pytest.skip("Linker not enabled", allow_module_level=True) -ref_resolver = library.get_ref_resolver() - def test_referenceable_child(): i = library.get_index("Rashi on Berakhot") @@ -32,6 +30,8 @@ def test_resolved_raw_ref_clone(): @pytest.mark.parametrize(('resolver_data', 'expected_trefs'), [ # Numbered JAs + [crrd(["@Jerusalem", "@Talmud", "@Yoma", "#5a"], lang='en'), ("Jerusalem Talmud Yoma 1:1:20-25",)], + [crrd(["@Babylonian", "@Talmud", "@Sukkah", "#49b"], lang='en'), ("Sukkah 49b",)], [crrd(["@בבלי", "@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@ברכות", "#דף ב"]), ("Berakhot 2",)], # amud-less talmud [crrd(["@בבלי", "@שבת", "#דף ב."]), ("Shabbat 2a",)], # amud-ful talmud @@ -50,6 +50,11 @@ def test_resolved_raw_ref_clone(): [crrd(['@שבת', '#א', '#ב']), ["Mishnah Shabbat 1:2"]], # shouldn't match Shabbat 2a by reversing order of parts [crrd(['@שבת', '#ב', '#א']), ["Shabbat 2a", "Mishnah Shabbat 2:1"]], # ambiguous case + # Parsha -> sections + [crrd(["@Parshat Vayikra", "#2", "#3"], lang='en'), ('Leviticus 2:3',)], + [crrd(["@Parshat Tzav", "#2", "#3"], lang='en'), tuple()], # validate that sections fall within parsha + pytest.param(crrd(["@Parshat Noach", "#6", "#3"], lang='en'), tuple(), marks=pytest.mark.xfail(reason="currently dont check if pasuk/perek pair fall in parsha, only perek")), + # Aliases for perakim [crrd(["@משנה", "@ברכות", "#פרק קמא"]), ("Mishnah Berakhot 1",)], [crrd(["@משנה", "@ברכות", "#פרק בתרא"]), ("Mishnah Berakhot 9",)], @@ -59,7 +64,7 @@ def test_resolved_raw_ref_clone(): # Named alt structs [crrd(["@פרק אלו דברים", "@בפסחים"]), ("Pesachim 65b:10-73b:16",)], # talmud perek (that's ambiguous) - [crrd(["@פרק אלו דברים"]), ("Pesachim 65b:10-73b:16", "Berakhot 51b:11-53b:33")], # talmud perek without book that's ambiguous + [crrd(["@פרק אלו דברים"]), ("Pesachim 65b:10-73b:16", "Berakhot 51b:11-53b:33", "Jerusalem Talmud Berakhot 8:1:1-8:7", "Jerusalem Talmud Pesachim 6:1:1-6:4", "Jerusalem Talmud Demai 2:1:1-5:4")], # talmud perek without book that's ambiguous [crrd(["@רש\"י", "@פרק יום טוב", "@בביצה"]), ("Rashi on Beitzah 15b:1-23b:10",)], # rashi perek [crrd(["@רש\"י", "@פרק 
מאימתי"]), ("Rashi on Berakhot 2a:1-13a:15",)], # rashi perek [crrd(["@רש\"י", "@פרק כל כנויי נזירות", "@בנזיר", "*ד\"ה כל כינויי נזירות"]), ("Rashi on Nazir 2a:1:1",)], # rashi perek dibur hamatchil @@ -72,6 +77,16 @@ def test_resolved_raw_ref_clone(): [crrd(["#פרק בתרא", "@בפסחים"]), ("Mishnah Pesachim 10", "Pesachim 99b:1-121b:3")], # numbered talmud perek [crrd(['@מגמ\'', '#דרפ\"ו', '@דנדה']), ("Niddah 48a:11-54b:9",)], # prefixes in front of perek name + # Using addressTypes of alt structs + [crrd(["@JT", "@Bikkurim", "#Chapter 2"], lang="en"), ("Jerusalem Talmud Bikkurim 2",)], + [crrd(["@Tosafot Rabbi Akiva Eiger", "@Shabbat", "#Letter 87"], lang="en"), ("Tosafot Rabbi Akiva Eiger on Mishnah Shabbat 7.2.1",)], + [crrd(["@JT", "@Berakhot", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:2-4", "Jerusalem Talmud Berakhot 1:1:7-11",)], # ambig b/w Venice and Vilna + [crrd(["@JT", "@Berakhot", "#Chapter 1", "#2a"], lang="en"), ("Jerusalem Talmud Berakhot 1:1:2-4", "Jerusalem Talmud Berakhot 1:1:7-11",)], + [crrd(["@JT", "@Peah", "#10b"], lang="en"), ("Jerusalem Talmud Peah 2:1:1-4",)], # Venice not ambig + [crrd(["@JT", "@Peah", "#Chapter 3", "#15b"], lang="en"), ("Jerusalem Talmud Peah 3:2:4-4:3",)], # Venice not ambig because of chapter + [crrd(["@JT", "@Peah", "#15c"], lang="en"), ("Jerusalem Talmud Peah 1:1:20-30",)], # Folio address + [crrd(["@Chapter 1"], lang="en"), tuple()], # It used to be that Bavli perakim where Chapter N which causes problems for global scope + # Dibur hamatchils [crrd(["@רש\"י", "@יום טוב", "*ד\"ה שמא יפשע"]), ("Rashi on Beitzah 15b:8:1",)], [crrd(["@רש\"י", "@ביצה", "*ד\"ה שמא יפשע"]), ("Rashi on Beitzah 15b:8:1",)], @@ -90,8 +105,12 @@ def test_resolved_raw_ref_clone(): # Base text context [crrd(['@ובתוס\'', '#דכ"ז ע"ב', '*ד"ה והלכתא'], "Rashi on Berakhot 2a"), ("Tosafot on Berakhot 27b:14:2",)], # shared context child via graph context + # Mis-classified part types + [crrd(['@ושו"ע', "#אה״ע", "#סי׳ כ״ח", "#סעיף א"]), ("Shulchan Arukh, Even HaEzer 28:1",)], + # Ibid [crrd(['&שם', '#ז'], prev_trefs=["Genesis 1"]), ["Genesis 7", "Genesis 1:7"]], # ambiguous ibid + [crrd(['&Ibid', '#12'], prev_trefs=["Exodus 1:7"], lang='en'), ["Exodus 1:12", "Exodus 12"]], # ambiguous ibid when context is segment level (not clear if this is really ambiguous. 
maybe should only have segment level result) [crrd(['#ב'], prev_trefs=["Genesis 1"]), ["Genesis 1:2", "Genesis 2"]], # ambiguous ibid [crrd(['#ב', '#ז'], prev_trefs=["Genesis 1:3", "Exodus 1:3"]), ["Genesis 2:7", "Exodus 2:7"]], [crrd(['@בראשית', '&שם', '#ז'], prev_trefs=["Exodus 1:3", "Genesis 1:3"]), ["Genesis 1:7"]], @@ -119,6 +138,7 @@ def test_resolved_raw_ref_clone(): [crrd(['<לקמן', '#משנה א'], "Mishnah Berakhot 1", prev_trefs=['Mishnah Shabbat 1']), ("Mishnah Berakhot 1:1",)], # competing relative and sham # Superfluous information + [crrd(['@Vayikra', '@Leviticus', '#1'], lang='en'), ("Leviticus 1",)], [crrd(['@תוספות', '#פרק קמא', '@דברכות', '#דף ב']), ['Tosafot on Berakhot 2']], # YERUSHALMI EN @@ -160,8 +180,13 @@ def test_resolved_raw_ref_clone(): [crrd(['@טור יורה דעה', '#סימן א']), ['Tur, Yoreh Deah 1']], [crrd(['@תוספתא', '@ברכות', '#א', '#א']), ['Tosefta Berakhot 1:1', 'Tosefta Berakhot (Lieberman) 1:1']], # tosefta ambiguity [crrd(['@תוספתא', '@ברכות', '#א', '#טז']), ['Tosefta Berakhot 1:16']], # tosefta ambiguity - [crrd(['@זוה"ק', '#ח"א', '#דף פג:']), ['Zohar 1:83b']], - # pytest.param(crrd(None, 'he', 'זוהר שמות י.', [0, 1, slice(2, 4)], [RPT.NAMED, RPT.NAMED, RPT.NUMBERED]), ['Zohar 2:10a'], marks=pytest.mark.xfail(reason="Don't support Sefer HaTashbetz yet")), # infer Zohar volume from parasha + + # zohar + [crrd(['@זוה"ק', '#ח"א','@לך לך', '@סתרי תורה', '#דף פ.']), ['Zohar, Lech Lecha 10.78-84']], + [crrd(['@זוה"ק', '#ח"א','@לך לך', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זוה"ק', '#ח"א', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זוה"ק', '@לך לך', '#דף פג:']), ['Zohar, Lech Lecha 17.152-18.165']], + [crrd(['@זהר חדש', '@בראשית']), ['Zohar Chadash, Bereshit']], [crrd(['@מסכת', '@סופרים', '#ב', '#ג']), ['Tractate Soferim 2:3']], [crrd(['@אדר"נ', '#ב', '#ג']), ["Avot D'Rabbi Natan 2:3"]], @@ -233,8 +258,10 @@ def test_resolved_raw_ref_clone(): [crrd(['@Rashi on Genesis', '#1', '#1', '#1'], lang='en'), ["Rashi on Genesis 1:1:1"]], ]) def test_resolve_raw_ref(resolver_data, expected_trefs): - ref_resolver.reset_ibid_history() # reset from previous test runs raw_ref, context_ref, lang, prev_trefs = resolver_data + linker = library.get_linker(lang) + ref_resolver = linker._ref_resolver + ref_resolver.reset_ibid_history() # reset from previous test runs if prev_trefs: for prev_tref in prev_trefs: if prev_tref is None: @@ -243,7 +270,7 @@ def test_resolve_raw_ref(resolver_data, expected_trefs): ref_resolver._ibid_history.last_refs = Ref(prev_tref) print_spans(raw_ref) ref_resolver.set_thoroughness(ResolutionThoroughness.HIGH) - matches = ref_resolver.resolve_raw_ref(lang, context_ref, raw_ref) + matches = ref_resolver.resolve_raw_ref(context_ref, raw_ref) matched_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in matches], []), key=lambda x: x.normal()) if len(expected_trefs) != len(matched_orefs): print(f"Found {len(matched_orefs)} refs instead of {len(expected_trefs)}") @@ -265,7 +292,9 @@ class TestResolveRawRef: ]) def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_trefs, expected_pretty_texts): context_oref = context_tref and Ref(context_tref) - resolved = ref_resolver.bulk_resolve_refs(lang, [context_oref], [input_str])[0] + linker = library.get_linker(lang) + doc = linker.link(input_str, context_oref, type_filter='citation') + resolved = doc.resolved_refs assert len(resolved) == 
len(expected_trefs) resolved_orefs = sorted(reduce(lambda a, b: a + b, [[match.ref] if not match.is_ambiguous else [inner_match.ref for inner_match in match.resolved_raw_refs] for match in resolved], []), key=lambda x: x.normal()) if len(expected_trefs) != len(resolved_orefs): @@ -275,7 +304,7 @@ def test_full_pipeline_ref_resolver(context_tref, input_str, lang, expected_tref for expected_tref, matched_oref in zip(sorted(expected_trefs, key=lambda x: x), resolved_orefs): assert matched_oref == Ref(expected_tref) for match, expected_pretty_text in zip(resolved, expected_pretty_texts): - assert input_str[slice(*match.raw_ref.char_indices)] == match.raw_ref.text + assert input_str[slice(*match.raw_entity.char_indices)] == match.raw_entity.text assert match.pretty_text == expected_pretty_text @@ -306,7 +335,7 @@ def test_get_all_possible_sections_from_string(input_addr_str, AddressClass, exp ]) def test_group_ranged_parts(raw_ref_params, expected_section_slices): lang, raw_ref_parts, span = raw_ref_params - raw_ref = RawRef(lang, raw_ref_parts, span) + raw_ref = RawRef(span, lang, raw_ref_parts) exp_sec_slice, exp2sec_slice = expected_section_slices if exp_sec_slice is None: expected_raw_ref_parts = raw_ref_parts @@ -384,17 +413,18 @@ def test_map_new_indices(crrd_params): # unnorm data raw_ref, _, lang, _ = crrd(*crrd_params) text = raw_ref.text - doc = ref_resolver.get_raw_ref_model(lang).make_doc(text) + linker = library.get_linker(lang) + nlp = linker.get_ner().named_entity_model + doc = nlp.make_doc(text) indices = raw_ref.char_indices part_indices = [p.char_indices for p in raw_ref.raw_ref_parts] print_spans(raw_ref) # norm data - n = ref_resolver._normalizer - norm_text = n.normalize(text, lang=lang) - norm_doc = ref_resolver.get_raw_ref_model(lang).make_doc(norm_text) - mapping = n.get_mapping_after_normalization(text, reverse=True, lang=lang) - norm_part_indices = n.convert_normalized_indices_to_unnormalized_indices(part_indices, mapping, reverse=True) + n = linker.get_ner()._normalizer + norm_text = n.normalize(text) + norm_doc = nlp.make_doc(norm_text) + norm_part_indices = n.norm_to_unnorm_indices(text, part_indices, reverse=True) norm_part_spans = [norm_doc.char_span(s, e) for (s, e) in norm_part_indices] norm_part_token_inds = [] for span in norm_part_spans: @@ -409,7 +439,8 @@ def test_map_new_indices(crrd_params): # test assert norm_raw_ref.text == norm_text.strip() - norm_raw_ref.map_new_indices(doc, indices, part_indices) + norm_raw_ref.map_new_char_indices(doc, indices) + norm_raw_ref.map_new_part_char_indices(part_indices) assert norm_raw_ref.text == raw_ref.text for norm_part, part in zip(norm_raw_ref.raw_ref_parts, raw_ref.raw_ref_parts): assert norm_part.text == part.text diff --git a/sefaria/model/linker/tests/linker_test_utils.py b/sefaria/model/linker/tests/linker_test_utils.py index 0651510b87..fa511dc25a 100644 --- a/sefaria/model/linker/tests/linker_test_utils.py +++ b/sefaria/model/linker/tests/linker_test_utils.py @@ -8,8 +8,6 @@ if not ENABLE_LINKER: pytest.skip("Linker not enabled", allow_module_level=True) -ref_resolver = library.get_ref_resolver() - class RefPartTypeNone: """ @@ -58,7 +56,7 @@ def get_symbol_by_part_type(part_type): @staticmethod def convert_to_raw_encoded_part_list(lang, text, span_inds, part_types): - nlp = ref_resolver.get_raw_ref_part_model(lang) + nlp = library.get_linker(lang).get_ner().raw_ref_part_model doc = nlp.make_doc(text) span = doc[0:] raw_encoded_part_list = [] @@ -104,7 +102,7 @@ def part_types(self): @property def 
span(self): if not self._span: - nlp = ref_resolver.get_raw_ref_part_model(self.lang) + nlp = library.get_linker(self.lang).get_ner().raw_ref_part_model doc = nlp.make_doc(self.input_str) self._span = doc[0:] return self._span @@ -131,7 +129,7 @@ def raw_ref_parts(self): return raw_ref_parts def get_raw_ref_params(self): - return self.lang, self.raw_ref_parts, self.span + return self.span, self.lang, self.raw_ref_parts def print_debug_info(self): print('Input:', self.input_str) diff --git a/sefaria/model/linker/tests/named_entity_resolver_tests.py b/sefaria/model/linker/tests/named_entity_resolver_tests.py new file mode 100644 index 0000000000..c207c7f8f4 --- /dev/null +++ b/sefaria/model/linker/tests/named_entity_resolver_tests.py @@ -0,0 +1,14 @@ +import pytest +from sefaria.model.linker.named_entity_resolver import NamedEntityTitleGenerator, PersonTitleGenerator + + +@pytest.mark.parametrize(('title', 'expected_output'), [ + ['Rabbi b. Ben', ['Rabbi b. Ben', 'Rabbi ben Ben', 'Rabbi bar Ben', 'Rabbi, son of Ben', 'Rabbi, the son of Ben', + 'Rabbi son of Ben', 'Rabbi the son of Ben', 'Rabbi Bar Ben', 'Rabbi Ben Ben', 'R. b. Ben']], + ['Rabbi ben Ben', ['R. ben Ben', 'Rabbi ben Ben']], + ['Bar Kochba', ['Bar Kochba', 'bar Kochba']], +]) +def test_person_title_generator(title, expected_output): + expected_output = sorted(expected_output) + actual_output = sorted(PersonTitleGenerator.generate(title)) + assert actual_output == expected_output diff --git a/sefaria/model/place.py b/sefaria/model/place.py index dd42ff7f47..c5f24ac58c 100644 --- a/sefaria/model/place.py +++ b/sefaria/model/place.py @@ -112,7 +112,7 @@ def process_index_place_change(indx, **kwargs): def process_topic_place_change(topic_obj, **kwargs): keys = ["birthPlace", "deathPlace"] for key in keys: - if key in kwargs.keys(): # only change property value if key is in data, otherwise it indicates no change + if kwargs.get(key, False): # only change property value if key is in data, otherwise it indicates no change new_val = kwargs[key] if new_val != '': he_key = get_he_key(key) diff --git a/sefaria/model/schema.py b/sefaria/model/schema.py index c89233c743..77cb1f4f07 100644 --- a/sefaria/model/schema.py +++ b/sefaria/model/schema.py @@ -1125,6 +1125,19 @@ def is_segment_level_dibur_hamatchil(self) -> bool: return getattr(self, 'isSegmentLevelDiburHamatchil', False) +class AltStructNode(TitledTreeNode): + """ + Structural node for alt structs + Allows additional attributes for referencing these nodes with the linker + Note, these nodes can't be the end of a reference since they themselves don't map to a `Ref`. But they are helpful + being intermediate nodes in a longer reference. + """ + optional_param_keys = ["match_templates", "numeric_equivalent", 'referenceable'] + + def ref(self): + return None + + class ArrayMapNode(NumberedTitledTreeNode): """ A :class:`TreeNode` that contains jagged arrays of references. 
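To make the new schema keys concrete, here is a minimal hand-written sketch of a serialized ArrayMapNode carrying the linker-related optional keys listed in the next hunk, in the shape that _is_array_map_referenceable and MapReferenceableBookNode above consume. Every title, ref and value below is invented for illustration and does not come from the Sefaria library.

example_alt_struct_serial = {
    "depth": 1,
    "wholeRef": "Some Tractate 2a:1-76b:5",        # required key; invented range
    "addressTypes": ["Integer"],
    "sectionNames": ["Chapter"],
    "refs": [                                       # one mapped ref per section; all invented
        "Some Tractate 2a:1-13a:4",
        "Some Tractate 13a:5-17b:9",
    ],
    "startingAddress": "3",      # __get_section_with_offset shifts indices via the address class's toNumber()
    "skipped_addresses": [5],    # gaps in the mapping, accounted for with bisect_right
    "isMapReferenceable": True,  # False makes _is_array_map_referenceable skip this node as a map target
    "referenceable": True,
}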
@@ -1132,7 +1145,7 @@ class ArrayMapNode(NumberedTitledTreeNode): (e.g., Parsha structures of chapter/verse stored Tanach, or Perek structures of Daf/Line stored Talmud) """ required_param_keys = ["depth", "wholeRef"] - optional_param_keys = ["lengths", "addressTypes", "sectionNames", "refs", "includeSections", "startingAddress", "match_templates", "numeric_equivalent", "referenceableSections", "isSegmentLevelDiburHamatchil", "diburHamatchilRegexes", 'referenceable', "addresses", "skipped_addresses"] # "addressTypes", "sectionNames", "refs" are not required for depth 0, but are required for depth 1 + + optional_param_keys = ["lengths", "addressTypes", "sectionNames", "refs", "includeSections", "startingAddress", "match_templates", "numeric_equivalent", "referenceableSections", "isSegmentLevelDiburHamatchil", "diburHamatchilRegexes", 'referenceable', "addresses", "skipped_addresses", "isMapReferenceable"] # "addressTypes", "sectionNames", "refs" are not required for depth 0, but are required for depth 1 + has_key = False # This is not used as schema for content def get_ref_from_sections(self, sections): diff --git a/sefaria/model/tests/text_test.py b/sefaria/model/tests/text_test.py index 860a68edf3..75536b1ea4 100644 --- a/sefaria/model/tests/text_test.py +++ b/sefaria/model/tests/text_test.py @@ -327,15 +327,6 @@ def test_merge(): def test_text_helpers(): - res = model.library.get_dependant_indices() - assert 'Rashbam on Genesis' in res - assert 'Rashi on Bava Batra' in res - assert 'Bartenura on Mishnah Oholot' in res - assert 'Onkelos Leviticus' in res - assert 'Chizkuni' in res - assert 'Akeidat Yitzchak' not in res - assert 'Berakhot' not in res - res = model.library.get_indices_by_collective_title("Rashi") assert 'Rashi on Bava Batra' in res assert 'Rashi on Genesis' in res @@ -346,46 +337,61 @@ def test_text_helpers(): assert 'Bartenura on Mishnah Oholot' in res assert 'Rashbam on Genesis' not in res - res = model.library.get_dependant_indices(book_title="Exodus") - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - assert 'Meshekh Chokhmah' in res - assert 'Abarbanel on Torah' in res - assert 'Targum Jonathan on Exodus' in res - assert 'Onkelos Exodus' in res - assert 'Harchev Davar on Exodus' in res - - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - - res = model.library.get_dependant_indices(book_title="Exodus", dependence_type='Commentary') - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - assert 'Meshekh Chokhmah' in res - assert 'Abarbanel on Torah' in res - assert 'Harchev Davar on Exodus' in res - - assert 'Targum Jonathan on Exodus' not in res - assert 'Onkelos Exodus' not in res - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - - res = model.library.get_dependant_indices(book_title="Exodus", dependence_type='Commentary', structure_match=True) - assert 'Ibn Ezra on Exodus' in res - assert 'Ramban on Exodus' in res - - assert 'Harchev Davar on Exodus' not in res - assert 'Meshekh Chokhmah' not in res - assert 'Abarbanel on Torah' not in res - assert 'Exodus' not in res - assert 'Rashi on Genesis' not in res - cats = model.library.get_text_categories() assert 'Tanakh' in cats assert 'Torah' in cats assert 'Prophets' in cats assert 'Commentary' in cats +@pytest.mark.parametrize(('book_title', 'dependence_type', 'structure_match', 'expected_titles', 'not_expected_titles'), [ + [None, None, False, [ + 'Rashbam on Genesis', + 'Rashi on Bava Batra', + 'Bartenura on Mishnah Oholot', + 'Onkelos 
Leviticus', + 'Chizkuni', + ], [ + 'Akeidat Yitzchak', + 'Berakhot'] + ], + ['Exodus', None, False, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus', + 'Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Harchev Davar on Exodus' + ], ['Exodus', + 'Rashi on Genesis'] + ], + ['Exodus', 'Commentary', False, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus', + 'Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Harchev Davar on Exodus' + ], ['Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Exodus', + 'Rashi on Genesis'] + ], + ['Exodus', 'Commentary', True, ['Ibn Ezra on Exodus; Perush HaArokh', + 'Ramban on Exodus' + ], ['Abarbanel on Torah', + 'Meshekh Chokhmah', + 'Targum Jonathan on Exodus', + 'Onkelos Exodus', + 'Harchev Davar on Exodus', + 'Exodus', + 'Rashi on Genesis'] + ], +]) +def test_get_dependent_indices(book_title, dependence_type, structure_match, expected_titles, not_expected_titles): + res = model.library.get_dependant_indices(book_title=book_title, dependence_type=dependence_type, structure_match=structure_match) + for title in expected_titles: + assert title in res + for title in not_expected_titles: + assert title not in res + def test_index_update(): ''' @@ -743,53 +749,85 @@ def setup_class(cls): "sectionNames": ["Chapter", "Paragraph"], "categories": ["Musar"], }).save() - cls.versionWithTranslation = model.Version( + cls.firstTranslationVersion = model.Version( { - "chapter": cls.myIndex.nodes.create_skeleton(), + "chapter": [['1'], ['2'], ["original text", "2nd"]], "versionTitle": "Version 1 TEST [fr]", "versionSource": "blabla", - "language": "he", + "language": "en", "title": cls.myIndexTitle } - ) - cls.versionWithTranslation.chapter = [['1'], ['2'], ["original text", "2nd"]] - cls.versionWithTranslation.save() - cls.versionWithoutTranslation = model.Version( + ).save() + cls.sourceVersion = model.Version( { - "chapter": cls.myIndex.nodes.create_skeleton(), + "chapter":cls.myIndex.nodes.create_skeleton(), "versionTitle": "Version 1 TEST", "versionSource": "blabla", "language": "he", "title": cls.myIndexTitle } ) - cls.versionWithoutTranslation.chapter = [['1'], ['2'], ["original text", "2nd"]] - cls.versionWithoutTranslation.save() + cls.sourceVersion.chapter = [['1'], ['2'], ["original text", "2nd"]] + cls.sourceVersion.save() cls.versionWithLangCodeMismatch = model.Version( { "chapter": cls.myIndex.nodes.create_skeleton(), "versionTitle": "Version 1 TEST [ar]", "versionSource": "blabla", - "language": "he", - "actualLanguage": "fr", + "language": "en", + 'actualLanguage': 'fr', "title": cls.myIndexTitle } ) @classmethod def teardown_class(cls): - for c in [cls.myIndex, cls.versionWithTranslation, cls.versionWithoutTranslation, cls.versionWithLangCodeMismatch]: + for c in [cls.myIndex, cls.sourceVersion, cls.firstTranslationVersion, cls.versionWithLangCodeMismatch]: try: c.delete() except Exception: pass - def test_normalizes_actualLanguage_from_brackets(self): - assert self.versionWithTranslation.actualLanguage == "fr" - - def test_normalizes_language_from_language(self): - assert self.versionWithoutTranslation.actualLanguage == "he" + def test_normalize(self): + expected_attrs = { + 'firstTranslationVersion': { + 'actualLanguage': 'fr', + 'direction': 'ltr', + 'languageFamilyName': 'french', + 'isPrimary': True, + 'isSource': False, + }, + 'sourceVersion': { + 'actualLanguage': 'he', + 'direction': 'rtl', + 'languageFamilyName': 'hebrew', + 'isPrimary': True, + 'isSource': True, + }, + 
'versionWithLangCodeMismatch': { + 'actualLanguage': 'fr', + 'direction': 'ltr', + 'languageFamilyName': 'french', + 'isPrimary': False, + 'isSource': False, + }, + } + self.versionWithLangCodeMismatch._normalize() + for version_key in expected_attrs: + version = getattr(self, version_key) + for attr in expected_attrs[version_key]: + assert getattr(version, attr) == expected_attrs[version_key][attr] + +@pytest.mark.parametrize(('text_with_html', 'text_without_html'), + [ + ["בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ", + "בְּרֵאשִׁ֖ית בָּרָ֣א אֱלֹהִ֑ים אֵ֥ת הַשָּׁמַ֖יִם וְאֵ֥ת הָאָֽרֶץ"], + [ + "Happy is the man who has not followed the counsel of the wicked,
<br>or taken the path of sinners,<br>
or joined the company of the insolent;", + "Happy is the man who has not followed the counsel of the wicked, or taken the path of sinners, or joined the company of the insolent;"] + ]) + +def test_remove_html(text_with_html, text_without_html): + assert model.TextChunk.remove_html(text_with_html) == text_without_html + - def test_save_when_language_mismatch(self): - self.versionWithLangCodeMismatch.save() - assert self.versionWithLangCodeMismatch.actualLanguage == "ar" \ No newline at end of file diff --git a/sefaria/model/tests/topic_test.py b/sefaria/model/tests/topic_test.py index 59f49d205b..56345624f1 100644 --- a/sefaria/model/tests/topic_test.py +++ b/sefaria/model/tests/topic_test.py @@ -3,6 +3,7 @@ from sefaria.model.text import Ref from sefaria.system.database import db from sefaria.system.exceptions import SluggedMongoRecordMissingError +from sefaria.helper.topic import update_topic def make_topic(slug): @@ -166,6 +167,8 @@ def test_sanitize(self): + + class TestTopicLinkHelper(object): def test_init_by_class(self, topic_graph): diff --git a/sefaria/model/text.py b/sefaria/model/text.py index da6ea12f6a..1255a7a6c3 100644 --- a/sefaria/model/text.py +++ b/sefaria/model/text.py @@ -19,7 +19,7 @@ from bs4 import BeautifulSoup, Tag import re2 as re from . import abstract as abst -from .schema import deserialize_tree, SchemaNode, VirtualNode, DictionaryNode, JaggedArrayNode, TitledTreeNode, DictionaryEntryNode, SheetNode, AddressTalmud, Term, TermSet, TitleGroup, AddressType +from .schema import deserialize_tree, AltStructNode, VirtualNode, DictionaryNode, JaggedArrayNode, TitledTreeNode, DictionaryEntryNode, SheetNode, AddressTalmud, Term, TermSet, TitleGroup, AddressType from sefaria.system.database import db import sefaria.system.cache as scache @@ -234,7 +234,7 @@ def _set_struct_objs(self): self.struct_objs = {} if getattr(self, "alt_structs", None) and self.nodes: for name, struct in list(self.alt_structs.items()): - self.struct_objs[name] = deserialize_tree(struct, index=self, struct_class=TitledTreeNode) + self.struct_objs[name] = deserialize_tree(struct, index=self, struct_class=AltStructNode) self.struct_objs[name].title_group = self.nodes.title_group def is_complex(self): @@ -1171,14 +1171,21 @@ def sanitize_text(cls, t): @staticmethod def remove_html(t): + + def conditional_replace(match): + tag = match.group() + if tag in ["
", "
"]: + return " " + return "" + if isinstance(t, list): for i, v in enumerate(t): if isinstance(v, str): - t[i] = re.sub('<[^>]+>', " ", v) + t[i] = re.sub('<[^>]+>', conditional_replace, v) else: t[i] = AbstractTextRecord.remove_html(v) elif isinstance(t, str): - t = re.sub('<[^>]+>', " ", t) + t = re.sub('<[^>]+>', conditional_replace, t) else: return False return t @@ -1297,7 +1304,12 @@ class Version(AbstractTextRecord, abst.AbstractMongoRecord, AbstractSchemaConten "title", # FK to Index.title "versionSource", "versionTitle", - "chapter" # required. change to "content"? + "chapter", # required. change to "content"? + "actualLanguage", # ISO language code + 'languageFamilyName', # full name of the language, but without specificity (for Judeo Arabic actualLanguage=jrb, languageFamilyName=arabic + 'isSource', # bool, True if this version is not a translation + 'isPrimary', # bool, True if we see it as a primary version (usually equals to isSource, but Hebrew Kuzarif or example is primary but not source) + 'direction', # 'rtl' or 'ltr' ] """ @@ -1323,12 +1335,6 @@ class Version(AbstractTextRecord, abst.AbstractMongoRecord, AbstractSchemaConten "purchaseInformationImage", "purchaseInformationURL", "hasManuallyWrappedRefs", # true for texts where refs were manually wrapped in a-tags. no need to run linker at run-time. - "actualLanguage", # ISO language code - 'languageFamilyName', # full name of the language, but without specificity (for Judeo Arabic actualLanguage=jrb, languageFamilyName=arabic - "isBaseText", # should be deprecated (needs some changes on client side) - 'isSource', # bool, True if this version is not a translation - 'isPrimary', # bool, True if we see it as a primary version (usually equals to isSource, but Hebrew Kuzarif or example is primary but not source) - 'direction', # 'rtl' or 'ltr' ] def __str__(self): @@ -1343,14 +1349,6 @@ def _validate(self): Old style database text record have a field called 'chapter' Version records in the wild have a field called 'text', and not always a field called 'chapter' """ - languageCodeRe = re.search(r"\[([a-z]{2})\]$", getattr(self, "versionTitle", None)) - if languageCodeRe and languageCodeRe.group(1) != getattr(self,"actualLanguage",None): - self.actualLanguage = languageCodeRe.group(1) - if not getattr(self, 'languageFamilyName', None): - try: - self.languageFamilyName = constants.LANGUAGE_CODES[self.actualLanguage] - except KeyError: - self.languageFamilyName = constants.LANGUAGE_CODES[self.language] if getattr(self,"language", None) not in ["en", "he"]: raise InputError("Version language must be either 'en' or 'he'") index = self.get_index() @@ -1386,16 +1384,20 @@ def _check_node_offsets(self, content, node): def _normalize(self): # add actualLanguage -- TODO: migration to get rid of bracket notation completely - actualLanguage = getattr(self, "actualLanguage", None) - versionTitle = getattr(self, "versionTitle", None) + actualLanguage = getattr(self, "actualLanguage", None) + versionTitle = getattr(self, "versionTitle", None) if not actualLanguage and versionTitle: languageCode = re.search(r"\[([a-z]{2})\]$", versionTitle) if languageCode and languageCode.group(1): - self.actualLanguage = languageCode.group(1) - else: - self.actualLanguage = self.language - - if not getattr(self, 'direction', None): + actualLanguage = languageCode.group(1) + self.actualLanguage = actualLanguage or self.language + + if not hasattr(self, 'languageFamilyName'): + self.languageFamilyName = constants.LANGUAGE_CODES.get(self.actualLanguage) or 
constants.LANGUAGE_CODES[self.language] + self.isSource = getattr(self, "isSource", self.actualLanguage == 'he') + if not hasattr(self, "isPrimary"): + self.isPrimary = self.isSource or not VersionSet({'title': self.title}) #first version is primary + if not hasattr(self, 'direction'): self.direction = 'rtl' if self.language == 'he' else 'ltr' if getattr(self, "priority", None): @@ -4949,7 +4951,7 @@ def __init__(self): self._simple_term_mapping = {} self._full_term_mapping = {} self._simple_term_mapping_json = None - self._ref_resolver = None + self._linker_by_lang = {} # Topics self._topic_mapping = {} @@ -5714,36 +5716,57 @@ def _build_topic_mapping(self): self._topic_mapping = {t.slug: {"en": t.get_primary_title("en"), "he": t.get_primary_title("he")} for t in TopicSet()} return self._topic_mapping - def get_ref_resolver(self, rebuild=False): - resolver = self._ref_resolver - if not resolver or rebuild: - resolver = self.build_ref_resolver() - return resolver + def get_linker(self, lang: str, rebuild=False): + linker = self._linker_by_lang.get(lang) + if not linker or rebuild: + linker = self.build_linker(lang) + return linker + + def build_linker(self, lang: str): + from sefaria.model.linker.linker import Linker + + logger.info("Loading Spacy Model") - def build_ref_resolver(self): + named_entity_resolver = self._build_named_entity_resolver(lang) + ref_resolver = self._build_ref_resolver(lang) + named_entity_recognizer = self._build_named_entity_recognizer(lang) + self._linker_by_lang[lang] = Linker(named_entity_recognizer, ref_resolver, named_entity_resolver) + return self._linker_by_lang[lang] + + @staticmethod + def _build_named_entity_resolver(lang: str): + from .linker.named_entity_resolver import TopicMatcher, NamedEntityResolver + + named_entity_types_to_topics = { + "PERSON": {"ontology_roots": ['people'], "single_slugs": ['god', 'the-tetragrammaton']}, + "GROUP": {'ontology_roots': ["group-of-people"]}, + } + return NamedEntityResolver(TopicMatcher(lang, named_entity_types_to_topics)) + + @staticmethod + def _build_named_entity_recognizer(lang: str): + from .linker.named_entity_recognizer import NamedEntityRecognizer + from sefaria.helper.linker import load_spacy_model + + return NamedEntityRecognizer( + lang, + load_spacy_model(RAW_REF_MODEL_BY_LANG_FILEPATH[lang]), + load_spacy_model(RAW_REF_PART_MODEL_BY_LANG_FILEPATH[lang]) + ) + + def _build_ref_resolver(self, lang: str): from .linker.match_template import MatchTemplateTrie from .linker.ref_resolver import RefResolver, TermMatcher from sefaria.model.schema import NonUniqueTermSet - from sefaria.helper.linker import load_spacy_model - - logger.info("Loading Spacy Model") root_nodes = list(filter(lambda n: getattr(n, 'match_templates', None) is not None, self.get_index_forest())) alone_nodes = reduce(lambda a, b: a + b.index.get_referenceable_alone_nodes(), root_nodes, []) non_unique_terms = NonUniqueTermSet() - self._ref_resolver = RefResolver( - {k: load_spacy_model(v) for k, v in RAW_REF_MODEL_BY_LANG_FILEPATH.items() if v is not None}, - {k: load_spacy_model(v) for k, v in RAW_REF_PART_MODEL_BY_LANG_FILEPATH.items() if v is not None}, - { - "en": MatchTemplateTrie('en', nodes=(root_nodes + alone_nodes), scope='alone'), - "he": MatchTemplateTrie('he', nodes=(root_nodes + alone_nodes), scope='alone') - }, - { - "en": TermMatcher('en', non_unique_terms), - "he": TermMatcher('he', non_unique_terms), - } + + return RefResolver( + lang, MatchTemplateTrie(lang, nodes=(root_nodes + alone_nodes), scope='alone'), + 
TermMatcher(lang, non_unique_terms), ) - return self._ref_resolver def get_index_forest(self): """ diff --git a/sefaria/model/topic.py b/sefaria/model/topic.py index 57a04fd70b..b965dbcaf5 100644 --- a/sefaria/model/topic.py +++ b/sefaria/model/topic.py @@ -380,7 +380,7 @@ def get_primary_title(self, lang='en', with_disambiguation=True): if disambig_text: title += f' ({disambig_text})' elif getattr(self, 'isAmbiguous', False) and len(title) > 0: - title += ' (Ambiguous)' + title += ' [Ambiguous]' return title def get_titles(self, lang=None, with_disambiguation=True): @@ -835,6 +835,21 @@ def _normalize(self): self.ref = Ref(self.ref).normal() self.expandedRefs = [r.normal() for r in Ref(self.ref).all_segment_refs()] + def _sanitize(self): + """ + Sanitize the "title" and "prompt" for all descriptions. + Since they're human editable they are candidates for XSS. + @return: + """ + for lang in ("en", "he"): + description = getattr(self, "descriptions", {}).get(lang) + if description: + for field in ("title", "prompt"): + value = description.get(field) + if value: + description[field] = bleach.clean(value, tags=self.ALLOWED_TAGS, attributes=self.ALLOWED_ATTRS) + self.descriptions[lang] = description + def _validate(self): Topic.validate_slug_exists(self.toTopic) TopicLinkType.validate_slug_exists(self.linkType, 0) diff --git a/sefaria/model/trend.py b/sefaria/model/trend.py index ed8518287a..c2d9b8306d 100644 --- a/sefaria/model/trend.py +++ b/sefaria/model/trend.py @@ -7,8 +7,6 @@ import time from datetime import datetime, date, timedelta -from py import process - from . import abstract as abst from . import user_profile from . import text diff --git a/sefaria/model/webpage.py b/sefaria/model/webpage.py index 5be91223ca..10e2c00300 100644 --- a/sefaria/model/webpage.py +++ b/sefaria/model/webpage.py @@ -113,7 +113,7 @@ def normalize_url(url): "remove url params": lambda url: re.sub(r"\?.+", "", url), "remove utm params": lambda url: re.sub(r"\?utm_.+", "", url), "remove fbclid param": lambda url: re.sub(r"\?fbclid=.+", "", url), - "remove www": lambda url: re.sub(r"^(https?://)www\.", r"\1", url), + "remove www": lambda url: re.sub(r"^(https?://)?www\.", r"\1", url), "remove mediawiki params": lambda url: re.sub(r"&.+", "", url), "remove sort param": lambda url: re.sub(r"\?sort=.+", "", url), "remove all params after id": lambda url: re.sub(r"(\?id=\d+).+$", r"\1", url) diff --git a/sefaria/spacy_function_registry.py b/sefaria/spacy_function_registry.py index 34a8366744..a4a86049c0 100644 --- a/sefaria/spacy_function_registry.py +++ b/sefaria/spacy_function_registry.py @@ -5,7 +5,7 @@ def inner_punct_tokenizer_factory(): def inner_punct_tokenizer(nlp): # infix_re = spacy.util.compile_infix_regex(nlp.Defaults.infixes) - infix_re = re.compile(r'''[\.\,\?\:\;…\‘\’\`\“\”\"\'~\–\-/\(\)]''') + infix_re = re.compile(r'''[.,?!:;…‘’`“”"'~–—\-‐‑‒־―⸺⸻/()<>]''') prefix_re = spacy.util.compile_prefix_regex(nlp.Defaults.prefixes) suffix_re = spacy.util.compile_suffix_regex(nlp.Defaults.suffixes) diff --git a/sefaria/system/middleware.py b/sefaria/system/middleware.py index 8cdc458ac6..0e67309ded 100644 --- a/sefaria/system/middleware.py +++ b/sefaria/system/middleware.py @@ -172,7 +172,12 @@ def current_domain_lang(request): class CORSDebugMiddleware(MiddlewareMixin): def process_response(self, request, response): - if DEBUG: + """ + CORS headers are normally added in nginx response. 
+ However, nginx isn't normally running when debugging with localhost + """ + origin = request.get_host() + if ('localhost' in origin or '127.0.0.1' in origin) and DEBUG: response["Access-Control-Allow-Origin"] = "*" response["Access-Control-Allow-Methods"] = "POST, GET" response["Access-Control-Allow-Headers"] = "*" diff --git a/sefaria/urls.py b/sefaria/urls.py index b782760b87..cd467b760e 100644 --- a/sefaria/urls.py +++ b/sefaria/urls.py @@ -7,7 +7,7 @@ from django.http import HttpResponseRedirect import django.contrib.auth.views as django_auth_views from sefaria.forms import SefariaPasswordResetForm, SefariaSetPasswordForm, SefariaLoginForm -from sefaria.settings import DOWN_FOR_MAINTENANCE, STATIC_URL +from sefaria.settings import DOWN_FOR_MAINTENANCE, STATIC_URL, ADMIN_PATH import reader.views as reader_views import sefaria.views as sefaria_views @@ -438,7 +438,7 @@ url(r'^admin/descriptions/authors/update', sefaria_views.update_authors_from_sheet), url(r'^admin/descriptions/categories/update', sefaria_views.update_categories_from_sheet), url(r'^admin/descriptions/texts/update', sefaria_views.update_texts_from_sheet), - url(r'^admin/?', include(admin.site.urls)), + url(fr'^{ADMIN_PATH}/?', include(admin.site.urls)), ] # Stats API - return CSV diff --git a/sefaria/utils/tests/util_test.py b/sefaria/utils/tests/util_test.py index 2c3e73f4e9..70992adab0 100644 --- a/sefaria/utils/tests/util_test.py +++ b/sefaria/utils/tests/util_test.py @@ -59,3 +59,10 @@ def test_string_length_equals_max(self): max_length = 24 expected_output = "string with length of 24" assert truncate_string(string, min_length, max_length) == expected_output + + def test_long_string_with_html_closing_tag_after_max_length(self): + string = 'This is a long string aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa a' + min_length = 10 + max_length = 22 + expected_output = "This is a long string…" + assert truncate_string(string, min_length, max_length) == expected_output diff --git a/sefaria/utils/util.py b/sefaria/utils/util.py index 28cd48ada1..dd12be8e33 100644 --- a/sefaria/utils/util.py +++ b/sefaria/utils/util.py @@ -472,7 +472,7 @@ def truncate_string(string, min_length, max_length): while min_length <= pos: while pos in html_element_indices: pos = html_element_indices[pos] - 1 - if string[pos] == break_char: + if string[pos] == break_char and pos <= max_length: return string[:pos] + "…" pos -= 1 return string diff --git a/sefaria/views.py b/sefaria/views.py index 76dd901c98..ba1d880f6e 100644 --- a/sefaria/views.py +++ b/sefaria/views.py @@ -343,6 +343,7 @@ def find_refs_api(request): @api_view(["GET"]) def websites_api(request, domain): cb = request.GET.get("callback", None) + domain = WebPage.normalize_url(domain) website = WebSite().load({"domains": domain}) if website is None: return jsonResponse({"error": f"no website found with domain: '{domain}'"}) diff --git a/sites/sefaria/urls.py b/sites/sefaria/urls.py index 4753abb7fb..d9c48c6436 100644 --- a/sites/sefaria/urls.py +++ b/sites/sefaria/urls.py @@ -80,8 +80,8 @@ # Redirects to Wikis etc site_urlpatterns += [ - url(r'^donate/mobile?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/' if x.interfaceLang == 'english' else 'https://donate.sefaria.org/he?c_src=mobile-app')), - url(r'^donate/?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/' if x.interfaceLang == 'english' else 'https://donate.sefaria.org/he')), + url(r'^donate/mobile?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/english?c_src=App' if x.interfaceLang == 
'english' else 'https://donate.sefaria.org/he?c_src=App')), + url(r'^donate/?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/english' if x.interfaceLang == 'english' else 'https://donate.sefaria.org/he')), url(r'^wiki/?$', lambda x: HttpResponseRedirect('https://github.com/Sefaria/Sefaria-Project/wiki')), url(r'^developers/?$', lambda x: HttpResponseRedirect('https://github.com/Sefaria/Sefaria-Project/wiki#developers')), url(r'^request-a-text/?$', lambda x: HttpResponseRedirect('https://goo.gl/forms/ru33ivawo7EllQxa2')), @@ -90,7 +90,7 @@ url(r'^faq/?$', lambda x: HttpResponseRedirect('/collections/sefaria-faqs' if x.interfaceLang == 'english' else '/collections/%D7%A9%D7%90%D7%9C%D7%95%D7%AA-%D7%A0%D7%A4%D7%95%D7%A6%D7%95%D7%AA-%D7%91%D7%A1%D7%A4%D7%A8%D7%99%D7%90')), url(r'^help/?$', lambda x: HttpResponseRedirect('/collections/sefaria-faqs' if x.interfaceLang == 'english' else '/collections/%D7%A9%D7%90%D7%9C%D7%95%D7%AA-%D7%A0%D7%A4%D7%95%D7%A6%D7%95%D7%AA-%D7%91%D7%A1%D7%A4%D7%A8%D7%99%D7%90')), url(r'^gala/?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/event/sefarias-10-year-anniversary-gala/e486954')), - url(r'^give/(?P[a-zA-Z0-9]+)/?$', lambda x, campaign_id: HttpResponseRedirect(f'https://donate.sefaria.org/give/550774/#!/donation/checkout?c_src={campaign_id}')), + url(r'^give/(?P[a-zA-Z0-9]+)/?$', lambda x, channel_source: HttpResponseRedirect(f'https://donate.sefaria.org/give/550774/#!/donation/checkout?c_src={channel_source}')), url(r'^give/?$', lambda x: HttpResponseRedirect(f'https://donate.sefaria.org/give/550774/#!/donation/checkout?c_src=mu')), url(r'^giving/?$', lambda x: HttpResponseRedirect('https://donate.sefaria.org/give/524771/#!/donation/checkout')), url(r'^jfn?$', lambda x: HttpResponseRedirect('https://www.sefaria.org/sheets/60494')), diff --git a/static/css/s2.css b/static/css/s2.css index 22a0439703..d7be73e9df 100644 --- a/static/css/s2.css +++ b/static/css/s2.css @@ -5153,6 +5153,10 @@ body .ui-autocomplete.dictionary-toc-autocomplete .ui-menu-item a.ui-state-focus .readerOptions .int-he img { height: 18px; } +.rightButtons .readerOptionsTooltip.tooltip-toggle::before { + top: 47px; + left: -50px; +} .rightButtons .readerOptions { vertical-align: middle; } diff --git a/static/css/static.css b/static/css/static.css index 3ba84ceb05..5dc57ffe1f 100644 --- a/static/css/static.css +++ b/static/css/static.css @@ -814,6 +814,17 @@ p.registration-links a:hover{ width: 200px; content: ""; } + +@media (max-width: 450px) { + #teamPage .team-members { + justify-content: center; + } + + #teamPage .team-members::after { + width: auto !important; + } +} + #teamPage .teamMember { flex: 0 0 30%; } @@ -3663,4 +3674,4 @@ form.globalUpdateForm + div.notificationsList { } .about.section { padding-top: 40px; -} \ No newline at end of file +} diff --git a/static/js/Autocomplete.jsx b/static/js/Autocomplete.jsx index 1bdb9bc916..06cfac82d2 100644 --- a/static/js/Autocomplete.jsx +++ b/static/js/Autocomplete.jsx @@ -174,31 +174,37 @@ const EntitySearchSuggestion = ({label, onClick, type, url, ...props}) => { ); } -const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebrewKeyboard, +const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebrewKeyboard, setInputValue, setSearchFocused, searchFocused, submitSearch, redirectToObject}) => { + const getInputValue = () =>{ + return otherDownShiftProps.value || getVirtualKeyboardInputValue(); + } + const getVirtualKeyboardInputValue = () =>{ + return 
document.querySelector('#searchBox .keyboardInput').value; + } useEffect(() => { showVirtualKeyboardIcon(false); // Initially hide the virtual keyboard icon }, []); const { onBlur, onKeyDown, ...otherDownShiftProps } = getInputProps(); const handleSearchKeyDown = (event) => { - onKeyDown(event) + onKeyDown(event); if (event.keyCode !== 13) return; const highlightedItem = highlightedIndex > -1 ? suggestions[highlightedIndex] : null if (highlightedItem && highlightedItem.type != 'search'){ redirectToObject(highlightedItem); return; } - const inputQuery = otherDownShiftProps.value + const inputQuery = getInputValue(); if (!inputQuery) return; submitSearch(inputQuery); }; const handleSearchButtonClick = (event) => { - const inputQuery = otherDownShiftProps.value + const inputQuery = getInputValue(); if (inputQuery) { submitSearch(inputQuery); } else { @@ -224,12 +230,14 @@ const SearchInputBox = ({getInputProps, suggestions, highlightedIndex, hideHebre const blurSearch = (e) => { onBlur(e); + const oldValue = getVirtualKeyboardInputValue(); const parent = document.getElementById('searchBox'); if (!parent.contains(e.relatedTarget) && !document.getElementById('keyboardInputMaster')) { // debug: comment out the following line: setSearchFocused(false); showVirtualKeyboardIcon(false); } + !document.getElementById('keyboardInputMaster') && setInputValue(oldValue) }; const inputClasses = classNames({ @@ -353,6 +361,7 @@ const SuggestionsGroup = ({ suggestions, initialIndexForGroup, getItemProps, hig getInputProps, getItemProps, highlightedIndex, + setInputValue } = useCombobox({ items: suggestions, itemToString: (item) => (item ? item.name : ''), @@ -465,6 +474,7 @@ const SuggestionsGroup = ({ suggestions, initialIndexForGroup, getItemProps, hig suggestions={suggestions} hideHebrewKeyboard={hideHebrewKeyboard} highlightedIndex={highlightedIndex} + setInputValue={setInputValue} setSearchFocused={setSearchFocused} searchFocused={searchFocused} diff --git a/static/js/BookPage.jsx b/static/js/BookPage.jsx index be769caa79..b91af7dc06 100644 --- a/static/js/BookPage.jsx +++ b/static/js/BookPage.jsx @@ -11,7 +11,7 @@ import { AdminToolHeader, CategoryChooser, TitleVariants, - CategoryHeader, requestWithCallBack + CategoryHeader } from './Misc'; import {ContentText} from "./ContentText"; import {validateMarkdownLinks} from "./AdminEditor"; @@ -803,8 +803,8 @@ class JaggedArrayNodeSection extends Component { if (this.contentCountIsEmpty(contentCounts[i])) { continue; } let [section, heSection] = Sefaria.getSectionStringByAddressType(this.props.addressTypes[0], i, this.props.offset); let ref = (this.props.refPath + ":" + section).replace(":", " ") + this.refPathTerminal(contentCounts[i]); - let currentPlace = ref == this.props?.currentlyVisibleSectionRef || ref == this.props?.currentlyVisibleRef || Sefaria.refContains(this.props?.currentlyVisibleSectionRef, ref); //the second clause is for depth 1 texts - const linkClasses = classNames({"sectionLink": 1, "current": currentPlace}); + let currentPlace = ref == this.props?.currentlyVisibleSectionRef || ref == this.props?.currentlyVisibleRef || (Sefaria.refContains(this.props?.currentlyVisibleSectionRef, ref) && this.props.depth > 1); //the second clause is for depth 1 texts + const linkClasses = classNames({"sectionLink": 1, "current": currentPlace}); let link = ( @@ -1293,7 +1293,8 @@ const EditTextInfo = function({initTitle, close}) { const deleteObj = () => { setSavingStatus(true); const url = `/api/v2/index/${enTitle}`; - requestWithCallBack({url, type: 
"DELETE", redirect: () => window.location.href = `/texts`}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = '/texts'); } const renderCollectiveTitle = () => { if (!creatingCollectiveTitle) { diff --git a/static/js/CategoryEditor.jsx b/static/js/CategoryEditor.jsx index 61f5b2c80e..f7636786aa 100644 --- a/static/js/CategoryEditor.jsx +++ b/static/js/CategoryEditor.jsx @@ -2,7 +2,7 @@ import {CategoryChooser, InterfaceText, ToggleSet} from "./Misc"; import Sefaria from "./sefaria/sefaria"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; -import {requestWithCallBack, AdminToolHeader} from "./Misc"; +import {AdminToolHeader} from "./Misc"; import React, {useState, useRef} from "react"; const displayOptionForSources = (child) => { @@ -84,7 +84,9 @@ const ReorderEditor = ({close, type="", postURL="", redirect="", origItems = []} else if (type === 'sources') { postCategoryData = {sources: tocItems}; } - requestWithCallBack({url: postURL, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = redirect}) + Sefaria.adminEditorApiRequest(postURL, null, postCategoryData) + .then(() => window.location.href = redirect) + .finally(() => setSavingStatus(false)); } return
@@ -187,7 +189,9 @@ const CategoryEditor = ({origData={}, close, origPath=[]}) => { if (urlParams.length > 0) { url += `?${urlParams.join('&')}`; } - requestWithCallBack({url, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = "/texts/"+fullPath}); + Sefaria.adminEditorApiRequest(url, null, postCategoryData) + .then(() => window.location.href = "/texts/"+fullPath) + .finally(() => setSavingStatus(false)); } @@ -197,7 +201,8 @@ const CategoryEditor = ({origData={}, close, origPath=[]}) => { return; } const url = `/api/category/${origPath.concat(origData.origEn).join("/")}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = `/texts`}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = `/texts`); } const primaryOptions = [ {name: "true", content: Sefaria._("True"), role: "radio", ariaLabel: Sefaria._("Set Primary Status to True") }, diff --git a/static/js/Misc.jsx b/static/js/Misc.jsx index 409e0f4558..6b9d179af9 100644 --- a/static/js/Misc.jsx +++ b/static/js/Misc.jsx @@ -1045,29 +1045,6 @@ class ToggleOption extends Component { } } - //style={this.props.style} - -const requestWithCallBack = ({url, setSavingStatus, redirect, type="POST", data={}, redirect_params}) => { - let ajaxPayload = {url, type}; - if (type === "POST") { - ajaxPayload.data = {json: JSON.stringify(data)}; - } - $.ajax({ - ...ajaxPayload, - success: function(result) { - if ("error" in result) { - if (setSavingStatus) { - setSavingStatus(false); - } - alert(result.error); - } else { - redirect(); - } - } - }).fail(function() { - alert(Sefaria._("Something went wrong. Sorry!")); - }); -} const TopicToCategorySlug = function(topic, category=null) { //helper function for AdminEditor @@ -1177,7 +1154,7 @@ const ReorderEditorWrapper = ({toggle, type, data}) => { return []; } // a topic can be connected to refs in one language and not in another so filter out those that are not in current interface lang - refs = refs.filter((x) => !x.is_sheet && x?.order?.availableLangs?.includes(Sefaria.interfaceLang.slice(0, 2))); + refs = refs.filter((x) => !x.is_sheet); // then sort the refs and take only first 30 sources because admins don't want to reorder hundreds of sources return refs.sort((a, b) => refSort('relevance', [a.ref, a], [b.ref, b])).slice(0, 30); } @@ -1186,7 +1163,7 @@ const ReorderEditorWrapper = ({toggle, type, data}) => { return { url: `/api/source/reorder?topic=${data.slug}&lang=${Sefaria.interfaceLang}`, redirect: `/topics/${data.slug}`, - origItems: _filterAndSortRefs(data.refs?.about?.refs) || [], + origItems: _filterAndSortRefs(data.tabs?.sources?.refs) || [], } } switch (type) { // at /texts or /topics @@ -1216,7 +1193,10 @@ const ReorderEditorWrapper = ({toggle, type, data}) => { const EditorForExistingTopic = ({ toggle, data }) => { const prepAltTitles = (lang) => { // necessary for use with TitleVariants component - return data.titles.filter(x => !x.primary && x.lang === lang).map((item, i) => ({["name"]: item.text, ["id"]: i})) + return data.titles.filter(x => !x.primary && x.lang === lang).map((item, i) => ({ + name: item.disambiguation ? `${item.text} (${item.disambiguation})` : item.text, + id: i + })) } const initCatSlug = TopicToCategorySlug(data); const origData = { @@ -1349,6 +1329,8 @@ class DisplaySettingsButton extends Component { render() { let style = this.props.placeholder ? 
{visibility: "hidden"} : {}; let icon; + const altText = Sefaria._('Text display options') + const classes = "readerOptionsTooltip tooltip-toggle"; if (Sefaria._siteSettings.TORAH_SPECIFIC) { icon = @@ -1359,17 +1341,21 @@ class DisplaySettingsButton extends Component { } else { icon = Aa; } - return ( - {icon} - ); + return ( + + + {icon} + + + ); } } DisplaySettingsButton.propTypes = { @@ -1670,7 +1656,7 @@ const TopicPictureUploader = ({slug, callback, old_filename, caption}) => { const deleteImage = () => { const old_filename_wout_url = old_filename.split("/").slice(-1); const url = `${Sefaria.apiHost}/api/topics/images/${slug}?old_filename=${old_filename_wout_url}`; - requestWithCallBack({url, type: "DELETE", redirect: () => alert("Deleted image.")}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE").then(() => alert("Deleted image.")); callback(""); fileInput.current.value = ""; } @@ -3379,7 +3365,6 @@ export { AdminToolHeader, CategoryChooser, TitleVariants, - requestWithCallBack, OnInView, TopicPictureUploader, ImageWithCaption diff --git a/static/js/Promotions.jsx b/static/js/Promotions.jsx index 508fb24cf9..ac1a489110 100644 --- a/static/js/Promotions.jsx +++ b/static/js/Promotions.jsx @@ -11,7 +11,8 @@ const Promotions = () => { const context = useContext(AdContext); const strapi = useContext(StrapiDataContext); useEffect(() => { - if (strapi.dataFromStrapiHasBeenReceived) { + // Disable Strapi for Sidebar Ads during Unbounce trial + if (false && strapi.dataFromStrapiHasBeenReceived) { Sefaria._inAppAds = []; const sidebarAds = strapi.strapiData?.sidebarAds?.data; diff --git a/static/js/ReaderApp.jsx b/static/js/ReaderApp.jsx index b1ee4e29aa..51707d9c6e 100644 --- a/static/js/ReaderApp.jsx +++ b/static/js/ReaderApp.jsx @@ -282,8 +282,14 @@ class ReaderApp extends Component { } else { state.panels = []; } - this.setState(state, () => { - if (state.scrollPosition) { + + // need to clone state and panels; if we don't clone them, when we run setState, it will make it so that + // this.state.panels refers to the same object as history.state.panels, which cause back button bugs + const newState = {...state}; + newState.panels = newState.panels.map(panel => this.clonePanel(panel)); + + this.setState(newState, () => { + if (newState.scrollPosition) { $(".content").scrollTop(event.state.scrollPosition) .trigger("scroll"); } @@ -2242,23 +2248,17 @@ toggleSignUpModal(modalContentKind = SignUpModalKind.Default) { var classes = classNames(classDict); return ( - // The Strapi context is put at the highest level of scope so any component or children within ReaderApp can use the static content received - // InterruptingMessage modals and Banners will always render if available but stay hidden initially - - -
- - -
- {header} - {panels} - {signUpModal} - {communityPagePreviewControls} - -
+ +
+
+ {header} + {panels} + {signUpModal} + {communityPagePreviewControls} +
- - +
+
); } } diff --git a/static/js/SourceEditor.jsx b/static/js/SourceEditor.jsx index e95cef8588..1755c74efe 100644 --- a/static/js/SourceEditor.jsx +++ b/static/js/SourceEditor.jsx @@ -1,7 +1,7 @@ import Sefaria from "./sefaria/sefaria"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; -import {requestWithCallBack, Autocompleter, InterfaceText} from "./Misc"; +import {Autocompleter, InterfaceText} from "./Misc"; import React, {useState} from "react"; import {useRef} from "react"; @@ -46,10 +46,16 @@ const SourceEditor = ({topic, close, origData={}}) => { const save = async function () { setSavingStatus(true); let refInUrl = isNew ? displayRef : origData.ref; - let url = `/api/ref-topic-links/${Sefaria.normRef(refInUrl)}`; - let postData = {"topic": topic, "is_new": isNew, 'new_ref': displayRef, 'interface_lang': Sefaria.interfaceLang}; - postData['description'] = {"title": data.enTitle, "prompt": data.prompt, "ai_context": data.ai_context, "review_state": "edited"}; - requestWithCallBack({url, data: postData, setSavingStatus, redirect: () => window.location.href = "/topics/"+topic}); + const payload = { + new_ref: displayRef, + topic, + is_new: isNew, + interface_lang: Sefaria.interfaceLang, + description: {"title": data.enTitle, "prompt": data.prompt, "ai_context": data.ai_context, "review_state": "edited"}, + } + Sefaria.postRefTopicLink(refInUrl, payload) + .then(() => window.location.href = `/topics/${topic}`) + .finally(() => setSavingStatus(false)); } const handleChange = (x) => { @@ -86,8 +92,9 @@ const SourceEditor = ({topic, close, origData={}}) => { } const deleteTopicSource = function() { - const url = `/api/ref-topic-links/${origData.ref}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = `/topics/${topic}`}); + const url = `/api/ref-topic-links/${Sefaria.normRef(origData.ref)}?topic=${topic}&interface_lang=${Sefaria.interfaceLang}`; + Sefaria.adminEditorApiRequest(url, null, null, "DELETE") + .then(() => window.location.href = `/topics/${topic}`); } const previousTitleItemRef = useRef(data.enTitle ? "Previous Title" : null); //use useRef to make value null even if component re-renders const previousPromptItemRef = useRef(data.prompt ? 
"Previous Prompt" : null); diff --git a/static/js/StaticPages.jsx b/static/js/StaticPages.jsx index 2dcbc3d63f..bd59a652ba 100644 --- a/static/js/StaticPages.jsx +++ b/static/js/StaticPages.jsx @@ -4,6 +4,8 @@ import { TwoOrThreeBox, ResponsiveNBox, NBox, InterfaceText, + LoadingMessage, + LoadingRing, } from './Misc'; import {NewsletterSignUpForm} from "./NewsletterSignUpForm"; import palette from './sefaria/palette'; @@ -1435,8 +1437,8 @@ const DonatePage = () => ( heText="" enButtonText="Donate Now" heButtonText="" - enButtonUrl="https://donate.sefaria.org/" - heButtonUrl="https://donate.sefaria.org/he" + enButtonUrl="https://donate.sefaria.org/english?c_src=waystogive" + heButtonUrl="https://donate.sefaria.org/he?c_src=waystogive" borderColor="#004E5F" />, ( heText="" enButtonText="Join the Sustainers" heButtonText="" - enButtonUrl="https://donate.sefaria.org/sustainers" - heButtonUrl="https://donate.sefaria.org/sustainershe" + enButtonUrl="https://donate.sefaria.org/sustainers?c_src=waystogive" + heButtonUrl="https://donate.sefaria.org/sustainershe?c_src=waystogive" borderColor="#97B386" />, ( heText="" enButtonText="Sponsor a Day of Learning" heButtonText="" - enButtonUrl="https://donate.sefaria.org/sponsor" - heButtonUrl="https://donate.sefaria.org/sponsorhe" + enButtonUrl="https://donate.sefaria.org/sponsor?c_src=waystogive" + heButtonUrl="https://donate.sefaria.org/sponsorhe?c_src=waystogive" borderColor="#4B71B7" />, ( heText="" enButtonText="Join Now or Learn More" heButtonText="" - enButtonUrl="https://donate.sefaria.org/campaign/giving-circles/c557214" + enButtonUrl="https://donate.sefaria.org/campaign/giving-circles/c557214?c_src=waystogive" heButtonUrl="" borderColor="#7C416F" /> @@ -1491,7 +1493,7 @@ const DonatePage = () => ( , @@ -1603,7 +1605,7 @@ const DonatePage = () => ( @@ -3021,6 +3023,7 @@ const NoJobsNotice = () => { const JobsPage = memo(() => { const [groupedJobPostings, setGroupedJobPostings] = useState({}); const [error, setError] = useState(null); + const [loading, setLoading] = useState(true); const fetchJobsJSON = async () => { const currentDateTime = new Date().toISOString(); @@ -3069,6 +3072,7 @@ const JobsPage = memo(() => { }; const loadJobPostings = async () => { + setLoading(true); if (typeof STRAPI_INSTANCE !== "undefined" && STRAPI_INSTANCE) { try { const jobsData = await fetchJobsJSON(); @@ -3102,20 +3106,27 @@ const JobsPage = memo(() => { } else { setError("Error: Sefaria's CMS cannot be reached"); } + setLoading(false); }; useEffect(() => { loadJobPostings(); }, []); + const jobsAvailable = Object.keys(groupedJobPostings)?.length; return (
{error ? (

{error}

+ ) : loading ? ( + <> + + + ) : ( <> - - {Object.keys(groupedJobPostings)?.length ? ( + + {jobsAvailable ? ( ) : ( diff --git a/static/js/Story.jsx b/static/js/Story.jsx index 12bf3df64c..8aaacc0d89 100644 --- a/static/js/Story.jsx +++ b/static/js/Story.jsx @@ -170,14 +170,14 @@ const ReviewStateIndicatorLang = ({reviewState, markReviewed}) => { } const markReviewedPostRequest = (lang, topic, topicLink) => { - const postData = { + const payload = { "topic": topic, "is_new": false, 'new_ref': topicLink.ref, 'interface_lang': lang === 'en' ? 'english' : 'hebrew', 'description' : {...topicLink.descriptions[lang], 'review_state': 'reviewed'} }; - return Sefaria.postToApi(`/api/ref-topic-links/${topicLink.ref}`, {}, postData); + return Sefaria.postRefTopicLink(topicLink.ref, payload); } const useReviewState = (topic, topicLink) => { diff --git a/static/js/TopicEditor.jsx b/static/js/TopicEditor.jsx index a916a71685..9680617df1 100644 --- a/static/js/TopicEditor.jsx +++ b/static/js/TopicEditor.jsx @@ -1,5 +1,5 @@ import Sefaria from "./sefaria/sefaria"; -import {InterfaceText, requestWithCallBack, TopicPictureUploader} from "./Misc"; +import {InterfaceText, TopicPictureUploader} from "./Misc"; import $ from "./sefaria/sefariaJquery"; import {AdminEditor} from "./AdminEditor"; import {Reorder} from "./CategoryEditor"; @@ -35,6 +35,8 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const [isChanged, setIsChanged] = useState(false); const [changedPicture, setChangedPicture] = useState(false); + const disambiguationExtractionRegex = /\((.+)\)$/; + const toggle = function() { setSavingStatus(savingStatus => !savingStatus); } @@ -93,12 +95,12 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { alert(Sefaria._("Title must be provided.")); return false; } - if (data.enImgCaption.length > 150) { - alert("English caption is too long. It should not be more than 150 characters"); + if (data.enImgCaption.length > 300) { + alert("English caption is too long. It should not be more than 300 characters"); return false; } - if (data.heImgCaption.length > 150) { - alert("Hebrew caption is too long. It should not be more than 150 characters") + if (data.heImgCaption.length > 300) { + alert("Hebrew caption is too long. 
It should not be more than 300 characters") return false; } if (sortedSubtopics.length > 0 && !isNew) { @@ -109,14 +111,44 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const saveReorderedSubtopics = function () { const url = `/api/topic/reorder`; const postCategoryData = {topics: sortedSubtopics}; - requestWithCallBack({url, data: postCategoryData, setSavingStatus, redirect: () => window.location.href = "/topics"}); + Sefaria.adminEditorApiRequest(url, null, postCategoryData) + .then(() => window.location.href = "/topics") + .finally(() => setSavingStatus(false)); } + const extractDisambiguationFromTitle = function(titleText){ + return titleText.match(disambiguationExtractionRegex)?.[1]; + } + const removeDisambiguationFromTitle = function(titleText){ + return titleText.replace(disambiguationExtractionRegex, "").trimEnd(); + } + + const createPrimaryTitleObj = function(rawTitle, lang){ + let primaryTitleObj = {'text': removeDisambiguationFromTitle(rawTitle), "lang": lang, "primary": true}; + let disambiguation = extractDisambiguationFromTitle(rawTitle); + if (disambiguation) {primaryTitleObj["disambiguation"]=disambiguation}; + return primaryTitleObj; + }; + const createNonPrimaryTitleObjArray = function(altTitles, lang){ + const titleObjArray = [] + altTitles.forEach((title) => { + let titleObj = {'text': removeDisambiguationFromTitle(title), "lang": lang}; + let disambiguation = extractDisambiguationFromTitle(title); + if (disambiguation) {titleObj["disambiguation"]=disambiguation} + titleObjArray.push(titleObj) + }); + return titleObjArray + }; + const prepData = () => { // always add category, title, heTitle, altTitles - let postData = { category: data.catSlug, title: data.enTitle, heTitle: data.heTitle, altTitles: {}}; - postData.altTitles.en = data.enAltTitles.map(x => x.name); // alt titles implemented using TitleVariants which contains list of objects with 'name' property. 
- postData.altTitles.he = data.heAltTitles.map(x => x.name); + let postData = { category: data.catSlug, titles: []}; + + //convert title and altTitles to the database format, including extraction of disambiguation from title string + postData['titles'].push(createPrimaryTitleObj(data.enTitle, 'en')); + postData['titles'].push(createPrimaryTitleObj(data.heTitle, 'he')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.enAltTitles.map(x => x.name), 'en')); + postData['titles'] = postData['titles'].concat(createNonPrimaryTitleObjArray(data.heAltTitles.map(x => x.name), 'he')); // add image if image or caption changed const origImageURI = origData?.origImage?.image_uri || ""; @@ -189,7 +221,7 @@ const TopicEditor = ({origData, onCreateSuccess, close, origWasCat}) => { const deleteObj = function() { const url = `/api/topic/delete/${data.origSlug}`; - requestWithCallBack({url, type: "DELETE", redirect: () => window.location.href = "/topics"}); + Sefaria.adminEditorApiRequest(url, null, null, "DELETE").then(() => window.location.href = "/topics"); } let items = ["Title", "Hebrew Title", "English Description", "Hebrew Description", "Category Menu", "English Alternate Titles", "Hebrew Alternate Titles",]; if (isCategory) { diff --git a/static/js/TopicPage.jsx b/static/js/TopicPage.jsx index 51686b1a37..0d25a0dac8 100644 --- a/static/js/TopicPage.jsx +++ b/static/js/TopicPage.jsx @@ -104,8 +104,6 @@ const refSort = (currSortOption, a, b) => { return a.order.comp_date - b.order.comp_date; } else { - const aAvailLangs = a.order.availableLangs; - const bAvailLangs = b.order.availableLangs; if ((Sefaria.interfaceLang === 'english') && (a.order.curatedPrimacy.en > 0 || b.order.curatedPrimacy.en > 0)) { return b.order.curatedPrimacy.en - a.order.curatedPrimacy.en; } @@ -113,12 +111,13 @@ const refSort = (currSortOption, a, b) => { (a.order.curatedPrimacy.he > 0 || b.order.curatedPrimacy.he > 0)) { return b.order.curatedPrimacy.he - a.order.curatedPrimacy.he; } + const aAvailLangs = a.order.availableLangs; + const bAvailLangs = b.order.availableLangs; if (Sefaria.interfaceLang === 'english' && aAvailLangs.length !== bAvailLangs.length) { if (aAvailLangs.indexOf('en') > -1) { return -1; } if (bAvailLangs.indexOf('en') > -1) { return 1; } return 0; } - else if (a.order.custom_order !== b.order.custom_order) { return b.order.custom_order - a.order.custom_order; } // custom_order, when present, should trump other data else if (a.order.pr !== b.order.pr) { return b.order.pr - a.order.pr; } @@ -345,7 +344,7 @@ const generatePrompts = async(topicSlug, linksToGenerate) => { }); const payload = {ref_topic_links: linksToGenerate}; try { - await Sefaria.postToApi(`/api/topics/generate-prompts/${topicSlug}`, {}, payload); + await Sefaria.apiRequestWithBody(`/api/topics/generate-prompts/${topicSlug}`, {}, payload); const refValues = linksToGenerate.map(item => item.ref).join(", "); alert("The following prompts are generating: " + refValues); } catch (error) { @@ -360,7 +359,7 @@ const publishPrompts = async (topicSlug, linksToPublish) => { ref.descriptions[lang]["published"] = true; }); try { - const response = await Sefaria.postToApi(`/api/ref-topic-links/bulk`, {}, linksToPublish); + const response = await Sefaria.apiRequestWithBody(`/api/ref-topic-links/bulk`, {}, linksToPublish); const refValues = response.map(item => item.anchorRef).join(", "); const shouldRefresh = confirm("The following prompts have been published: " + refValues + ". 
Refresh page to see results?"); if (shouldRefresh) { diff --git a/static/js/TopicSearch.jsx b/static/js/TopicSearch.jsx index c736cc72cf..6f4578805d 100644 --- a/static/js/TopicSearch.jsx +++ b/static/js/TopicSearch.jsx @@ -64,31 +64,25 @@ class TopicSearch extends Component { } post(slug) { - const postJSON = JSON.stringify({"topic": slug, 'interface_lang': Sefaria.interfaceLang}); + const postJSON = {"topic": slug, 'interface_lang': Sefaria.interfaceLang}; const srefs = this.props.srefs; const update = this.props.update; const reset = this.reset; - $.post("/api/ref-topic-links/" + Sefaria.normRef(this.props.srefs), {"json": postJSON}, async function (data) { - if (data.error) { - alert(data.error); - } else { + Sefaria.postRefTopicLink(Sefaria.normRef(this.props.srefs), postJSON).then(async () => { const sectionRef = await Sefaria.getRef(Sefaria.normRef(srefs)).sectionRef; srefs.map(sref => { - if (!Sefaria._refTopicLinks[sref]) { - Sefaria._refTopicLinks[sref] = []; - } - Sefaria._refTopicLinks[sref].push(data); + if (!Sefaria._refTopicLinks[sref]) { + Sefaria._refTopicLinks[sref] = []; + } + Sefaria._refTopicLinks[sref].push(data); }); if (!Sefaria._refTopicLinks[sectionRef]) { - Sefaria._refTopicLinks[sectionRef] = []; + Sefaria._refTopicLinks[sectionRef] = []; } Sefaria._refTopicLinks[sectionRef].push(data); update(); reset(); alert("Topic added."); - } - }).fail(function (xhr, status, errorThrown) { - alert("Unfortunately, there may have been an error saving this topic information: " + errorThrown); }); } diff --git a/static/js/context.js b/static/js/context.js index 8c9f1f03f4..9b9d0778bf 100644 --- a/static/js/context.js +++ b/static/js/context.js @@ -19,7 +19,8 @@ function StrapiDataProvider({ children }) { const [modal, setModal] = useState(null); const [banner, setBanner] = useState(null); useEffect(() => { - if (STRAPI_INSTANCE) { + // Disable Strapi API calls during Unbounce trial + if (false && typeof STRAPI_INSTANCE !== "undefined" && STRAPI_INSTANCE) { const getStrapiData = async () => { let getDateWithoutTime = (date) => date.toISOString().split("T")[0]; let getJSONDateStringInLocalTimeZone = (date) => { diff --git a/static/js/lib/keyboard.js b/static/js/lib/keyboard.js index c90c35f00c..0f3cc4353a 100644 --- a/static/js/lib/keyboard.js +++ b/static/js/lib/keyboard.js @@ -1486,6 +1486,8 @@ var VKI_attach, VKI_close; break; case "Enter": VKI_addListener(td, 'click', function() { + let element = document.querySelector('[vki_attached="true"]'); + element.dispatchEvent(new KeyboardEvent('keydown', {key: 'Enter', code: 'Enter', keyCode: 13, which: 13, bubbles: true, cancelable: true})); if (self.VKI_target.nodeName != "TEXTAREA") { if (self.VKI_enterSubmit && self.VKI_target.form) { for (var z = 0, subm = false; z < self.VKI_target.form.elements.length; z++) diff --git a/static/js/linker.v3/main.js b/static/js/linker.v3/main.js index b5dab6b852..700de691c4 100644 --- a/static/js/linker.v3/main.js +++ b/static/js/linker.v3/main.js @@ -6,6 +6,10 @@ import {LinkExcluder} from "./excluder"; (function(ns) { + function escapeRegExp(string) { + return string.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string + } + function sanitizeElem(elem) { const cleaned = DOMPurify.sanitize(elem, { USE_PROFILES: { html: true } }); const cleanedElem = document.createElement("div"); @@ -100,7 +104,6 @@ import {LinkExcluder} from "./excluder"; function findOccurrences(text) { const occurrences = []; findAndReplaceDOMText(document, { - preset: 'prose', find: text, replace: 
function(portion, match) { if (portion.index === 0) { @@ -114,7 +117,7 @@ import {LinkExcluder} from "./excluder"; function getNextWhiteSpaceIndex(text) { const match = text.match(/\S\s+/); // `\S` so whitespace can't be at beginning of string - if (match === null || text.substring(0, match.index+1).indexOf('\n') > -1) { return -1; } // \n's are added in by Readability and therefore make it challenging to match against. stop when you hit one. + if (match === null) { return -1; } return match.index + 1; } @@ -142,9 +145,12 @@ import {LinkExcluder} from "./excluder"; const newEndChar = getNthWhiteSpaceIndex(text, numWordsAround, endChar); const textRev = [...text].reverse().join(""); const newStartChar = text.length - getNthWhiteSpaceIndex(textRev, numWordsAround, text.length - startChar); - const wordsAroundText = text.substring(newStartChar, newEndChar); + const wordsAroundText = escapeRegExp(text.substring(newStartChar, newEndChar)); + // findAndReplaceDOMText and Readability deal with element boundaries differently + // in order to more flexibly find these boundaries, we treat all whitespace the same + const wordsAroundReg = wordsAroundText.replace(/\s+/g, '\\s+'); return { - text: wordsAroundText, + text: RegExp(wordsAroundReg, "g"), startChar: startChar - newStartChar, }; } @@ -193,6 +199,26 @@ import {LinkExcluder} from "./excluder"; return node; } } + function isMatchUniqueEnough(globalLinkStarts, match, charError=5) { + /** + * Return true if `match` represents one of the matches we've determined to be unique enough to represent this link + */ + for (let globalStart of globalLinkStarts) { + if (Math.abs(match.startIndex - globalStart) <= charError) { + return true; + } + } + return false; + } + function isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength=30) { + /** + * return true if first occurrence is sufficiently long (longer than `maxSearchLength`) + * AND searchText includes more than just the text of the link. 
+ */ + if (occurrences.length === 0) { return false; } + const firstOccurrenceLength = occurrences[0][1] - occurrences[0][0]; + return firstOccurrenceLength >= maxSearchLength && firstOccurrenceLength > linkObj.text.length; + } function wrapRef(linkObj, normalizedText, refData, iLinkObj, resultsKey, maxNumWordsAround = 10, maxSearchLength = 30) { /** @@ -218,9 +244,9 @@ import {LinkExcluder} from "./excluder"; ({ text: searchText, startChar: linkStartChar } = getNumWordsAround(linkObj, normalizedText, numWordsAround)); occurrences = findOccurrences(searchText); numWordsAround += 1; - if (searchText.length >= maxSearchLength) { break; } + if (isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength)) { break; } } - if (occurrences.length === 0 || (occurrences.length > 1 && searchText.length < maxSearchLength)) { + if (occurrences.length !== 1 && !isMatchedTextUniqueEnough(occurrences, linkObj, maxSearchLength)) { if (ns.debug) { console.log("MISSED", numWordsAround, occurrences.length, linkObj); } @@ -228,11 +254,11 @@ import {LinkExcluder} from "./excluder"; } const globalLinkStarts = occurrences.map(([start, end]) => linkStartChar + start); findAndReplaceDOMText(document, { - preset: 'prose', find: linkObj.text, replace: function(portion, match) { - // check this is the unique match found above - if (globalLinkStarts.indexOf(match.startIndex) === -1) { return portion.text; } + if (!isMatchUniqueEnough(globalLinkStarts, match)) { + return portion.text; + } // check if should be excluded from linking and/or tracking const matchKey = match.startIndex + "|" + match.endIndex; diff --git a/static/js/linker.v3/popup.js b/static/js/linker.v3/popup.js index fe80d935cb..aaeeb01614 100644 --- a/static/js/linker.v3/popup.js +++ b/static/js/linker.v3/popup.js @@ -289,7 +289,7 @@ export class PopupManager { this.linkerHeader.style["border-top-color"] = this.category_colors[primaryCategory]; // TODO is this right? - if (this.contentLang !== "he") { + if (this.contentLang.slice(0, 2) !== "he") { // [].forEach.call(heElems, function(e) {e.style.display = "None"}); this.heTitle.style.display = "None"; [].forEach.call(this.enElems, function(e) {e.style.display = "Block"}); @@ -413,6 +413,9 @@ export class PopupManager { elem.addEventListener('mouseout', this.hidePopup, false); } else if (this.mode === "popup-click") { elem.addEventListener('click', (event) => { + if (event.ctrlKey) { + return; + } event.preventDefault(); event.stopPropagation(); this.showPopup(elem, source); @@ -420,4 +423,4 @@ export class PopupManager { }, false); } } -} \ No newline at end of file +} diff --git a/static/js/sefaria/sefaria.js b/static/js/sefaria/sefaria.js index ecffc50cd5..c9b99d7673 100644 --- a/static/js/sefaria/sefaria.js +++ b/static/js/sefaria/sefaria.js @@ -479,16 +479,16 @@ Sefaria = extend(Sefaria, { let refStrs = [""]; refs.map(ref => { let last = refStrs[refStrs.length-1]; - const encodedRef = encodeURIComponent(ref) - if (`${hostStr}${last}|${encodedRef}${paramStr}`.length > MAX_URL_LENGTH) { - refStrs.push(encodedRef) + const encodedFullURL = encodeURI(`${hostStr}${last}|${ref}${paramStr}`); + if (encodedFullURL.length > MAX_URL_LENGTH) { + refStrs.push(ref) } else { - refStrs[refStrs.length-1] += last.length ? `|${encodedRef}` : encodedRef; + refStrs[refStrs.length-1] += last.length ? 
`|${ref}` : ref; } }); let promises = refStrs.map(refStr => this._cachedApiPromise({ - url: `${hostStr}${refStr}${paramStr}`, + url: `${hostStr}${encodeURIComponent(refStr)}${paramStr}`, key: refStr + paramStr, store: this._bulkTexts })); @@ -623,27 +623,52 @@ Sefaria = extend(Sefaria, { firstName: firstName, lastName: lastName, }; - return await Sefaria.postToApi(`/api/subscribe/${email}`, null, payload); + return await Sefaria.apiRequestWithBody(`/api/subscribe/${email}`, null, payload); }, subscribeSteinsaltzNewsletter: async function(firstName, lastName, email) { const payload = {firstName, lastName}; - return await Sefaria.postToApi(`/api/subscribe/steinsaltz/${email}`, null, payload); + return await Sefaria.apiRequestWithBody(`/api/subscribe/steinsaltz/${email}`, null, payload); }, - - postToApi: async function(url, urlParams, payload) { + postRefTopicLink: function(refInUrl, payload) { + const url = `/api/ref-topic-links/${Sefaria.normRef(refInUrl)}`; + // payload will need to be refactored once /api/ref-topic-links takes a more standard input + return Sefaria.adminEditorApiRequest(url, null, payload); + }, + adminEditorApiRequest: async function(url, urlParams, payload, method="POST") { + /** + * Wraps apiRequestWithBody() with basic alerting if response has an error + */ + let result; + try { + result = await Sefaria.apiRequestWithBody(url, urlParams, payload, method); + } catch (e) { + alert(Sefaria._("Something went wrong. Sorry!")); + throw e; + } + if (result.error) { + alert(result.error); + throw result.error; + } else { + return result; + } + }, + apiRequestWithBody: async function(url, urlParams, payload, method="POST") { + /** + * Generic function for performing an API request with a payload. Payload and urlParams are optional and will not be used if falsy. + */ let apiUrl = this.apiHost + url; if (urlParams) { apiUrl += '?' + new URLSearchParams(urlParams).toString(); } const response = await fetch(apiUrl, { - method: "POST", + method, mode: 'same-origin', headers: { 'X-CSRFToken': Cookies.get('csrftoken'), 'Content-Type': 'application/json' }, credentials: 'same-origin', - body: JSON.stringify(payload) + body: payload && JSON.stringify(payload) }); if (!response.ok) { diff --git a/static/js/sefaria/strings.js b/static/js/sefaria/strings.js index c913571cf2..752f95cdf1 100644 --- a/static/js/sefaria/strings.js +++ b/static/js/sefaria/strings.js @@ -282,6 +282,7 @@ const Strings = { "Location: ": "מיקום: ", "Translations": "תרגומים", "Uncategorized": "לא מסווג", + "Text display options": "אפשרויות תצוגת טקסט", // Collections "Collections": "אסופות", diff --git a/templates/base.html b/templates/base.html index 13238c6d3e..9aba8e05ff 100644 --- a/templates/base.html +++ b/templates/base.html @@ -151,7 +151,8 @@ {% endif %} - + + diff --git a/templates/static/en/about.html b/templates/static/en/about.html index 11cf8b0a87..f8ec4373fd 100644 --- a/templates/static/en/about.html +++ b/templates/static/en/about.html @@ -272,6 +272,18 @@

Sefaria adds a French Jerusalem Talmud to its collection of translated texts, which includes a German translation of the Babylonian Talmud. +
+

2023

+
+ Along with the rest of the Jewish world, the Sefaria team mourned the horrific attacks of October 7th. In the difficult months that followed, Sefaria worked to support our colleagues in Israel and to continue expanding access to Torah as a source of comfort and strength for the Jewish people. +
+
+ Sefaria’s R&D arm, Sefaria Ventures, launches a groundbreaking partnership with AppliedAI and the Technical University of Munich (TUM) to explore the possibilities of leveraging AI to significantly expand access to Torah. +
+
+ Sefaria partners with the Steinsaltz Center and the Aleph Society to launch a digital collection of Rabbi Adin Steinsaltz’s complete works of commentary, making the renowned rabbi’s writings available to all who wish to learn. +
+

diff --git a/templates/static/he/about.html b/templates/static/he/about.html index a48e15d850..b3a7ee5736 100644 --- a/templates/static/he/about.html +++ b/templates/static/he/about.html @@ -279,6 +279,18 @@

ספריא מוסיפה את התלמוד הירושלמי בצרפתית לאוסף המקורות המתורגמים, הכולל תרגום של התלמוד הבבלי לגרמנית. +
+

2023

+
+ יחד עם כל העולם היהודי, צוות ספריא התאבל על הטבח הנורא של ה-7 באוקטובר. בחודשים הקשים שלאחר מכן פעלנו כדי לתמוך בעמיתינו בישראל וכדי להמשיך את הגדלת הספריה והנגישות למקורות היהדות שמהווים עבור רבים בעם היהודי מקור לנחמה, כוח ותקווה. +
+
+ מחלקת המחקר והפיתוח של ספריא משיקה שותפות פורצת דרך עם AppliedAI והאוניברסיטה הטכנית של מינכן (TUM). מטרת השותפות היא בחינת האפשרויות הטמונות במינוף של אינטליגנציה מלאכותית ככלי להרחבה משמעותית של גישה ציבורית לתורה. +
+
+ ספריא יוצרת שותפות עם מרכז שטיינזלץ וה-Aleph Society כדי להשיק אוסף דיגיטלי של כל הפרשנויות שכתב הרב עדין שטיינזלץ, ובכך להנגיש את כלל כתביו של הרב הנודע לכל לומד או לומדת באשר יהיו. +
+

diff --git a/templates/static/link-to-annual-report.html b/templates/static/link-to-annual-report.html index fb4e72013a..111388ad80 100644 --- a/templates/static/link-to-annual-report.html +++ b/templates/static/link-to-annual-report.html @@ -15,7 +15,7 @@

Annual Report דו"ח שנתי

{% endif %} - +
diff --git a/templates/static/the-sefaria-story.html b/templates/static/the-sefaria-story.html index 00300fdccc..12c67e71a2 100644 --- a/templates/static/the-sefaria-story.html +++ b/templates/static/the-sefaria-story.html @@ -43,7 +43,7 @@