diff --git a/build/ci/integration-values.yaml b/build/ci/integration-values.yaml index cbdf7b6fd0..e8619f44c7 100644 --- a/build/ci/integration-values.yaml +++ b/build/ci/integration-values.yaml @@ -57,7 +57,7 @@ localSettings: DEBUG: true DOMAIN_LANGUAGE: {} APSCHEDULER_NAME: "apscheduler-{{ .Values.deployEnv }}" - SEARCH_ADMIN: "http://elasticsearch-data:9200" + SEARCH_URL: "http://elasticsearch-data:9200" TURN_SERVER: '' USE_CLOUDFLARE: false FRONT_END_URL: "http://${NAME}.integration.sefaria.org" diff --git a/build/ci/production-values.yaml b/build/ci/production-values.yaml index f55f4fcc41..bbf9b243d0 100644 --- a/build/ci/production-values.yaml +++ b/build/ci/production-values.yaml @@ -179,6 +179,8 @@ cronJobs: enabled: true reindexElasticSearch: enabled: true + SEARCH_HOST_ES6: "elasticsearch-data" + SEARCH_HOST_ES8: "elasticsearch-es-http.elasticsearch.svc" topicsIndexing: enabled: true trello: @@ -206,7 +208,7 @@ localSettings: } MONGO_HOST: "mongo" APSCHEDULER_NAME: "apscheduler-{{ .Values.deployEnv }}" - SEARCH_ADMIN: "http://elasticsearch-data:9200" + SEARCH_URL: "http://elasticsearch-data:9200" TURN_SERVER: '' USE_CLOUDFLARE: false FRONT_END_URL: "http://www.sefaria.org" @@ -216,7 +218,6 @@ localSettings: GLOBAL_WARNING: false GLOBAL_WARNING_MESSAGE: "Sefaria will be in Read-Only mode for scheduled maintenance from 4:45pm-6:45pm Pacific time. Edits will not be saved during that time." SITE_PACKAGE: "sites.sefaria" - SEARCH_HOST: elasticsearch.data DEFAULT_FROM_EMAIL: "Sefaria " SERVER_EMAIL: "dev@sefaria.org" MULTISERVER_ENABLED: "True" diff --git a/build/ci/sandbox-values.yaml b/build/ci/sandbox-values.yaml index 776d232677..12dcce6766 100644 --- a/build/ci/sandbox-values.yaml +++ b/build/ci/sandbox-values.yaml @@ -53,7 +53,7 @@ localSettings: DEBUG: false DOMAIN_LANGUAGE: {} APSCHEDULER_NAME: "apscheduler-{{ .Values.deployEnv }}" - SEARCH_ADMIN: "http://elasticsearch-data:9200" + SEARCH_URL: "http://elasticsearch-data:9200" TURN_SERVER: '' USE_CLOUDFLARE: false FRONT_END_URL: "http://${NAME}.cauldron.sefaria.org" diff --git a/helm-chart/sefaria-project/templates/_helpers.tpl b/helm-chart/sefaria-project/templates/_helpers.tpl index 0e7c9ecb57..ce020c7bb7 100644 --- a/helm-chart/sefaria-project/templates/_helpers.tpl +++ b/helm-chart/sefaria-project/templates/_helpers.tpl @@ -54,6 +54,22 @@ elastic-certificate-{{ .Values.deployEnv }} {{- end }} {{- end }} +{{- define "sefaria.secrets.elasticUser" }} +{{- if .Values.secrets.elasticUser.ref -}} +{{- .Values.secrets.elasticUser.ref }} +{{- else -}} +elastic-user-{{ .Values.deployEnv }} +{{- end }} +{{- end }} + +{{- define "sefaria.secrets.elasticAdmin" }} +{{- if .Values.secrets.elasticAdmin.ref -}} +{{- .Values.secrets.elasticAdmin.ref }} +{{- else -}} +elastic-admin-{{ .Values.deployEnv }} +{{- end }} +{{- end }} + {{- define "sefaria.secrets.originTls" }} {{- if .Values.ingress.secrets.originTls.ref -}} diff --git a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml index 4a3a9bb6ac..3c57402e4e 100644 --- a/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml +++ b/helm-chart/sefaria-project/templates/configmap/local-settings-file.yaml @@ -136,16 +136,11 @@ data: } SERVER_EMAIL = os.getenv("SERVER_EMAIL") - SEARCH_HOST = "/api/search" - SEARCH_ADMIN = os.getenv("SEARCH_ADMIN") - SEARCH_ADMIN_USER = os.getenv("SEARCH_ADMIN_USER") - SEARCH_ADMIN_PW = os.getenv("SEARCH_ADMIN_PW") - SEARCH_ADMIN_K8S = 
os.getenv("SEARCH_ADMIN_K8S") + auth_str = f'{os.getenv("ELASTIC_USERNAME")}:{os.getenv("ELASTIC_PASSWORD")}@' if os.getenv("ELASTIC_USERNAME") else '' + SEARCH_URL = f'http://{auth_str}{os.getenv("SEARCH_HOST")}:9200' SEARCH_INDEX_ON_SAVE = True - SEARCH_INDEX_NAME = "sefaria" SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' - SEARCH_INDEX_NAME_MERGED = 'merged' TURN_SERVER = os.getenv("TURN_SERVER") #coturn.cauldron.sefaria.org TURN_SECRET= os.getenv("TURN_SECRET") diff --git a/helm-chart/sefaria-project/templates/configmap/local-settings.yaml b/helm-chart/sefaria-project/templates/configmap/local-settings.yaml index ffe59971d5..aa39d2ddc4 100644 --- a/helm-chart/sefaria-project/templates/configmap/local-settings.yaml +++ b/helm-chart/sefaria-project/templates/configmap/local-settings.yaml @@ -9,7 +9,6 @@ data: DEBUG: "{{ .Values.localSettings.DEBUG }}" DOMAIN_LANGUAGE: {{ .Values.localSettings.DOMAIN_LANGUAGE | toJson | quote }} APSCHEDULER_NAME: {{ tpl .Values.localSettings.APSCHEDULER_NAME . | quote }} - SEARCH_ADMIN: "http://{{ .Values.nginx.SEARCH_HOST }}:9200" TURN_SERVER: {{ .Values.localSettings.TURN_SERVER | quote }} USE_CLOUDFLARE: "{{ .Values.localSettings.USE_CLOUDFLARE }}" FRONT_END_URL: {{ .Values.localSettings.FRONT_END_URL | quote }} @@ -26,3 +25,4 @@ data: SENTRY_ENVIRONMENT: {{ .Values.deployEnv | quote }} SENTRY_CODE_VERSION: {{ .Values.web.containerImage.tag }} FAIL_GRACEFULLY: "{{ .Values.localSettings.FAIL_GRACEFULLY }}" + SEARCH_HOST: {{ .Values.nginx.SEARCH_HOST | quote }} diff --git a/helm-chart/sefaria-project/templates/configmap/nginx.yaml b/helm-chart/sefaria-project/templates/configmap/nginx.yaml index 0b118d9b24..0c2fba7288 100644 --- a/helm-chart/sefaria-project/templates/configmap/nginx.yaml +++ b/helm-chart/sefaria-project/templates/configmap/nginx.yaml @@ -28,6 +28,16 @@ data: } } {{- end }} + entrypoint.sh: | + #!/bin/bash + + set -e + + export ELASTIC_AUTH_HEADER=$(echo -n $ELASTIC_USERNAME:$ELASTIC_PASSWORD | base64) + envsubst '${ENV_NAME},${VARNISH_HOST},${SEARCH_HOST},${RELEASE_TAG},${STRAPI_LOCATION},${ELASTIC_AUTH_HEADER}{{- if .Values.linker.enabled }},${LINKER_HOST}{{- end }}{{- if .Values.instrumentation.enabled }},${NGINX_VERSION}{{- end }}' < /conf/nginx.template.conf > /nginx.conf + + nginx -c /nginx.conf -g 'daemon off;' + nginx.template.conf: |- {{- if .Values.instrumentation.enabled }} load_module /etc/nginx/modules/ngx_http_opentracing_module.so; @@ -108,6 +118,7 @@ data: location /api/search/ { rewrite ^/(?:api/search)/(.*)$ /$1 break; proxy_set_header Content-Type application/json; # es 6.0 requires this header + proxy_set_header Authorization "Basic ${ELASTIC_AUTH_HEADER}"; add_header 'Access-Control-Allow-Origin' ''; proxy_pass http://elasticsearch_upstream/; } diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml new file mode 100644 index 0000000000..9d15f9fb38 --- /dev/null +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch-es6.yaml @@ -0,0 +1,77 @@ +{{- if .Values.cronJobs.reindexElasticSearch.enabled }} +--- +apiVersion: batch/v1 +kind: CronJob +metadata: + name: {{ .Values.deployEnv }}-reindex-elastic-search-es6 + labels: + {{- include "sefaria.labels" . 
| nindent 4 }} +spec: + schedule: "20 13 * * 0" + jobTemplate: + spec: + backoffLimit: 1 + template: + spec: + affinity: + podAntiAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + - labelSelector: + matchExpressions: + - key: app + operator: In + values: + - mongo + topologyKey: kubernetes.io/hostname + containers: + - name: reindex-elastic-search-es6 + image: "{{ .Values.web.containerImage.imageRegistry }}:{{ .Values.web.containerImage.tag }}" + resources: + limits: + memory: 9Gi + requests: + memory: 7Gi + env: + - name: SEARCH_HOST + value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES6 }}" + - name: REDIS_HOST + value: "redis-{{ .Values.deployEnv }}" + - name: NODEJS_HOST + value: "node-{{ .Values.deployEnv }}-{{ .Release.Revision }}" + - name: VARNISH_HOST + value: "varnish-{{ .Values.deployEnv }}-{{ .Release.Revision }}" + - name: SLACK_URL + valueFrom: + secretKeyRef: + name: {{ template "sefaria.secrets.slackWebhook" . }} + key: slack-webhook + envFrom: + - secretRef: + name: {{ .Values.secrets.localSettings.ref }} + optional: true + - configMapRef: + name: local-settings-{{ .Values.deployEnv }} + - secretRef: + name: local-settings-secrets-{{ .Values.deployEnv }} + optional: true + volumeMounts: + - mountPath: /app/sefaria/local_settings.py + name: local-settings + subPath: local_settings.py + readOnly: true + command: ["bash"] + args: [ + "-c", + "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob_ES6.py" + ] + restartPolicy: Never + volumes: + - name: local-settings + configMap: + name: local-settings-file-{{ .Values.deployEnv }} + items: + - key: local_settings.py + path: local_settings.py + successfulJobsHistoryLimit: 1 + failedJobsHistoryLimit: 2 +{{- end }} diff --git a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml index 9cefe302fc..97ca61e379 100644 --- a/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml +++ b/helm-chart/sefaria-project/templates/cronjob/reindex-elasticsearch.yaml @@ -7,7 +7,7 @@ metadata: labels: {{- include "sefaria.labels" . | nindent 4 }} spec: - schedule: "20 13 * * 0" + schedule: "20 13 * * 2" jobTemplate: spec: backoffLimit: 1 @@ -32,6 +32,8 @@ spec: requests: memory: 7Gi env: + - name: SEARCH_HOST + value: "{{ .Values.cronJobs.reindexElasticSearch.SEARCH_HOST_ES8 }}" - name: REDIS_HOST value: "redis-{{ .Values.deployEnv }}" - name: NODEJS_HOST @@ -44,14 +46,16 @@ spec: name: {{ template "sefaria.secrets.slackWebhook" . }} key: slack-webhook envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticAdmin" . 
}} - secretRef: name: {{ .Values.secrets.localSettings.ref }} optional: true + - configMapRef: + name: local-settings-{{ .Values.deployEnv }} - secretRef: name: local-settings-secrets-{{ .Values.deployEnv }} optional: true - - configMapRef: - name: local-settings-{{ .Values.deployEnv }} volumeMounts: - mountPath: /app/sefaria/local_settings.py name: local-settings @@ -60,7 +64,7 @@ spec: command: ["bash"] args: [ "-c", - "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" + "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy elasticsearch==8.8.2 git+https://github.com/Sefaria/elasticsearch-dsl-py@v8.0.0#egg=elasticsearch-dsl && /app/run /app/scripts/reindex_elasticsearch_cronjob.py" ] restartPolicy: Never volumes: diff --git a/helm-chart/sefaria-project/templates/rollout/nginx.yaml b/helm-chart/sefaria-project/templates/rollout/nginx.yaml index f1b24d995d..e2e2334473 100644 --- a/helm-chart/sefaria-project/templates/rollout/nginx.yaml +++ b/helm-chart/sefaria-project/templates/rollout/nginx.yaml @@ -52,9 +52,7 @@ spec: - name: nginx image: "{{ .Values.nginx.containerImage.imageRegistry }}:{{ .Values.nginx.containerImage.tag }}" imagePullPolicy: Always - command: ["bash", "-c"] - # https://serverfault.com/questions/577370/how-can-i-use-environment-variables-in-nginx-conf - args: [ "envsubst '${ENV_NAME},${VARNISH_HOST},${SEARCH_HOST},${RELEASE_TAG},${STRAPI_LOCATION}{{- if .Values.linker.enabled }},${LINKER_HOST}{{- end }}{{- if .Values.instrumentation.enabled }},${NGINX_VERSION}{{- end }}' < /conf/nginx.template.conf > /nginx.conf && exec nginx -c /nginx.conf -g 'daemon off;'" ] + command: ["bash", "-c", "/usr/src/entrypoint.sh"] ports: - containerPort: 80 - containerPort: 443 @@ -76,6 +74,9 @@ spec: name: nginx-conf subPath: nginx.template.conf readOnly: true + - mountPath: /usr/src/entrypoint.sh + name: nginx-conf + subPath: entrypoint.sh {{- if .Values.instrumentation.enabled }} - mountPath: /etc/nginx/opentracing.json name: nginx-conf @@ -106,6 +107,8 @@ spec: value: "linker-{{ .Values.deployEnv }}-{{ .Release.Revision }}" {{- end }} envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticUser" . }} - configMapRef: name: local-settings-nginx-{{ .Values.deployEnv }} optional: true @@ -116,6 +119,7 @@ spec: - name: nginx-conf configMap: name: nginx-conf-{{ .Values.deployEnv }} + defaultMode: 0755 - name: robots-txt configMap: name: robots-txt-{{ .Values.deployEnv }} diff --git a/helm-chart/sefaria-project/templates/rollout/web.yaml b/helm-chart/sefaria-project/templates/rollout/web.yaml index d1862c73dc..eced50467c 100644 --- a/helm-chart/sefaria-project/templates/rollout/web.yaml +++ b/helm-chart/sefaria-project/templates/rollout/web.yaml @@ -118,6 +118,8 @@ spec: value: k8s.container.name=app,k8s.deployment.name={{ .Values.deployEnv }}-web,k8s.namespace.name={{ .Release.Namespace }},k8s.node.name=$(OTEL_RESOURCE_ATTRIBUTES_NODE_NAME),k8s.pod.name=$(OTEL_RESOURCE_ATTRIBUTES_POD_NAME) {{- end }} envFrom: + - secretRef: + name: {{ template "sefaria.secrets.elasticUser" . 
}} - secretRef: name: {{ .Values.secrets.localSettings.ref }} optional: true diff --git a/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml b/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml new file mode 100644 index 0000000000..d6a3266af9 --- /dev/null +++ b/helm-chart/sefaria-project/templates/secret/elastic-admin.yaml @@ -0,0 +1,11 @@ +{{- if .Values.secrets.elasticAdmin.data }} +apiVersion: v1 +kind: Secret +metadata: + name: elastic-admin-{{ .Values.deployEnv }} + labels: + deployEnv: "{{ .Values.deployEnv }}" + {{- include "sefaria.labels" . | nindent 4 }} +type: Opaque +stringData: {{ .Values.secrets.elasticAdmin.data | toYaml | nindent 2 }} +{{- end }} diff --git a/helm-chart/sefaria-project/templates/secret/elastic-user.yaml b/helm-chart/sefaria-project/templates/secret/elastic-user.yaml new file mode 100644 index 0000000000..511d271a26 --- /dev/null +++ b/helm-chart/sefaria-project/templates/secret/elastic-user.yaml @@ -0,0 +1,11 @@ +{{- if .Values.secrets.elasticUser.data }} +apiVersion: v1 +kind: Secret +metadata: + name: elastic-user-{{ .Values.deployEnv }} + labels: + deployEnv: "{{ .Values.deployEnv }}" + {{- include "sefaria.labels" . | nindent 4 }} +type: Opaque +stringData: {{ .Values.secrets.elasticUser.data | toYaml | nindent 2 }} +{{- end }} diff --git a/helm-chart/sefaria-project/values.yaml b/helm-chart/sefaria-project/values.yaml index 563a5c6e01..72722c98b7 100644 --- a/helm-chart/sefaria-project/values.yaml +++ b/helm-chart/sefaria-project/values.yaml @@ -329,9 +329,7 @@ secrets: # SEFARIA_DB: # SEFARIA_DB_USER: # SEFARIA_DB_PASSWORD: - # SEARCH_ADMIN_USER: - # SEARCH_ADMIN_PW: - # SEARCH_ADMIN_K8S: + # SEARCH_URL # TURN_SECRET: # TURN_USER: # SEFARIA_BOT_API_KEY: @@ -372,6 +370,16 @@ secrets: # should be commented out and vice-versa. ref: trello-secret # data: + elasticUser: + # If you're using a reference to an existing secret then the data: section + # should be commented out and vice-versa. + ref: elastic-user + # data: + elasticAdmin: + # If you're using a reference to an existing secret then the data: section + # should be commented out and vice-versa. 
+ ref: elastic-admin + # data: # Settings for various cronjobs @@ -391,6 +399,8 @@ cronJobs: enabled: false reindexElasticSearch: enabled: false + SEARCH_HOST_ES6: "" + SEARCH_HOST_ES8: "" topicsIndexing: enabled: false trello: diff --git a/reader/views.py b/reader/views.py index dfdceda26d..8701e0dfad 100644 --- a/reader/views.py +++ b/reader/views.py @@ -48,7 +48,7 @@ from sefaria.utils.util import text_preview, short_to_long_lang_code, epoch_time from sefaria.utils.hebrew import hebrew_term, has_hebrew from sefaria.utils.calendars import get_all_calendar_items, get_todays_calendar_items, get_keyed_calendar_items, get_parasha, get_todays_parasha -from sefaria.settings import STATIC_URL, USE_VARNISH, USE_NODE, NODE_HOST, DOMAIN_LANGUAGES, MULTISERVER_ENABLED, SEARCH_ADMIN, MULTISERVER_REDIS_SERVER, \ +from sefaria.settings import STATIC_URL, USE_VARNISH, USE_NODE, NODE_HOST, DOMAIN_LANGUAGES, MULTISERVER_ENABLED, MULTISERVER_REDIS_SERVER, \ MULTISERVER_REDIS_PORT, MULTISERVER_REDIS_DB, DISABLE_AUTOCOMPLETER, ENABLE_LINKER from sefaria.site.site_settings import SITE_SETTINGS from sefaria.system.multiserver.coordinator import server_coordinator @@ -4200,19 +4200,29 @@ def dummy_search_api(request): @csrf_exempt -def search_wrapper_api(request): +def search_wrapper_api(request, es6_compat=False): + """ + @param request: + @param es6_compat: True to return API response that's compatible with an Elasticsearch 6 compatible client + @return: + """ + from sefaria.helper.search import get_elasticsearch_client + if request.method == "POST": if "json" in request.POST: j = request.POST.get("json") # using form-urlencoded else: j = request.body # using content-type: application/json j = json.loads(j) - es_client = Elasticsearch(SEARCH_ADMIN) + es_client = get_elasticsearch_client() search_obj = Search(using=es_client, index=j.get("type")).params(request_timeout=5) search_obj = get_query_obj(search_obj=search_obj, **j) response = search_obj.execute() if response.success(): - return jsonResponse(response.to_dict(), callback=request.GET.get("callback", None)) + response_json = getattr(response.to_dict(), 'body', response.to_dict()) + if es6_compat and isinstance(response_json['hits']['total'], dict): + response_json['hits']['total'] = response_json['hits']['total']['value'] + return jsonResponse(response_json, callback=request.GET.get("callback", None)) return jsonResponse({"error": "Error with connection to Elasticsearch. Total shards: {}, Shards successful: {}, Timed out: {}".format(response._shards.total, response._shards.successful, response.timed_out)}, callback=request.GET.get("callback", None)) return jsonResponse({"error": "Unsupported HTTP method."}, callback=request.GET.get("callback", None)) diff --git a/scripts/reindex_elasticsearch_cronjob_ES6.py b/scripts/reindex_elasticsearch_cronjob_ES6.py new file mode 100644 index 0000000000..1a3f181eb2 --- /dev/null +++ b/scripts/reindex_elasticsearch_cronjob_ES6.py @@ -0,0 +1,49 @@ +""" +This file is meant to be temporary while we are migrating to elasticsearch 8 +""" +from datetime import datetime +import requests +import traceback +import os +import django +django.setup() +from sefaria.model import * +from sefaria.search_ES6 import index_all +from sefaria.local_settings import SEFARIA_BOT_API_KEY +from sefaria.pagesheetrank import update_pagesheetrank + +""" +Source sheets added after last_sheet_timestamp will be missing from the index process. We want to manually index all +source sheets created after this. 
Depending on the database being used to index the timestamp will be different. If +running against a production database, last_sheet_timestamp will be the time this script began running. Otherwise, this +value will need to be set to the time at which the last mongo dump was created (assuming the database is using the most +up-to-date mongo dump). +""" +# last_sheet_timestamp = datetime.fromtimestamp(os.path.getmtime("/var/data/sefaria_public/dump/sefaria")).isoformat() +try: + last_sheet_timestamp = datetime.now().isoformat() + update_pagesheetrank() + index_all() + r = requests.post("https://www.sefaria.org/admin/index-sheets-by-timestamp", data={"timestamp": last_sheet_timestamp, "apikey": SEFARIA_BOT_API_KEY}) + if "error" in r.text: + raise Exception("Error when calling admin/index-sheets-by-timestamp API: " + r.text) + else: + print("SUCCESS!", r.text) +except Exception as e: + tb_str = traceback.format_exc() + print("Caught exception") + post_object = { + "icon_emoji": ":facepalm:", + "username": "Reindex ElasticSearch", + "channel": "#engineering-discuss", + "attachments": [ + { + "fallback": tb_str, + "color": "#a30200", + "pretext": "Cronjob Error", + "text": tb_str + } + ] + } + requests.post(os.environ['SLACK_URL'], json=post_object) + raise e diff --git a/sefaria/helper/search.py b/sefaria/helper/search.py index 36aef9eb3e..757c931bfa 100644 --- a/sefaria/helper/search.py +++ b/sefaria/helper/search.py @@ -144,3 +144,9 @@ def make_filter(type, agg_type, agg_key): return Regexp(path=reg) elif type == "sheet": return Term(**{agg_type: agg_key}) + + +def get_elasticsearch_client(): + from elasticsearch import Elasticsearch + from sefaria.settings import SEARCH_URL + return Elasticsearch(SEARCH_URL) diff --git a/sefaria/helper/tests/normalization_tests.py b/sefaria/helper/tests/normalization_tests.py index 59e4ed7767..eaff8ff116 100644 --- a/sefaria/helper/tests/normalization_tests.py +++ b/sefaria/helper/tests/normalization_tests.py @@ -102,6 +102,21 @@ def test_nested_itag(): assert text[s:e] == """bullnestedThe.""" +def test_two_steps_normalization(): + test_string = ' This is a {{test}}' + + bracket_normalizer = RegexNormalizer(r'\{\{|}}', r'') + strip_normalizer = RegexNormalizer(r'^\s*|\s*$', r'') + normalizer = NormalizerComposer(steps=[bracket_normalizer, strip_normalizer]) + + mapping = normalizer.get_mapping_after_normalization(test_string, reverse=True) + assert mapping == {0: 1, 11: 3, 17: 5} + orig_inds = [(13, 17)] + new_start, new_end = normalizer.convert_normalized_indices_to_unnormalized_indices(orig_inds, mapping, reverse=True)[0] + normalized_string = normalizer.normalize(test_string) + assert normalized_string[new_start:new_end] == "test" + + def test_word_to_char(): test_string = 'some words go here\n\nhello world' words = ['go', 'here', 'hello'] diff --git a/sefaria/local_settings_ci.py b/sefaria/local_settings_ci.py index de5d56d849..d542f0b58a 100644 --- a/sefaria/local_settings_ci.py +++ b/sefaria/local_settings_ci.py @@ -69,7 +69,7 @@ APSCHEDULER_NAME = "apscheduler" # ElasticSearch server -SEARCH_ADMIN = "http://localhost:9200" +SEARCH_URL = "http://localhost:9200" SEARCH_INDEX_ON_SAVE = False # Whether to send texts and source sheet to Search Host for indexing after save SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' diff --git a/sefaria/local_settings_example.py b/sefaria/local_settings_example.py index b6071153bb..268f30366f 100644 --- a/sefaria/local_settings_example.py +++ 
b/sefaria/local_settings_example.py @@ -156,7 +156,11 @@ # ElasticSearch server -SEARCH_ADMIN = "http://localhost:9200" +# URL to connect to ES server. +# Set this to https://sefaria.org/api/search to connect to production search. +# If ElasticSearch server has a password use the following format: http(s)://{username}:{password}@{base_url} +SEARCH_URL = "http://localhost:9200" + SEARCH_INDEX_ON_SAVE = False # Whether to send texts and source sheet to Search Host for indexing after save SEARCH_INDEX_NAME_TEXT = 'text' # name of the ElasticSearch index to use SEARCH_INDEX_NAME_SHEET = 'sheet' diff --git a/sefaria/search.py b/sefaria/search.py index d42d39e614..3f2c87f714 100644 --- a/sefaria/search.py +++ b/sefaria/search.py @@ -20,7 +20,6 @@ import time as pytime logger = structlog.get_logger(__name__) -from elasticsearch import Elasticsearch from elasticsearch.client import IndicesClient from elasticsearch.helpers import bulk from elasticsearch.exceptions import NotFoundError @@ -31,12 +30,13 @@ from sefaria.system.database import db from sefaria.system.exceptions import InputError from sefaria.utils.util import strip_tags -from .settings import SEARCH_ADMIN, SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET, STATICFILES_DIRS +from .settings import SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET +from sefaria.helper.search import get_elasticsearch_client from sefaria.site.site_settings import SITE_SETTINGS from sefaria.utils.hebrew import strip_cantillation import sefaria.model.queue as qu -es_client = Elasticsearch(SEARCH_ADMIN) +es_client = get_elasticsearch_client() index_client = IndicesClient(es_client) tracer = structlog.get_logger(__name__) @@ -52,7 +52,7 @@ def delete_text(oref, version, lang): curr_index = get_new_and_current_index_names('text')['current'] id = make_text_doc_id(oref.normal(), version, lang) - es_client.delete(index=curr_index, doc_type='text', id=id) + es_client.delete(index=curr_index, id=id) except Exception as e: logger.error("ERROR deleting {} / {} / {} : {}".format(oref.normal(), version, lang, e)) @@ -76,7 +76,7 @@ def delete_version(index, version, lang): def delete_sheet(index_name, id): try: - es_client.delete(index=index_name, doc_type='sheet', id=id) + es_client.delete(index=index_name, id=id) except Exception as e: logger.error("ERROR deleting sheet {}".format(id)) @@ -147,7 +147,7 @@ def index_sheet(index_name, id): "dateModified": sheet.get("dateModified", None), "views": sheet.get("views", 0) } - es_client.create(index=index_name, doc_type='sheet', id=id, body=doc) + es_client.create(index=index_name, id=id, body=doc) global doc_count doc_count += 1 return True @@ -220,7 +220,6 @@ def get_exact_english_analyzer(): "icu_normalizer", ], "filter": [ - "standard", "lowercase", "icu_folding", ], @@ -259,7 +258,7 @@ def create_index(index_name, type): } } print('Creating index {}'.format(index_name)) - index_client.create(index=index_name, body=settings) + index_client.create(index=index_name, settings=settings) if type == 'text': put_text_mapping(index_name) @@ -326,7 +325,7 @@ def put_text_mapping(index_name): } } } - index_client.put_mapping(doc_type='text', body=text_mapping, index=index_name) + index_client.put_mapping(body=text_mapping, index=index_name) def put_sheet_mapping(index_name): @@ -392,7 +391,7 @@ def put_sheet_mapping(index_name): } } } - index_client.put_mapping(doc_type='sheet', body=sheet_mapping, index=index_name) + index_client.put_mapping(body=sheet_mapping, index=index_name) def get_search_categories(oref, categories): toc_tree = 
library.get_toc_tree() @@ -593,7 +592,6 @@ def _cache_action(cls, segment_str, tref, heTref, version): cls._bulk_actions += [ { "_index": cls.index_name, - "_type": "text", "_id": make_text_doc_id(tref, vtitle, vlang), "_source": doc } diff --git a/sefaria/search_ES6.py b/sefaria/search_ES6.py new file mode 100644 index 0000000000..812610eb07 --- /dev/null +++ b/sefaria/search_ES6.py @@ -0,0 +1,844 @@ +# -*- coding: utf-8 -*- +""" +This file is meant to be temporary while we are migrating to elasticsearch 8 + +search.py - full-text search for Sefaria using ElasticSearch + +Writes to MongoDB Collection: index_queue +""" +import os +from datetime import datetime, timedelta +import re +import bleach +import pymongo + +# To allow these files to be run directly from command line (w/o Django shell) +os.environ['DJANGO_SETTINGS_MODULE'] = "settings" + +import structlog +import logging +from logging import NullHandler +from collections import defaultdict +import time as pytime +logger = structlog.get_logger(__name__) + +from elasticsearch import Elasticsearch +from elasticsearch.client import IndicesClient +from elasticsearch.helpers import bulk +from elasticsearch.exceptions import NotFoundError +from sefaria.model import * +from sefaria.model.text import AbstractIndex, AbstractTextRecord +from sefaria.model.user_profile import user_link, public_user_data +from sefaria.model.collection import CollectionSet +from sefaria.system.database import db +from sefaria.system.exceptions import InputError +from sefaria.utils.util import strip_tags +from .settings import SEARCH_URL, SEARCH_INDEX_NAME_TEXT, SEARCH_INDEX_NAME_SHEET, STATICFILES_DIRS +from sefaria.site.site_settings import SITE_SETTINGS +from sefaria.utils.hebrew import strip_cantillation +import sefaria.model.queue as qu + +es_client = Elasticsearch(SEARCH_URL) +index_client = IndicesClient(es_client) + +tracer = structlog.get_logger(__name__) +tracer.setLevel(logging.CRITICAL) +#tracer.addHandler(logging.FileHandler('/tmp/es_trace.log')) +tracer.addHandler(NullHandler()) + +doc_count = 0 + + +def delete_text(oref, version, lang): + try: + curr_index = get_new_and_current_index_names('text')['current'] + + id = make_text_doc_id(oref.normal(), version, lang) + es_client.delete(index=curr_index, doc_type='text', id=id) + except Exception as e: + logger.error("ERROR deleting {} / {} / {} : {}".format(oref.normal(), version, lang, e)) + + +def delete_version(index, version, lang): + assert isinstance(index, AbstractIndex) + + refs = [] + + if SITE_SETTINGS["TORAH_SPECIFIC"]: + all_gemara_indexes = library.get_indexes_in_category("Bavli") + davidson_indexes = all_gemara_indexes[:all_gemara_indexes.index("Horayot") + 1] + if Ref(index.title).is_bavli() and index.title not in davidson_indexes: + refs += index.all_section_refs() + + refs += index.all_segment_refs() + + for ref in refs: + delete_text(ref, version, lang) + + +def delete_sheet(index_name, id): + try: + es_client.delete(index=index_name, doc_type='sheet', id=id) + except Exception as e: + logger.error("ERROR deleting sheet {}".format(id)) + + +def make_text_doc_id(ref, version, lang): + """ + Returns a doc id string for indexing based on ref, versiona and lang. + + [HACK] Since Elasticsearch chokes on non-ascii ids, hebrew titles are converted + into a number using unicode_number. This mapping should be unique, but actually isn't. 
+ (any tips welcome) + """ + if not version.isascii(): + version = str(unicode_number(version)) + + id = "%s (%s [%s])" % (ref, version, lang) + return id + + +def unicode_number(u): + """ + Returns a number corresponding to the sum value + of each unicode character in u + """ + n = 0 + for i in range(len(u)): + n += ord(u[i]) + return n + + +def index_sheet(index_name, id): + """ + Index source sheet with 'id'. + """ + + sheet = db.sheets.find_one({"id": id}) + if not sheet: return False + + pud = public_user_data(sheet["owner"]) + tag_terms_simple = make_sheet_tags(sheet) + tags = [t["en"] for t in tag_terms_simple] + topics = [] + for t in sheet.get('topics', []): + topic_obj = Topic.init(t['slug']) + if not topic_obj: + continue + topics += [topic_obj] + collections = CollectionSet({"sheets": id, "listed": True}) + collection_names = [c.name for c in collections] + try: + doc = { + "title": strip_tags(sheet["title"]), + "content": make_sheet_text(sheet, pud), + "owner_id": sheet["owner"], + "owner_name": pud["name"], + "owner_image": pud["imageUrl"], + "profile_url": pud["profileUrl"], + "version": "Source Sheet by " + user_link(sheet["owner"]), + "tags": tags, + "topic_slugs": [topic_obj.slug for topic_obj in topics], + "topics_en": [topic_obj.get_primary_title('en') for topic_obj in topics], + "topics_he": [topic_obj.get_primary_title('he') for topic_obj in topics], + "sheetId": id, + "summary": sheet.get("summary", None), + "collections": collection_names, + "datePublished": sheet.get("datePublished", None), + "dateCreated": sheet.get("dateCreated", None), + "dateModified": sheet.get("dateModified", None), + "views": sheet.get("views", 0) + } + es_client.create(index=index_name, doc_type='sheet', id=id, body=doc) + global doc_count + doc_count += 1 + return True + except Exception as e: + print("Error indexing sheet %d" % id) + print(e) + return False + + +def make_sheet_tags(sheet): + def get_primary_title(lang, titles): + return [t for t in titles if t.get("primary") and t.get("lang", "") == lang][0]["text"] + + tags = sheet.get('tags', []) + tag_terms = [(Term().load({'name': t}) or Term().load_by_title(t)) for t in tags] + tag_terms_simple = [ + { + 'en': tags[iterm], # save as en even if it's Hebrew + 'he': '' + } if term is None else + { + 'en': get_primary_title('en', term.titles), + 'he': get_primary_title('he', term.titles) + } for iterm, term in enumerate(tag_terms) + ] + #tags_en, tags_he = zip(*tag_terms_simple.values()) + return tag_terms_simple + +def make_sheet_text(sheet, pud): + """ + Returns a plain text representation of the content of sheet. + :param sheet: The sheet record + :param pud: Public User Database record for the author + """ + text = sheet["title"] + "\n{}".format(sheet.get("summary", '')) + if pud.get("name"): + text += "\nBy: " + pud["name"] + text += "\n" + if sheet.get("tags"): + text += " [" + ", ".join(sheet["tags"]) + "]\n" + for s in sheet["sources"]: + text += source_text(s) + " " + + text = bleach.clean(text, strip=True, tags=()) + + return text + + +def source_text(source): + """ + Recursive function to translate a source dictionary into text. 
+ """ + str_fields = ["customTitle", "ref", "comment", "outsideText"] + dict_fields = ["text", "outsideBiText"] + content = [source.get(field, "") for field in str_fields] + content += [val for field in dict_fields for val in source.get(field, {}).values()] + text = " ".join([strip_tags(c) for c in content]) + + if "subsources" in source: + for s in source["subsources"]: + text += source_text(s) + + return text + + +def get_exact_english_analyzer(): + return { + "tokenizer": "standard", + "char_filter": [ + "icu_normalizer", + ], + "filter": [ + "standard", + "lowercase", + "icu_folding", + ], + } + + +def get_stemmed_english_analyzer(): + stemmed_english_analyzer = get_exact_english_analyzer() + stemmed_english_analyzer['filter'] += ["my_snow"] + return stemmed_english_analyzer + + +def create_index(index_name, type): + """ + Clears the indexes and creates it fresh with the below settings. + """ + clear_index(index_name) + + settings = { + "index": { + "blocks": { + "read_only_allow_delete": False + }, + "analysis": { + "analyzer": { + "stemmed_english": get_stemmed_english_analyzer(), + "exact_english": get_exact_english_analyzer(), + }, + "filter": { + "my_snow": { + "type": "snowball", + "language": "English" + } + } + } + } + } + print('Creating index {}'.format(index_name)) + index_client.create(index=index_name, body=settings) + + if type == 'text': + put_text_mapping(index_name) + elif type == 'sheet': + put_sheet_mapping(index_name) + + +def put_text_mapping(index_name): + """ + Settings mapping for the text document type. + """ + text_mapping = { + 'properties' : { + 'categories': { + 'type': 'keyword', + }, + "category": { + 'type': 'keyword', + }, + "he_category": { + 'type': 'keyword', + }, + "index_title": { + 'type': 'keyword', + }, + "path": { + 'type': 'keyword', + }, + "he_index_title": { + 'type': 'keyword', + }, + "he_path": { + 'type': 'keyword', + }, + "order": { + 'type': 'keyword', + }, + "pagesheetrank": { + 'type': 'double', + 'index': False + }, + "comp_date": { + 'type': 'integer', + 'index': False + }, + "version_priority": { + 'type': 'integer', + 'index': False + }, + "exact": { + 'type': 'text', + 'analyzer': 'exact_english' + }, + "naive_lemmatizer": { + 'type': 'text', + 'analyzer': 'sefaria-naive-lemmatizer', + 'search_analyzer': 'sefaria-naive-lemmatizer-less-prefixes', + 'fields': { + 'exact': { + 'type': 'text', + 'analyzer': 'exact_english' + } + } + } + } + } + index_client.put_mapping(doc_type='text', body=text_mapping, index=index_name) + + +def put_sheet_mapping(index_name): + """ + Sets mapping for the sheets document type. 
+ """ + sheet_mapping = { + 'properties': { + 'owner_name': { + 'type': 'keyword' + }, + 'tags': { + 'type': 'keyword' + }, + "topics_en": { + "type": "keyword" + }, + "topics_he": { + "type": "keyword" + }, + "topic_slugs": { + "type": "keyword" + }, + 'owner_image': { + 'type': 'keyword' + }, + 'datePublished': { + 'type': 'date' + }, + 'dateCreated': { + 'type': 'date' + }, + 'dateModified': { + 'type': 'date' + }, + 'sheetId': { + 'type': 'integer' + }, + 'collections': { + 'type': 'keyword' + }, + 'title': { + 'type': 'keyword' + }, + 'views': { + 'type': 'integer' + }, + 'summary': { + 'type': 'keyword' + }, + 'content': { + 'type': 'text', + 'analyzer': 'stemmed_english' + }, + 'version': { + 'type': 'keyword' + }, + 'profile_url': { + 'type': 'keyword' + }, + 'owner_id': { + 'type': 'integer' + } + } + } + index_client.put_mapping(doc_type='sheet', body=sheet_mapping, index=index_name) + +def get_search_categories(oref, categories): + toc_tree = library.get_toc_tree() + cats = oref.index.categories + + indexed_categories = categories # the default + + # get the full path of every cat along the way. + # starting w/ the longest, + # check if they're root swapped. + paths = [cats[:i] for i in range(len(cats), 0, -1)] + for path in paths: + cnode = toc_tree.lookup(path) + if getattr(cnode, "searchRoot", None) is not None: + # Use the specified searchRoot, with the rest of the category path appended. + indexed_categories = [cnode.searchRoot] + cats[len(path) - 1:] + break + return indexed_categories + + +class TextIndexer(object): + + @classmethod + def clear_cache(cls): + cls.terms_dict = None + cls.version_priority_map = None + cls._bulk_actions = None + cls.best_time_period = None + + + @classmethod + def create_terms_dict(cls): + cls.terms_dict = {} + ts = TermSet() + for t in ts: + cls.terms_dict[t.name] = t.contents() + + @classmethod + def create_version_priority_map(cls): + toc = library.get_toc() + cls.version_priority_map = {} + + def traverse(mini_toc): + if type(mini_toc) == list: + for t in mini_toc: + traverse(t) + elif "contents" in mini_toc: + for t in mini_toc["contents"]: + traverse(t) + elif "title" in mini_toc and not mini_toc.get("isCollection", False): + title = mini_toc["title"] + try: + r = Ref(title) + except InputError: + print("Failed to parse ref, {}".format(title)) + return + vlist = cls.get_ref_version_list(r) + vpriorities = defaultdict(lambda: 0) + for i, v in enumerate(vlist): + lang = v.language + cls.version_priority_map[(title, v.versionTitle, lang)] = (vpriorities[lang], mini_toc["categories"]) + vpriorities[lang] += 1 + + traverse(toc) + + @staticmethod + def get_ref_version_list(oref, tries=0): + try: + return oref.index.versionSet().array() + except InputError as e: + print(f"InputError: {oref.normal()}") + return [] + except pymongo.errors.AutoReconnect as e: + if tries < 200: + pytime.sleep(5) + return TextIndexer.get_ref_version_list(oref, tries+1) + else: + print("get_ref_version_list -- Tried: {} times. 
Failed :(".format(tries)) + raise e + + @classmethod + def get_all_versions(cls, tries=0, versions=None, page=0): + versions = versions or [] + try: + version_limit = 10 + temp_versions = [] + first_run = True + while first_run or len(temp_versions) > 0: + temp_versions = VersionSet(limit=version_limit, page=page).array() + versions += temp_versions + page += 1 + first_run = False + return versions + except pymongo.errors.AutoReconnect as e: + if tries < 200: + pytime.sleep(5) + return cls.get_all_versions(tries+1, versions, page) + else: + print("Tried: {} times. Got {} versions".format(tries, len(versions))) + raise e + + @classmethod + def index_all(cls, index_name, debug=False, for_es=True, action=None): + cls.index_name = index_name + cls.create_version_priority_map() + cls.create_terms_dict() + Ref.clear_cache() # try to clear Ref cache to save RAM + + versions = sorted([x for x in cls.get_all_versions() if (x.title, x.versionTitle, x.language) in cls.version_priority_map], key=lambda x: cls.version_priority_map[(x.title, x.versionTitle, x.language)][0]) + versions_by_index = {} + # organizing by index for the merged case. There is no longer a merged case but keeping this logic b/c it seems fine + for v in versions: + key = (v.title, v.language) + if key in versions_by_index: + versions_by_index[key] += [v] + else: + versions_by_index[key] = [v] + print("Beginning index of {} versions.".format(len(versions))) + vcount = 0 + total_versions = len(versions) + versions = None # release RAM + for title, vlist in list(versions_by_index.items()): + cls.curr_index = vlist[0].get_index() if len(vlist) > 0 else None + if for_es: + cls._bulk_actions = [] + try: + cls.best_time_period = cls.curr_index.best_time_period() + except ValueError: + cls.best_time_period = None + for v in vlist: + if v.versionTitle == "Yehoyesh's Yiddish Tanakh Translation [yi]": + print("skipping yiddish. we don't like yiddish") + continue + + cls.index_version(v, action=action) + print("Indexed Version {}/{}".format(vcount, total_versions)) + vcount += 1 + if for_es: + bulk(es_client, cls._bulk_actions, stats_only=True, raise_on_error=False) + + @classmethod + def index_version(cls, version, tries=0, action=None): + if not action: + action = cls._cache_action + try: + version.walk_thru_contents(action, heTref=cls.curr_index.get_title('he'), schema=cls.curr_index.schema, terms_dict=cls.terms_dict) + except pymongo.errors.AutoReconnect as e: + # Adding this because there is a mongo call for dictionary words in walk_thru_contents() + if tries < 200: + pytime.sleep(5) + print("Retrying {}. Try {}".format(version.title, tries)) + cls.index_version(version, tries+1) + else: + print("Tried {} times to get {}. 
I have failed you...".format(tries, version.title)) + raise e + except StopIteration: + print("Could not find dictionary node in {}".format(version.title)) + + @classmethod + def index_ref(cls, index_name, oref, version_title, lang): + # slower than `cls.index_version` but useful when you don't want the overhead of loading all versions into cache + cls.index_name = index_name + cls.curr_index = oref.index + try: + cls.best_time_period = cls.curr_index.best_time_period() + except ValueError: + cls.best_time_period = None + version_priority = 0 + hebrew_version_title = None + for priority, v in enumerate(cls.get_ref_version_list(oref)): + if v.versionTitle == version_title: + version_priority = priority + hebrew_version_title = getattr(v, 'versionTitleInHebrew', None) + content = TextChunk(oref, lang, vtitle=version_title).ja().flatten_to_string() + categories = cls.curr_index.categories + tref = oref.normal() + doc = cls.make_text_index_document(tref, oref.he_normal(), version_title, lang, version_priority, content, categories, hebrew_version_title) + id = make_text_doc_id(tref, version_title, lang) + es_client.index(index_name, doc, id=id) + + @classmethod + def _cache_action(cls, segment_str, tref, heTref, version): + # Index this document as a whole + vtitle = version.versionTitle + vlang = version.language + hebrew_version_title = getattr(version, 'versionTitleInHebrew', None) + try: + version_priority, categories = cls.version_priority_map[(version.title, vtitle, vlang)] + #TODO include sgement_str in this func + doc = cls.make_text_index_document(tref, heTref, vtitle, vlang, version_priority, segment_str, categories, hebrew_version_title) + # print doc + except Exception as e: + logger.error("Error making index document {} / {} / {} : {}".format(tref, vtitle, vlang, str(e))) + return + + if doc: + try: + cls._bulk_actions += [ + { + "_index": cls.index_name, + "_type": "text", + "_id": make_text_doc_id(tref, vtitle, vlang), + "_source": doc + } + ] + except Exception as e: + logger.error("ERROR indexing {} / {} / {} : {}".format(tref, vtitle, vlang, e)) + + @classmethod + def remove_footnotes(cls, content): + ftnotes = AbstractTextRecord.find_all_itags(content, only_footnotes=True)[1] + if len(ftnotes) == 0: + return content + else: + for sup_tag in ftnotes: + i_tag = sup_tag.next_sibling + content += f" {sup_tag.text} {i_tag.text}" + content = AbstractTextRecord.strip_itags(content) + return content + + @classmethod + def modify_text_in_doc(cls, content): + content = AbstractTextRecord.strip_imgs(content) + content = cls.remove_footnotes(content) + content = strip_cantillation(content, strip_vowels=False).strip() + content = re.sub(r'<[^>]+>', ' ', content) # replace HTML tags with space so that words dont get smushed together + content = re.sub(r'\([^)]+\)', ' ', content) # remove all parens + while " " in content: # make sure there are not many spaces in a row + content = content.replace(" ", " ") + return content + + @classmethod + def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories, hebrew_version_title): + """ + Create a document for indexing from the text specified by ref/version/lang + """ + # Don't bother indexing if there's no content + if not content: + return False + content = cls.modify_text_in_doc(content) + if len(content) == 0: + return False + + oref = Ref(tref) + + indexed_categories = get_search_categories(oref, categories) + + tp = cls.best_time_period + if tp is not None: + comp_start_date = int(tp.start) + else: + 
comp_start_date = 3000 # far in the future + + ref_data = RefData().load({"ref": tref}) + pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK + + return { + "ref": tref, + "heRef": heTref, + "version": version, + "lang": lang, + "version_priority": version_priority if version_priority is not None else 1000, + "titleVariants": oref.index_node.all_tree_titles("en"), + "categories": indexed_categories, + "order": oref.order_id(), + "path": "/".join(indexed_categories + [cls.curr_index.title]), + "pagesheetrank": pagesheetrank, + "comp_date": comp_start_date, + #"hebmorph_semi_exact": content, + "exact": content, + "naive_lemmatizer": content, + 'hebrew_version_title': hebrew_version_title, + } + + +def index_sheets_by_timestamp(timestamp): + """ + :param timestamp str: index all sheets modified after `timestamp` (in isoformat) + """ + + name_dict = get_new_and_current_index_names('sheet', debug=False) + curr_index_name = name_dict['current'] + try: + ids = db.sheets.find({"status": "public", "dateModified": {"$gt": timestamp}}).distinct("id") + except Exception as e: + print(e) + return str(e) + + succeeded = [] + failed = [] + + for id in ids: + did_succeed = index_sheet(curr_index_name, id) + if did_succeed: + succeeded += [id] + else: + failed += [id] + + return {"succeeded": {"num": len(succeeded), "ids": succeeded}, "failed": {"num": len(failed), "ids": failed}} + + +def index_public_sheets(index_name): + """ + Index all source sheets that are publicly listed. + """ + ids = db.sheets.find({"status": "public"}).distinct("id") + for id in ids: + index_sheet(index_name, id) + + +def index_public_notes(): + """ + Index all public notes. + + TODO + """ + pass + + +def clear_index(index_name): + """ + Delete the search index. + """ + try: + index_client.delete(index=index_name) + except Exception as e: + print("Error deleting Elasticsearch Index named %s" % index_name) + print(e) + + +def add_ref_to_index_queue(ref, version, lang): + """ + Adds a text to index queue to be indexed later. + """ + qu.IndexQueue({ + "ref": ref, + "lang": lang, + "version": version, + "type": "ref", + }).save() + + return True + + +def index_from_queue(): + """ + Index every ref/version/lang found in the index queue. + Delete queue records on success. + """ + index_name = get_new_and_current_index_names('text')['current'] + queue = db.index_queue.find() + for item in queue: + try: + TextIndexer.index_ref(index_name, Ref(item["ref"]), item["version"], item["lang"], False) + db.index_queue.remove(item) + except Exception as e: + logging.error("Error indexing from queue ({} / {} / {}) : {}".format(item["ref"], item["version"], item["lang"], e)) + + +def add_recent_to_queue(ndays): + """ + Look through the last ndays of the activitiy log, + add to the index queue any refs that had their text altered. 
+ """ + cutoff = datetime.now() - timedelta(days=ndays) + query = { + "date": {"$gt": cutoff}, + "rev_type": {"$in": ["add text", "edit text"]} + } + activity = db.history.find(query) + refs = set() + for a in activity: + refs.add((a["ref"], a["version"], a["language"])) + for ref in list(refs): + add_ref_to_index_queue(ref[0], ref[1], ref[2]) + + +def get_new_and_current_index_names(type, debug=False): + base_index_name_dict = { + 'text': SEARCH_INDEX_NAME_TEXT, + 'sheet': SEARCH_INDEX_NAME_SHEET, + } + index_name_a = "{}-a{}".format(base_index_name_dict[type], '-debug' if debug else '') + index_name_b = "{}-b{}".format(base_index_name_dict[type], '-debug' if debug else '') + alias_name = "{}{}".format(base_index_name_dict[type], '-debug' if debug else '') + aliases = index_client.get_alias() + try: + a_alias = aliases[index_name_a]['aliases'] + choose_a = alias_name not in a_alias + except KeyError: + choose_a = True + + if choose_a: + new_index_name = index_name_a + old_index_name = index_name_b + else: + new_index_name = index_name_b + old_index_name = index_name_a + return {"new": new_index_name, "current": old_index_name, "alias": alias_name} + + +def index_all(skip=0, debug=False): + """ + Fully create the search index from scratch. + """ + start = datetime.now() + index_all_of_type('text', skip=skip, debug=debug) + index_all_of_type('sheet', skip=skip, debug=debug) + end = datetime.now() + db.index_queue.delete_many({}) # index queue is now stale + print("Elapsed time: %s" % str(end-start)) + + +def index_all_of_type(type, skip=0, debug=False): + index_names_dict = get_new_and_current_index_names(type=type, debug=debug) + print('CREATING / DELETING {}'.format(index_names_dict['new'])) + print('CURRENT {}'.format(index_names_dict['current'])) + for i in range(10): + print('STARTING IN T-MINUS {}'.format(10 - i)) + pytime.sleep(1) + + index_all_of_type_by_index_name(type, index_names_dict['new'], skip, debug) + + try: + #index_client.put_settings(index=index_names_dict['current'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) + index_client.delete_alias(index=index_names_dict['current'], name=index_names_dict['alias']) + print("Successfully deleted alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) + except NotFoundError: + print("Failed to delete alias {} for index {}".format(index_names_dict['alias'], index_names_dict['current'])) + + clear_index(index_names_dict['alias']) # make sure there are no indexes with the alias_name + + #index_client.put_settings(index=index_names_dict['new'], body={"index": { "blocks": { "read_only_allow_delete": False }}}) + index_client.put_alias(index=index_names_dict['new'], name=index_names_dict['alias']) + + if index_names_dict['new'] != index_names_dict['current']: + clear_index(index_names_dict['current']) + + +def index_all_of_type_by_index_name(type, index_name, skip=0, debug=False): + if skip == 0: + create_index(index_name, type) + if type == 'text': + TextIndexer.clear_cache() + TextIndexer.index_all(index_name, debug=debug) + elif type == 'sheet': + index_public_sheets(index_name) \ No newline at end of file diff --git a/sefaria/urls.py b/sefaria/urls.py index 757fcef1aa..cc69beab0a 100644 --- a/sefaria/urls.py +++ b/sefaria/urls.py @@ -238,7 +238,9 @@ # Search API urlpatterns += [ url(r'^api/dummy-search$', reader_views.dummy_search_api), - url(r'^api/search-wrapper$', reader_views.search_wrapper_api), + url(r'^api/search-wrapper/es6$', reader_views.search_wrapper_api, {'es6_compat': 
True}), + url(r'^api/search-wrapper/es8$', reader_views.search_wrapper_api), + url(r'^api/search-wrapper$', reader_views.search_wrapper_api, {'es6_compat': True}), url(r'^api/search-path-filter/(?P.+)$', reader_views.search_path_filter), ] diff --git a/static/js/SearchPage.jsx b/static/js/SearchPage.jsx index ece9f8cf43..438b2bc3c9 100644 --- a/static/js/SearchPage.jsx +++ b/static/js/SearchPage.jsx @@ -46,9 +46,9 @@ class SearchPage extends Component { { this.props.query } - {this.state.totalResults ? + {this.state.totalResults?.getValue() > 0 ?
- {this.state.totalResults.addCommas()}  + {this.state.totalResults.asString()}  Results
: null } @@ -71,7 +71,7 @@ class SearchPage extends Component { {(Sefaria.multiPanel && !this.props.compare) || this.state.mobileFiltersOpen ?
- {this.state.totalResults ? + {this.state.totalResults?.getValue() > 0 ? { } +class SearchTotal { + constructor({value=0, relation="eq"} = {}) { + this._value = value; + this._relation = relation; + } + getValue = () => this._value; + add = (num) => this._value += num; + asString = () => `${this._value.addCommas()}${this._getRelationString()}`; + _getRelationString = () => this._relation === 'gte' ? '+' : ''; + combine = (other) => { + if (!(other instanceof SearchTotal)) { + throw new TypeError('Parameter must be an instance of SearchTotal.'); + } + const newValue = this.getValue() + other.getValue(); + let newRelation = this._relation; + if (other._relation === 'gte' || this._relation === 'gte') { + newRelation = 'gte'; + } + return new SearchTotal({value: newValue, relation: newRelation}); + }; +} + + +function createSearchTotal(total) { + /** + * this function ensures backwards compatibility between the way elasticsearch formats the total pre-v8 and post-v8 + */ + const totalObj = typeof(total) === 'number' ? {value: total} : {value: total.value, relation: total.relation}; + return new SearchTotal(totalObj) +} class SearchResultList extends Component { @@ -87,7 +117,7 @@ class SearchResultList extends Component { runningQueries: this._typeObjDefault(null), isQueryRunning: this._typeObjDefault(false), moreToLoad: this._typeObjDefault(true), - totals: this._typeObjDefault(0), + totals: this._typeObjDefault(new SearchTotal()), pagesLoaded: this._typeObjDefault(0), hits: this._typeObjDefault([]), error: false, @@ -104,7 +134,7 @@ class SearchResultList extends Component { //console.log("Loaded cached query for") //console.log(args); this.state.hits[t] = this.state.hits[t].concat(cachedQuery.hits.hits); - this.state.totals[t] = cachedQuery.hits.total; + this.state.totals[t] = createSearchTotal(cachedQuery.hits.total); this.state.pagesLoaded[t] += 1; args.start = this.state.pagesLoaded[t] * this.querySize[t]; if (t === "text") { @@ -127,7 +157,7 @@ class SearchResultList extends Component { componentWillReceiveProps(newProps) { if(this.props.query !== newProps.query) { this.setState({ - totals: this._typeObjDefault(0), + totals: this._typeObjDefault(new SearchTotal()), hits: this._typeObjDefault([]), moreToLoad: this._typeObjDefault(true), }); @@ -245,7 +275,7 @@ class SearchResultList extends Component { this.setState(this.state); } totalResults() { - return this.types.reduce((accum, type) => (this.state.totals[type] + accum), 0); + return this.types.reduce((accum, type) => (this.state.totals[type].combine(accum)), new SearchTotal()); } updateTotalResults() { this.props.updateTotalResults(this.totalResults()); @@ -324,11 +354,12 @@ class SearchResultList extends Component { args.success = data => { this.updateRunningQuery(type, null); if (this.state.pagesLoaded[type] === 0) { // Skip if pages have already been loaded from cache, but let aggregation processing below occur + const currTotal = createSearchTotal(data.hits.total); let state = { hits: extend(this.state.hits, {[type]: data.hits.hits}), - totals: extend(this.state.totals, {[type]: data.hits.total}), + totals: extend(this.state.totals, {[type]: currTotal}), pagesLoaded: extend(this.state.pagesLoaded, {[type]: 1}), - moreToLoad: extend(this.state.moreToLoad, {[type]: data.hits.total > this.querySize[type]}) + moreToLoad: extend(this.state.moreToLoad, {[type]: currTotal.getValue() > this.querySize[type]}) }; this.setState(state, () => { this.updateTotalResults(); @@ -336,7 +367,7 @@ class SearchResultList extends Component { }); 
const filter_label = (request_applied && request_applied.length > 0) ? (' - ' + request_applied.join('|')) : ''; const query_label = props.query + filter_label; - Sefaria.track.event("Search", `${this.props.searchInBook? "SidebarSearch ": ""}Query: ${type}`, query_label, data.hits.total); + Sefaria.track.event("Search", `${this.props.searchInBook? "SidebarSearch ": ""}Query: ${type}`, query_label, createSearchTotal(data.hits.total).getValue()); } if (data.aggregations) { @@ -395,7 +426,7 @@ class SearchResultList extends Component { this.state.hits[type] = nextHits; this.state.pagesLoaded[type] += 1; - if (this.state.pagesLoaded[type] * this.querySize[type] >= this.state.totals[type] ) { + if (this.state.pagesLoaded[type] * this.querySize[type] >= this.state.totals[type].getValue() ) { this.state.moreToLoad[type] = false; } @@ -522,14 +553,13 @@ const SearchTabs = ({clickTextButton, clickSheetButton, textTotal, sheetTotal, c const SearchTab = ({label, total, onClick, active}) => { - total = total.addCommas() const classes = classNames({"search-dropdown-button": 1, active}); return (
{e.charCode === 13 ? onClick(e) : null}} role="button" tabIndex="0">
{label}  - {`(${total})`} + {`(${total.asString()})`}
);
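Not part of the diff above: a minimal Python sketch of how the new search wiring is intended to fit together, assuming SEARCH_HOST, ELASTIC_USERNAME and ELASTIC_PASSWORD are supplied by the configmap and secret introduced in this change (the localhost fallback and the sample response below are illustrative only).

import os
from elasticsearch import Elasticsearch

# Mirrors the SEARCH_URL construction in the local-settings template: embed basic-auth
# credentials in the URL only when a username is configured.
auth_str = f'{os.getenv("ELASTIC_USERNAME")}:{os.getenv("ELASTIC_PASSWORD")}@' if os.getenv("ELASTIC_USERNAME") else ''
SEARCH_URL = f'http://{auth_str}{os.getenv("SEARCH_HOST", "localhost")}:9200'

# The ES8 client is built from that single URL (see sefaria.helper.search.get_elasticsearch_client).
es_client = Elasticsearch(SEARCH_URL)

# Elasticsearch 8 reports hits.total as an object; the es6_compat path in search_wrapper_api
# flattens it back to a bare integer so pre-existing clients keep working.
sample_response = {"hits": {"total": {"value": 103, "relation": "eq"}}}
if isinstance(sample_response["hits"]["total"], dict):
    sample_response["hits"]["total"] = sample_response["hits"]["total"]["value"]
print(sample_response["hits"]["total"])  # 103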