
Commit

chore(search): duplicate elasticsearch reindex cronjob and associated files while migrating to elasticsearch 8
nsantacruz committed Sep 20, 2023
1 parent 56063c2 commit 841978e
Showing 3 changed files with 970 additions and 0 deletions.
@@ -0,0 +1,77 @@
{{- if .Values.cronJobs.reindexElasticSearch.enabled }}
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: {{ .Values.deployEnv }}-reindex-elastic-search-es6
  labels:
    {{- include "sefaria.labels" . | nindent 4 }}
spec:
  schedule: "20 13 * * 0"
  jobTemplate:
    spec:
      backoffLimit: 1
      template:
        spec:
          affinity:
            podAntiAffinity:
              requiredDuringSchedulingIgnoredDuringExecution:
                - labelSelector:
                    matchExpressions:
                      - key: app
                        operator: In
                        values:
                          - mongo
                  topologyKey: kubernetes.io/hostname
          containers:
            - name: reindex-elastic-search-es6
              image: "{{ .Values.web.containerImage.imageRegistry }}:{{ .Values.web.containerImage.tag }}"
              resources:
                limits:
                  memory: 9Gi
                requests:
                  memory: 7Gi
              env:
                - name: REDIS_HOST
                  value: "redis-{{ .Values.deployEnv }}"
                - name: NODEJS_HOST
                  value: "node-{{ .Values.deployEnv }}-{{ .Release.Revision }}"
                - name: VARNISH_HOST
                  value: "varnish-{{ .Values.deployEnv }}-{{ .Release.Revision }}"
                - name: SLACK_URL
                  valueFrom:
                    secretKeyRef:
                      name: {{ template "sefaria.secrets.slackWebhook" . }}
                      key: slack-webhook
              envFrom:
                - secretRef:
                    name: {{ template "sefaria.secrets.elasticAdmin" . }}
                - secretRef:
                    name: {{ .Values.secrets.localSettings.ref }}
                  optional: true
                - configMapRef:
                    name: local-settings-{{ .Values.deployEnv }}
                - secretRef:
                    name: local-settings-secrets-{{ .Values.deployEnv }}
                  optional: true
              volumeMounts:
                - mountPath: /app/sefaria/local_settings.py
                  name: local-settings
                  subPath: local_settings.py
                  readOnly: true
              command: ["bash"]
              args: [
                "-c",
                "mkdir -p /log && touch /log/sefaria_book_errors.log && pip install numpy && /app/run /app/scripts/reindex_elasticsearch_cronjob_ES6.py"
              ]
          restartPolicy: Never
          volumes:
            - name: local-settings
              configMap:
                name: local-settings-file-{{ .Values.deployEnv }}
                items:
                  - key: local_settings.py
                    path: local_settings.py
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 2
{{- end }}
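
For reference, the template above only renders when the reindex cronjob is enabled in the chart's values. A minimal values sketch, limited to the keys this template actually references; the concrete values shown are hypothetical placeholders, not part of this commit:

deployEnv: prod                                   # prefixes the CronJob name and the redis/node/varnish hostnames
cronJobs:
  reindexElasticSearch:
    enabled: true                                 # gates rendering via the {{- if }} / {{- end }} wrapper
web:
  containerImage:
    imageRegistry: gcr.io/example/sefaria-web     # hypothetical registry/repository
    tag: latest                                   # hypothetical image tag
secrets:
  localSettings:
    ref: local-settings-secret                    # hypothetical name of the optional Secret pulled in via envFrom

The slackWebhook and elasticAdmin secret names come from named templates ({{ template ... }}) rather than values, so they are not part of this sketch.
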
49 changes: 49 additions & 0 deletions scripts/reindex_elasticsearch_cronjob_ES6.py
@@ -0,0 +1,49 @@
"""
This file is meant to be temporary while we are migrating to elasticsearch 8
"""
from datetime import datetime
import requests
import traceback
import os
import django
django.setup()
from sefaria.model import *
from sefaria.search_ES6 import index_all
from sefaria.local_settings import SEFARIA_BOT_API_KEY
from sefaria.pagesheetrank import update_pagesheetrank

"""
Source sheets added after last_sheet_timestamp will be missing from the index process. We want to manually index all
source sheets created after this. Depending on the database being used to index, the timestamp will be different. If
running against a production database, last_sheet_timestamp will be the time this script began running. Otherwise, this
value will need to be set to the time at which the last mongo dump was created (assuming the database is using the most
up-to-date mongo dump).
"""
# last_sheet_timestamp = datetime.fromtimestamp(os.path.getmtime("/var/data/sefaria_public/dump/sefaria")).isoformat()
try:
    last_sheet_timestamp = datetime.now().isoformat()
    update_pagesheetrank()
    index_all()
    r = requests.post("https://www.sefaria.org/admin/index-sheets-by-timestamp", data={"timestamp": last_sheet_timestamp, "apikey": SEFARIA_BOT_API_KEY})
    if "error" in r.text:
        raise Exception("Error when calling admin/index-sheets-by-timestamp API: " + r.text)
    else:
        print("SUCCESS!", r.text)
except Exception as e:
    tb_str = traceback.format_exc()
    print("Caught exception")
    post_object = {
        "icon_emoji": ":facepalm:",
        "username": "Reindex ElasticSearch",
        "channel": "#engineering-discuss",
        "attachments": [
            {
                "fallback": tb_str,
                "color": "#a30200",
                "pretext": "Cronjob Error",
                "text": tb_str
            }
        ]
    }
    requests.post(os.environ['SLACK_URL'], json=post_object)
    raise e