TM416 alfresco reindexing job (#100)
* Update reindexing-job.yaml

* 🔧 make all config dynamic, create taskfile and add index task

* Update Taskfile.yml

* new logic to create new jobs as they complete

* Update Taskfile.yml

* add continue feature

* Update Taskfile.yml

* patch pvc

* test new index name

* Update Taskfile.yml

* reindexing optimisations

* version mapping

* filestore patch

* updates

* Update data-refresh.yaml

* Update data-refresh.yaml

* Update job.yaml

* Update data-refresh.yaml

* Update job.yaml

* pooling

* indexing updates

* optimisations

* amq port forwarding bug

* feat: new script to stat amq queues

* feat: count any amq stat

* updates

* new migration job

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* fix

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* updates for indexing

* Update Taskfile.yml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* Update migrate-s3.yaml

* add helm job

* Update migrate-s3.yaml

* fix

* modify jobs
georgepstaylor authored Nov 8, 2024
1 parent 54e4e14 commit d2b3b15
Showing 53 changed files with 12,930 additions and 122 deletions.
20 changes: 14 additions & 6 deletions .github/workflows/data-refresh.yaml
@@ -42,7 +42,11 @@ jobs:
run: |
kubectl scale deployment alfresco-content-services-alfresco-cs-repository --replicas=0
kubectl scale deployment alfresco-content-services-alfresco-cs-share --replicas=0
- kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-liveindexing --replicas=0
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-content --replicas=0
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-metadata --replicas=0
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-path --replicas=0
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-mediation --replicas=0
kubectl scale deployment alfresco-content-services-alfresco-router --replicas=0
refresh-db:
name: Refresh DB
@@ -53,7 +57,7 @@
steps:
- uses: actions/[email protected]

- - name: Install Kubernetes
+ - name: Install kubectl
uses: azure/[email protected]
with:
version: 'v1.26.0' # default is latest stable
@@ -95,7 +99,7 @@ jobs:
steps:
- uses: actions/[email protected]

- - name: Install Kubernetes
+ - name: Install kubectl
uses: azure/[email protected]
with:
version: 'v1.26.0' # default is latest stable
@@ -130,12 +134,12 @@ jobs:
SRC_BUCKET=$(kubectl get secrets s3-bucket-output -o jsonpath='{.data.BUCKET_NAME}' | base64 -d)
- DIRS=$(kubectl exec $SERVICE_POD_NAME -- aws s3 ls $SRC_BUCKET | grep -v contentstore | awk -F ' ' '{print $2}' | tr -d '/' | tr '\n' ',')
+ DIRS=$(kubectl exec $SERVICE_POD_NAME -- aws s3 ls $SRC_BUCKET | grep -v contentstore | awk '/\/$/ && NF {print $2}' | tr -d '/' | tr '\n' ',' | sed 's/,$/\n/')
helm install refresh-s3 . \
--set sourceEnvironment=${{ github.event.inputs.source_env }} \
--set destinationEnvironment=${{ github.event.inputs.destination_env }} \
--set directories="{${DIRS:0:-1}}"
--set directories="{${DIRS}}"
kubectl wait jobs -l name-prefix=refresh-s3 --for=condition=complete --timeout 10h
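The reworked `DIRS` pipeline is easier to verify against sample output. A minimal sketch, assuming a hypothetical listing (the real step also filters out `contentstore` with `grep -v`):

```bash
# Hypothetical `aws s3 ls` output: directory prefixes print as "PRE <name>/",
# while plain objects end without a slash, so `awk '/\/$/ && NF {print $2}'`
# keeps only the prefix rows. The old `awk -F ' ' '{print $2}'` also emitted
# a field from object rows and left a trailing comma (hence the old ${DIRS:0:-1}).
printf '%s\n' '   PRE alfresco/' '   PRE solr-backup/' '2024-01-01 12:00:00 1024 stray.txt' \
  | awk '/\/$/ && NF {print $2}' \
  | tr -d '/' | tr '\n' ',' | sed 's/,$/\n/'
# -> alfresco,solr-backup
```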
@@ -197,4 +201,8 @@ jobs:
kubectl scale deployment alfresco-content-services-alfresco-cs-repository --replicas=$(echo $HELM_VALUES | jq '.repository.replicaCount')
kubectl scale deployment alfresco-content-services-alfresco-cs-share --replicas=$(echo $HELM_VALUES | jq '.share.replicaCount')
- kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-liveindexing --replicas=1
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-content --replicas=1
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-metadata --replicas=1
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-path --replicas=1
+ kubectl scale deployment alfresco-content-services-alfresco-search-enterprise-mediation --replicas=1
kubectl scale deployment alfresco-content-services-alfresco-router --replicas=1
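The scale-down and scale-up steps touch the same deployments; a helper function would keep the two steps symmetric. An illustrative sketch only — not part of the workflow — using deployment names from the diff above:

```bash
# Sketch: scale the search-enterprise deployments and the router as a group.
scale_alfresco() {
  local replicas="$1"
  for d in \
    alfresco-content-services-alfresco-search-enterprise-content \
    alfresco-content-services-alfresco-search-enterprise-metadata \
    alfresco-content-services-alfresco-search-enterprise-path \
    alfresco-content-services-alfresco-search-enterprise-mediation \
    alfresco-content-services-alfresco-router; do
    kubectl scale deployment "$d" --replicas="$replicas"
  done
}

scale_alfresco 0   # before the refresh
scale_alfresco 1   # after the refresh
```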
37 changes: 23 additions & 14 deletions .github/workflows/migrate-s3.yaml
@@ -19,9 +19,11 @@ on:
permissions:
contents: read

+ run-name: Migrate S3 - ${{ github.event.inputs.environment }}

jobs:
- refresh-s3:
- name: Refresh S3
+ migrate-s3:
+ name: Migrate S3
runs-on: ubuntu-22.04
environment:
name: ${{ github.event.inputs.environment }}
@@ -51,7 +53,7 @@
KUBE_CLUSTER: ${{ secrets.KUBE_CLUSTER }}

- name: Uninstall S3 Refresh chart
- run: helm uninstall refresh-s3 --ignore-not-found
+ run: helm uninstall migrate-s3 --ignore-not-found

- name: S3 migration
working-directory: jobs/migrate-s3
@@ -61,24 +63,31 @@
SERVICE_POD_DEPLOYMENT=$(kubectl get deployment -l app=service-pod -o jsonpath="{.items[0].metadata.name}")
SERVICE_POD_NAME=$(kubectl get pod -l app=$SERVICE_POD_DEPLOYMENT -o jsonpath="{.items[0].metadata.name}")
- SRC_BUCKET=${{ vars.MIGRATION_SRC_BUCKET }}
+ SRC_BUCKET="${{ vars.MIGRATION_SRC_BUCKET }}"
+ prefixes=$(kubectl exec $SERVICE_POD_NAME -- aws s3api list-objects-v2 --bucket $SRC_BUCKET --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text)
+ # remove all spaces and put one comma between prefixes
+ cleaned_prefixes=$(echo $prefixes | tr -s '[:space:]' ',' | sed 's/[,/]*$//')
DIRS=""
- for prefix in $(aws s3api list-objects-v2 --bucket $SRC_BUCKET --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text); do
- DIR=$(aws s3api list-objects-v2 --bucket $SRC_BUCKET --prefix "$prefix" --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text)
- #append to DIRS comma separated
- DIRS+="${DIR},"
+ IFS=','
+ for prefix in $cleaned_prefixes; do
+ DIR=$(kubectl exec $SERVICE_POD_NAME -- aws s3api list-objects-v2 --bucket $SRC_BUCKET --prefix "$prefix" --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text)
+ if [ -n "$DIR" ]; then
+ DIR=$(echo $DIR | tr -s '[:space:]' ',' | sed 's/[,/]*$//')
+ DIRS="${DIRS}${DIR},"
+ fi
done
DIRS=${DIRS%,}
echo "DIRS: $DIRS"
- # helm install migrate-s3 . \
- # --set environment=${{ github.event.inputs.environment }} \
- # --set directories="{${DIRS}}"
+ helm install migrate-s3 . \
+ --set environment=${{ github.event.inputs.environment }} \
+ --set srcBucket=$SRC_BUCKET \
+ --set "directories={${DIRS}}"
- # kubectl wait jobs -l name-prefix=migrate-s3 --for=condition=complete --timeout 10h
+ kubectl wait jobs -l name-prefix=migrate-s3 --for=condition=complete --timeout 10h
- name: Uninstall S3 Refresh chart
- run: helm uninstall refresh-s3 --ignore-not-found
+ run: helm uninstall migrate-s3 --ignore-not-found
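The rewritten loop walks the bucket two levels deep: top-level prefixes first, then each one's children. The same walk can be sketched with the aws CLI alone — the bucket name and layout are hypothetical, and the `!= "None"` guard covers `--output text` printing `None` for childless prefixes:

```bash
SRC_BUCKET=example-src-bucket   # hypothetical

DIRS=""
for prefix in $(aws s3api list-objects-v2 --bucket "$SRC_BUCKET" \
    --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text); do
  # Second-level prefixes under e.g. "env1/": "env1/contentstore/ env1/contentstore.deleted/"
  dir=$(aws s3api list-objects-v2 --bucket "$SRC_BUCKET" --prefix "$prefix" \
    --delimiter '/' --query 'CommonPrefixes[*].Prefix' --output text)
  if [ -n "$dir" ] && [ "$dir" != "None" ]; then
    DIRS="${DIRS}$(echo "$dir" | tr -s '[:space:]' ',' | sed 's/[,/]*$//'),"
  fi
done
echo "${DIRS%,}"   # comma-separated second-level prefixes
```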
3 changes: 3 additions & 0 deletions .gitignore
@@ -6,3 +6,6 @@ kustomize/base/charts/
kustomize/**/charts/
kustomize/**/output.yaml
kustomize/base/resources.yaml
+ jobs/reindex/values-reindex-*.yaml
+ completed.txt
+ ids.json
230 changes: 229 additions & 1 deletion Taskfile.yml
@@ -87,7 +87,7 @@ tasks:
--set database.url={{.RDS_JDBC_URL}} \
--set global.elasticsearch.host={{.OPENSEARCH_HOST}} \
--set alfresco-search-enterprise.searchIndex.host={{.OPENSEARCH_HOST}} \
- --wait --timeout=20m \
+ --wait --timeout=60m \
--post-renderer ../kustomizer.sh --post-renderer-args "{{.HELM_POST_RENDERER_ARGS}}" \
{{.DEBUG_FLAG}} {{.ATOMIC_FLAG}}
echo " "
@@ -103,3 +103,231 @@
cmds:
- yq '.metadata.annotations."nginx.ingress.kubernetes.io/whitelist-source-range" = "placeholder"' -i patch-ingress-repository.yaml
- yq '.metadata.annotations."nginx.ingress.kubernetes.io/whitelist-source-range" = "placeholder"' -i patch-ingress-share.yaml

simple_reindex:
cmds:
- |
helm install "reindex-default-$(openssl rand -hex 4)" ./jobs/reindex --set "global.elasticsearch.host={{.OPENSEARCH_HOST}}" --set "fromId=27451380" --set "toId=27908429" --namespace {{.NAMESPACE}}
reindex_list:
cmds:
- |
# Set your batch size (you can adjust this number as needed)
BATCH_SIZE=40
# Path to your JSON file containing the list of IDs
JSON_FILE="ids.json"
RANDOM_ID=$(openssl rand -hex 4)
# Function to create Helm job for a given batch of IDs
create_helm_job() {
# Concatenate the batch of IDs into a comma-separated string
# $1, $2, ... represent individual IDs
local idList=""
for id in "$@"; do
if [ -z "$idList" ]; then
idList="$id"
else
idList="$idList,$id"
fi
done
# Debugging: print the batch being passed
echo "Creating job for IDs: $idList" # This will show only the batch, not the whole list
# Run Helm command to create the job with the current batch of IDs
helm upgrade --install "reindex-list-${RANDOM_ID}" \
--set "idList={${idList}}" \
--set "global.elasticsearch.host={{.OPENSEARCH_HOST}}" \
--set "global.namespace={{.NAMESPACE}}" \
./jobs/reindex-list \
--namespace "{{.NAMESPACE}}"
echo "Waiting for the jobs to complete..."
kubectl wait --for=condition=complete job --namespace {{.NAMESPACE}} -l "reindex-type=list" --timeout=10h || echo "Timed out waiting for jobs"
echo "Jobs completed!"
}
# Parse the list of IDs from the JSON file using jq
# The IDs will be saved as a space-separated list into the 'ids' variable
ids=$(jq -r '.list[]' "$JSON_FILE")
# Initialize the index for processing
index=0
# Loop over the IDs and create jobs in batches
for id in $ids; do
# Add the current ID to the current batch
batch[$index]="$id"
index=$((index + 1))
# If the batch reaches the specified batch size, process it
if [ "$index" -ge "$BATCH_SIZE" ]; then
# Create the Helm job for the current batch
create_helm_job "${batch[@]}"
# Reset the batch for the next set of IDs
index=0
unset batch
# kubectl wait --for=condition=complete job --namespace {{.NAMESPACE}} -l "reindex-type=list" --timeout=10h || echo "Jobs completed!"
helm uninstall "reindex-list-${RANDOM_ID}" --namespace {{.NAMESPACE}}
fi
done
# If there are any remaining IDs (less than BATCH_SIZE), create the last job
if [ "$index" -gt 0 ]; then
create_helm_job "${batch[@]}"
fi
echo "All jobs have been created!"
echo "Cleaning up..."
helm uninstall "reindex-list-${RANDOM_ID}" --namespace {{.NAMESPACE}}
echo "Cleanup complete!"
simple_reindex_date:
cmds:
- |
helm install "reindex-default-$(openssl rand -hex 4)" ./jobs/reindex_date --set "global.elasticsearch.host={{.OPENSEARCH_HOST}}" --set "fromTime=201707060001" --set "toTime=201707070001" --namespace {{.NAMESPACE}}
simple_reindex_date_metadata-only:
cmds:
- |
helm install "reindex-default-date-meta" ./jobs/reindex_date --set "global.elasticsearch.host={{.OPENSEARCH_HOST}}" --set "fromTime=202402010100" --set "toTime=202402100100" --set "content=false" --namespace {{.NAMESPACE}}
batch_reindex:
vars:
START: "{{.START | default 0}}"
END: "{{.END | default 10050}}"
CHUNK: "{{.CHUNK | default 1000}}"
CONCURRENCY: "{{.CONCURRENCY | default 5}}"
ARRAY:
sh: |
arr=$(
for i in $(seq -f "%.0f" {{.START}} {{.CHUNK}} {{.END}}); do
new_start=$i
end=$((i + {{.CHUNK}} - 1))
if [ $end -gt {{.END}} ]; then
end={{.END}}
fi
cat << EOF
${new_start}-${end}
EOF
done
)
echo "$arr"
cmds:
- echo "Starting batch reindex from {{.START}} to {{.END}} in chunks of {{.CHUNK}}"
- task: run_reindex_batches
vars:
OPENSEARCH_HOST: "{{.OPENSEARCH_HOST}}"
NAMESPACE: "{{.NAMESPACE}}"
ARRAY: "{{.ARRAY}}"
CONCURRENCY: "{{.CONCURRENCY}}"
- task: reindex_helm_cleanup
vars:
NAMESPACE: "{{.NAMESPACE}}"

run_reindex_batches:
cmds:
- |
pending="{{.ARRAY}}"
# count the number of items
total_items=$(echo "$pending" | wc -l)
echo "Total items: $total_items"
previous_completed=$(cat completed.txt 2>/dev/null || true)
if [ -z "$previous_completed" ]; then
echo "No previous completed items"
else
echo "Count of previous completed items: $(echo "$previous_completed" | wc -l)"
fi
# remove the completed items from the pending list
for item in $previous_completed; do
pending=$(echo "$pending" | grep -v "$item")
done
total_items=$(echo "$pending" | wc -l)
echo "Total items: $total_items"
started=()
completed=()
# while pending is not empty
while [ -n "$pending" ]; do
# echo "Pending: $pending"
# Get the first item
item=$(echo "$pending" | head -n 1)
echo "Processing item: $item"
# Get the start and end values
start=$(echo "$item" | cut -d '-' -f 1)
end=$(echo "$item" | cut -d '-' -f 2)
echo "Start: $start, End: $end"
# check the number of jobs running
running_jobs=$(kubectl get jobs --namespace {{.NAMESPACE}} -l "reindex-job" -o json | jq '.items | length')
echo "Running jobs: $running_jobs"
if [ $running_jobs -ge {{.CONCURRENCY}} ]; then
echo "No available slots, waiting for 5 seconds"
sleep 5
else
echo "Found at least 1 available slot!"
echo "Available slots left: $(({{.CONCURRENCY}} - $running_jobs))"
# run the job
echo "helm install reindex-${start}-${end} ./jobs/reindex --set global.elasticsearch.host={{.OPENSEARCH_HOST}} --set fromId=${start} --set toId=${end} --namespace {{.NAMESPACE}}"
helm install "reindex-${start}-${end}" ./jobs/reindex --set "global.elasticsearch.host={{.OPENSEARCH_HOST}}" --set "fromId=${start}" --set "toId=${end}" --namespace {{.NAMESPACE}}
# Remove the item from the list
pending=$(echo "$pending" | tail -n +2)
fi
# check for completed jobs
completed_jobs=$(kubectl get jobs --namespace {{.NAMESPACE}} -l "reindex-job" -o json | jq -r '.items[] | select(.status.succeeded == 1) | .metadata.labels["reindex-job"]')
if [ -z "$completed_jobs" ]; then
echo "No completed jobs"
else
echo "Completed jobs: $completed_jobs"
echo "$completed_jobs" | while IFS= read -r job; do
echo "Processing completed job: $job"
completed+=("$job")
echo "$job" >> completed.txt
echo "Job $job completed"
helm uninstall "reindex-$job" --namespace {{.NAMESPACE}}
done
fi
done
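Progress can be checked out-of-band while the loop runs; a sketch using the same `reindex-job` label selector (the namespace is a placeholder):

```bash
kubectl get jobs -n alfresco -l reindex-job -o json | jq '{
  running:   [.items[] | select(.status.succeeded != 1)] | length,
  succeeded: [.items[] | select(.status.succeeded == 1)] | length
}'
```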
reindex_helm_cleanup:
cmds:
- |
# wait for all jobs to complete
kubectl wait --for=condition=complete jobs --namespace {{.NAMESPACE}} -l "reindex-job" --timeout=4h
completed_jobs=$(kubectl get jobs --namespace {{.NAMESPACE}} -l "reindex-job" -o json | jq -r '.items[] | select(.status.succeeded == 1) | .metadata.labels["reindex-job"]')
if [ -z "$completed_jobs" ]; then
echo "No completed jobs"
else
echo "Completed jobs: $completed_jobs"
echo "$completed_jobs" | while IFS= read -r job; do
echo "Processing completed job: $job"
completed+=("$job")
echo "$job" >> completed.txt
echo "Job $job completed"
helm uninstall "reindex-$job" --namespace {{.NAMESPACE}}
done
fi
helm_bulk_uninstall:
vars:
PREFIX: "{{.PREFIX}}"
cmds:
- |
helm list -n {{.NAMESPACE}} -q | grep "^{{.PREFIX}}" | while IFS= read -r release; do
echo "Uninstalling release: $release"
helm uninstall "$release" -n "$NAMESPACE"
done
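A hypothetical invocation, e.g. to sweep up releases left behind by an interrupted batch run:

```bash
task helm_bulk_uninstall PREFIX=reindex- NAMESPACE=alfresco
```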
1 change: 1 addition & 0 deletions jobs/migrate-db/values_preprod.yaml
@@ -0,0 +1 @@
+ environment: preprod
8 changes: 4 additions & 4 deletions jobs/migrate-s3/templates/job.yaml
@@ -14,12 +14,12 @@ data:
aws s3 sync s3://$SRC_BUCKET/$DIR s3://$DST_BUCKET/$DIR --delete --only-show-errors --region eu-west-2
echo sync of $DIR directory completed
- {{- range .Values.dirs }}
+ {{- range .Values.directories }}
---
apiVersion: batch/v1
kind: Job
metadata:
- name: migrate-s3-{{ . | toString | replace "/" "-" }}
+ name: migrate-s3-{{ . | toString | replace "/" "-" | replace "." "" | trimSuffix "-" }}
spec:
template:
spec:
@@ -29,8 +29,8 @@ spec:
imagePullPolicy: IfNotPresent
resources:
limits:
- cpu: 2
- memory: 4Gi
+ cpu: 1
+ memory: 2Gi
command:
- /bin/entrypoint.sh
env:
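The added `replace`/`trimSuffix` calls keep the generated Job names DNS-safe; traced with a hypothetical directory value:

```bash
# Input:              "env1/contentstore.deleted/"
# replace "/" "-"  -> "env1-contentstore.deleted-"
# replace "." ""   -> "env1-contentstoredeleted-"
# trimSuffix "-"   -> "env1-contentstoredeleted"
# Rendered Job name:  migrate-s3-env1-contentstoredeleted
```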
(The remaining changed files are not shown.)