Merge pull request #392 from arXiv/ntai/arxivce-608-missing-file

[ARXIVCD-608] missing file to log in JSON
arXiv · Oct 4, 2023 · f4d3c3b · f4d3c3b
2 parents 30ceb00 + 5b8818e
commit f4d3c3b
Show file tree

Hide file tree

Showing 22 changed files with 27,164 additions and 95 deletions.
diff --git a/script/sync_prod_to_gcp/.gitignore b/script/sync_prod_to_gcp/.gitignore
@@ -0,0 +1,4 @@
+sync.venv
+test/test-output/
+test/arxiv-development_arxiv-stanza.json
+test/stanza.db
diff --git a/script/sync_prod_to_gcp/Makefile b/script/sync_prod_to_gcp/Makefile
@@ -0,0 +1,33 @@
+# Command-style targets are declared phony so that a same-named file or
+# directory (e.g. the test/ directory) can never make them look up to date.
+# `stanza` is kept from the original list even though no rule by that name
+# is visible in this Makefile.
+.PHONY: default test stanza stanza.local stanza.gcp.dev
+# Recipes rely on bash-only [[ ... ]] tests.
+SHELL := /bin/bash
+# Target of the ${APP_CRED} rule, which runs `gcloud auth application-default login`.
+APP_CRED := ${HOME}/.config/gcloud/application_default_credentials.json
+
+# Default goal: create the virtualenv and install the Python requirements.
+default: sync.venv/bin/uwsgi
+
+# Create the virtualenv with the first usable python3.8, preferring the
+# rh-python38 interpreter over /usr/bin/python3.8.
+# The previous `[[ -d ]] || [[ -x ]] && cmd` form parsed as `(A || B) && cmd`,
+# so an already-created venv was rebuilt with /usr/bin/python3.8, and the `-`
+# prefixes let the rule "succeed" even when no venv was created at all.
+sync.venv:
+	for py in /opt/rh/rh-python38/root/bin/python3.8 /usr/bin/python3.8; do \
+	    if [[ -x "$$py" ]] && "$$py" -m venv $@; then exit 0; fi; \
+	done; \
+	echo "sync.venv: no usable python3.8 found (tried rh-python38 and /usr/bin)" >&2; \
+	exit 1
+
+# Upgrade pip, then install requirements.txt, inside the virtualenv.
+# Each recipe line runs in its own shell, so the venv is activated on both.
+# NOTE(review): the target file is presumably produced by a uwsgi entry in
+# requirements.txt - confirm.
+sync.venv/bin/uwsgi: sync.venv
+	source $</bin/activate && pip install --upgrade pip
+	source $</bin/activate && pip install -r requirements.txt
+
+# Run the sync test module inside the virtualenv.
+# Depends on sync.venv/bin/uwsgi rather than the bare sync.venv so that a
+# fresh checkout gets requirements.txt installed before the tests run; the
+# JSON prerequisite is fetched from 1Password by its own rule.
+test: sync.venv/bin/uwsgi test/arxiv-development_sync-test.json
+	. sync.venv/bin/activate && python3 -m test.test_sync_prod_to_gcp
+
+# Run stanza with the local test config, after removing any previous
+# test/stanza.db so the run starts from a fresh database file.
+stanza.local:
+	-$(RM) test/stanza.db
+	stanza \
+	    --config stanza/config.test-local.yaml \
+	    --database test/stanza.db \
+	    --plugin_dir stanza/plugins
+
+# Run stanza with the test-to-GCP config, after removing any previous
+# test/stanza.db. The JSON prerequisite is fetched from 1Password by its
+# own rule.
+stanza.gcp.dev: test/arxiv-development_arxiv-stanza.json
+	-$(RM) test/stanza.db
+	stanza \
+	    --config stanza/config.test-to-gcp.yaml \
+	    --database test/stanza.db \
+	    --plugin_dir stanza/plugins
+
+# Fetch this JSON secret from 1Password with the `op` CLI.
+# The output goes to a temp file first: a bare `> $@` leaves an empty target
+# behind when `op read` fails, and make would then treat it as up to date.
+test/arxiv-development_arxiv-stanza.json:
+	op read "op://hs3xn7ldhg3pgrql5j524rgpee/grvumhkt4kc47f53t7qjntpg3q/j3msyqb2pa5wec7hbsr3cirnky" > $@.tmp && mv -f $@.tmp $@ || { $(RM) $@.tmp; exit 1; }
+
+# Fetch the sync-test JSON key from 1Password with the `op` CLI.
+# The output goes to a temp file first: a bare `> $@` leaves an empty target
+# behind when `op read` fails, and make would then treat it as up to date.
+test/arxiv-development_sync-test.json:
+	op read "op://hs3xn7ldhg3pgrql5j524rgpee/w2wtsf5v7kahbngr64m43mciou/qrdfsd5gbnatjpv7zof6vwca4q" > $@.tmp && mv -f $@.tmp $@ || { $(RM) $@.tmp; exit 1; }
+
+# Interactive gcloud login; the rule's target is the application-default
+# credentials file named by APP_CRED.
+$(APP_CRED):
+	gcloud auth application-default login
diff --git a/script/sync_prod_to_gcp/README.md b/script/sync_prod_to_gcp/README.md
@@ -2,14 +2,102 @@
 
 This is a script to sync the PDFs from a publish cycle to a GS Bucket.
 
-# Usage
+# Synopsis
 
     cd sync_prod_to_gcp
     python --version
     # Python 3.8.0
     echo $GOOGLE_APPLICATION_CREDENTIALS
     # /users/e-prints/arxiv-production-1234.json or your own credentials
-    python -venv venv
-    pip install -r requirements.txt
+    # make will set up `sync.venv` the virtual env for sync to run.
+    make
+    #
+    . sync.venv/bin/activate
     python sync_published_to_gcp.py /data/new/logs/publish_221101.log
 
+# Development/Testing
+
+## test_json_log
+
+This is a quick test to make sure the JSON logging is working by running the dry run.
+
+## test_sync
+
+This is a test that uploads the test files and makes sure the machinery is correct, for both the uploading and
+the error reporting.
+To force the sync to upload the file that exists in the bucket, it sets "RELOADS" so the load ignores that 
+the item exists in the bucket.
+
+### Ingredients
+
+* GCP Bucket
+* GCP service account
+* A known storage object that is locked
+
+## GCP
+
+In arxiv-development, a bucket is created for the test. "arxiv-sync-test-01" is the name.
+
+To access the bucket, a service account "sync-test-admin" is created. The account has the storage read/write
+permission.
+
+    # Create the test admin role that has the storage I/O.
+    gcloud beta iam roles create sync_test_admin --project=arxiv-development --file=test/gcp-sync-test-role.json
+
+    # Create the test admin account
+    gcloud iam service-accounts create sync-test-admin --project=arxiv-development --display-name="Sync Test Admin"
+
+    # Give it the role of storage I/O
+    gcloud projects add-iam-policy-binding arxiv-development --member="serviceAccount:sync-test-admin@arxiv-development.iam.gserviceaccount.com" --role="projects/arxiv-development/roles/sync_test_admin"
+
+    # Allow the service account to access the storage bucket.
+    gsutil iam ch serviceAccount:sync-test-admin@arxiv-development.iam.gserviceaccount.com:projects/arxiv-development/roles/sync_test_admin gs://arxiv-sync-test-01
+
+    # DO NOT CREATE A NEW KEY UNLESS YOU NEED TO ROTATE THE KEY
+    # gcloud iam service-accounts keys create sync-test.json --iam-account sync-test-admin@arxiv-development.iam.gserviceaccount.com
+    # Created key is stored in 1password.
+    # You can get the existing key in 1password with:
+
+    op read "op://hs3xn7ldhg3pgrql5j524rgpee/w2wtsf5v7kahbngr64m43mciou/qrdfsd5gbnatjpv7zof6vwca4q"
+
+See `Makefile`
+
+# Deployment
+
+The script is designed to run as a cron job. 
+
+## cronjob 
+
+Old:
+
+    15 21 * * 0-4 /opt_arxiv/e-prints/dissemination/sync_prod_to_gcp/sync_published.sh
+
+New:
+
+    15 21 * * 0-4 /users/e-prints/arxiv-browse/script/sync_prod_to_gcp/sync_published.sh
+
+## Logging
+
+There are two logs: one is plain text, and the other is NDJSON designed for GCP. The JSON logging goes into
+`/opt_arxiv/e-prints/logs/sync` and Stanza sends it out to GCP. See JSON logging.
+
+### Stanza plug-in for sync
+
+The plugin must be deployed on "arxiv-sync" host.
+
+`arxiv-browse/script/sync_prod_to_gcp/stanza/plugins/arxiv_sync2gcp_log.yaml` -> `/opt/observiq/stanza/plugins/arxiv_sync2gcp_log.yaml`
+
+On arxiv-sync, `/opt/observiq/stanza/config.yaml` must include the following:
+
+      - type: arxiv_sync2gcp_log
+        log_path: "/opt_arxiv/e-prints/logs/sync/*"
+
+### JSON logging
+
+Note that, the JSON logger does log rotation. You do not need to set up the log cleaning.
+OTOH, because of this, it is rather important for Stanza to be running.
+Currently, the max file size is set to 4MiB, with 10 log files. It should be fine for a few days.
+
+
+
+
diff --git a/script/sync_prod_to_gcp/__init__.py b/script/sync_prod_to_gcp/__init__.py
diff --git a/script/sync_prod_to_gcp/digester.py b/script/sync_prod_to_gcp/digester.py
@@ -0,0 +1,34 @@
+import hashlib
+import io
+import os
+from datetime import datetime, timezone
+
+IO_BUFFER_1 = bytearray(2 ** 18)  # Reusable buffer to reduce allocations.
+IO_VIEW_1 = memoryview(IO_BUFFER_1)
+
+def binary_file_digest(fileobj: io.BytesIO, hasher=None) -> str:
+    """You'd need to open file with binary - rb
+    default hasher is sha256.
+    """
+    digestobj = hashlib.new("sha256" if hasher is None else hasher)
+    while True:
+        size = fileobj.readinto(IO_BUFFER_1)
+        if size == 0:
+            break  # EOF
+        digestobj.update(IO_VIEW_1[:size])
+        pass
+    return digestobj.hexdigest()
+
+
+def digest_from_filepath(localfile: str, hasher=None) -> str:
+    """File digest from local file.
+    default hasher is sha256.
+    """
+    with open(localfile, "rb") as fd:
+        return binary_file_digest(fd, hasher)
+    pass
+
+
+def get_file_mtime(localfile: str) -> str:
+    file_stat = os.stat(localfile)
+    return datetime.fromtimestamp(file_stat.st_mtime, tz=timezone.utc).isoformat()
diff --git a/script/sync_prod_to_gcp/requirements.txt b/script/sync_prod_to_gcp/requirements.txt
@@ -24,10 +24,12 @@ itsdangerous==2.0.1
 Jinja2==3.0.3
 jmespath==0.10.0
 jsonschema==4.0.0
+logging-json==0.4.0
 MarkupSafe==2.0.1
 mypy==0.971
 mypy-extensions==0.4.3
 packaging==21.3
+pathspec==0.11.2
 proto-plus==1.22.1
 protobuf==3.19.6
 pyasn1==0.4.8

diff --git a/script/sync_prod_to_gcp/stanza/config.test-local.yaml b/script/sync_prod_to_gcp/stanza/config.test-local.yaml
@@ -0,0 +1,130 @@
+# File: config.test-local.yaml
+# Desc: Test stanza locally.
+# Use:  ./stanza -c config.test-local.yaml | jq .
+# Install:
+#   https://github.com/observiq/stanza/releases
+#   probably this one: stanza_darwin_amd64
+
+# sudo /bin/bash
+# vi /opt/observiq/stanza/config.yaml
+# systemctl restart stanza
+
+# https://github.com/observIQ/stanza/blob/main/docs/operators/README.md
+# https://github.com/observIQ/stanza/blob/main/docs/types/expression.md
+# https://github.com/antonmedv/expr/blob/master/docs/Language-Definition.md
+
+# https://github.com/observIQ/stanza/blob/main/docs/pipeline.md
+pipeline:
+
+  ############################################
+  # Use operators (or plugins) to load log files.
+  # Use parsers to clean up the data. 
+  # The "id" and "output" keys define the graph of how data flows.
+  #   "id" defaults to the value of "type".
+  #   "output" defaults to the "id" of the next item in the pipeline.
+  ############################################
+
+  # Quickstart:
+  # cat arxiv-operations/stanza/logs/arxiv_access_log.example \
+  #     >> arxiv-operations/stanza/logs/arxiv_access_log
+
+  #- type: arxiv_python_log
+  #  log_path: logs/modapi_log
+  #  start_at: beginning
+  #  output: host_metadata
+  #  #parse_to: parsed
+  #  #preserve: true
+
+  #- type: arxiv_access_log
+  #  log_path: logs/arxiv_access_log
+  #  output: host_metadata
+
+  #- type: arxiv_error_log
+  #  log_path: logs/arxiv_error_log
+  #  output: host_metadata
+
+  #- type: arxiv_pdf_log
+  #  log_path: logs/arxiv_pdf_log
+  #  output: host_metadata
+
+  #- type: arxiv_stats_log
+  #  log_path: logs/arxiv_stats_log
+  #  output: host_metadata
+
+  - type: arxiv_sync2gcp_log
+    log_path: "test/test-output/*"
+
+  #- type: arxiv_classifier_log
+  #  log_path: logs/arxiv_classifier_log
+  #  output: host_metadata
+
+  #- type: file_input
+  #  start_at: beginning
+  #  include: 
+  #    - logs/modapi_log
+  #  output: modapi_log
+
+  # https://github.com/observIQ/stanza/blob/main/docs/operators/metadata.md
+  #- type: metadata
+  #  id: modapi_log
+  #  resource:
+  #    nicename: arxiv-localhost
+  #  output: json_parser
+
+  #- type: json_parser
+  #  output: host_metadata
+  #      timestamp:
+  #    parse_from: time
+  #    layout: '%d/%b/%Y:%H:%M:%S %z'
+  ##  #source: file
+  #  #log_path: logs/modapi_log
+  #  #pod_name: ""
+  #  #include: logs/modapi_log
+
+  # https://github.com/observIQ/stanza/blob/master/docs/operators/file_input.md
+  #- type: file_input
+
+  # https://github.com/observIQ/stanza-plugins/blob/main/docs/plugins/apache_combined.md
+  # https://github.com/observIQ/stanza-plugins/blob/main/docs/plugins/apache_common.md
+  #- type: arxiv_stats_log
+  #  log_path: /Users/bgm37/Documents/apps/stanza/test_stats_log
+  #  output: host_metadata
+
+  #######################################
+  # Below is intended for all log streams
+  #######################################
+
+  # https://github.com/observIQ/stanza/blob/main/docs/operators/host_metadata.md
+  - type: host_metadata
+
+  # https://github.com/observIQ/stanza/blob/main/docs/operators/restructure.md
+  #- type: restructure
+  #  id: nicename_restructure
+  #  if: '$labels["host.name"] != nil'
+  #  ops:
+  #    - move:
+  #        from: '$labels["host.nicename"]'
+  #        to: '$resource["host.nicename"]'
+
+  # https://github.com/observIQ/stanza/blob/main/docs/operators/metadata.md
+  - type: metadata
+    resource:
+      nicename: arxiv-localhost
+
+  # https://github.com/observIQ/stanza/blob/main/docs/types/severity.md
+  # - type: severity
+
+  ############################################
+  # Send to google logging or test with stdout
+  ############################################
+
+  # https://github.com/observIQ/stanza/blob/master/docs/operators/google_cloud_output.md
+  #- type: google_cloud_output
+  #  credentials_file: /opt/observiq/stanza/arxiv-production_arxiv-stanza.json
+
+  # For testing
+  - type: stdout
+
+  # For testing
+  #- type: file_output
+  #  path: ./logs/arxiv-localhost-output.log