From 01e5b56e6be402b9da9ecfccc04f737142fcd945 Mon Sep 17 00:00:00 2001
From: Everaldo
Date: Thu, 3 Oct 2024 12:36:44 -0700
Subject: [PATCH] Use zip in backup (#268)

* Refactor auto backup.
* Backup to .zip file
* Backup to .zip file
* Backup to .zip file
* Look for any .json file in the .zip file
* Add github action to check if the backup file was created.
* Create github action to check backup.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Test github action.
* Add logs.
* Add backup flag validation.
* Import no_backup var.
* Import no_backup var from config file.
* Enable tmate for debugging.
* Force error.
* Force error.
* Refactor.
* Remove tmate.
* Remove no backup variable from config.
* Improve messages.
* Improve messages.
* Improve messages.
* Check backup file size.
* Update slack channel.
---
 .github/scripts/check_backup.py    |  96 ++++++++++++++++++++++++++
 .github/workflows/check_backup.yml |  35 ++++++++++
 src/admin.py                       | 105 ++++++++++++++++++++++-------
 3 files changed, 213 insertions(+), 23 deletions(-)
 create mode 100644 .github/scripts/check_backup.py
 create mode 100644 .github/workflows/check_backup.yml
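The core of the new check script below is a single S3 call: head_object raises botocore's ClientError when the key is absent. A minimal sketch of that probe, separating the missing-key case from other failures (bucket, key, and region here are hypothetical placeholders, not values from this patch):

import boto3
from botocore.exceptions import ClientError

s3 = boto3.client("s3", region_name="us-west-2")  # placeholder region

def backup_exists(bucket: str, key: str) -> bool:
    """Return True if the object exists; False only on a 404 from S3."""
    try:
        s3.head_object(Bucket=bucket, Key=key)
        return True
    except ClientError as e:
        if e.response["Error"]["Code"] == "404":
            return False
        raise  # permission/transport errors should not be read as "missing"

print(backup_exists("my-backup-bucket", "db_backup/smartapi_20241003.zip"))  # placeholders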
diff --git a/.github/scripts/check_backup.py b/.github/scripts/check_backup.py
new file mode 100644
index 0000000..fbd43c0
--- /dev/null
+++ b/.github/scripts/check_backup.py
@@ -0,0 +1,96 @@
+"""
+This script checks if a backup file for the current date exists in a specified S3 bucket.
+If the backup file does not exist, a notification is sent to a Slack channel.
+
+Expected file location and naming in the S3 bucket:
+- The file should be in the folder 'db_backup/' with the following naming pattern:
+  'smartapi_YYYYMMDD.zip', where YYYYMMDD corresponds to the current date.
+
+Required Environment Variables:
+- AWS_ACCESS_KEY_ID: The AWS access key ID used to read from the S3 bucket.
+- AWS_SECRET_ACCESS_KEY: The AWS secret access key used to read from the S3 bucket.
+- BACKUP_BUCKET_NAME: The name of the AWS S3 bucket where backups are stored.
+- S3_FOLDER: The folder path within the S3 bucket where backups are stored (e.g., 'db_backup/').
+- AWS_REGION: The AWS region where the S3 bucket is located.
+- SLACK_CHANNEL: The Slack channel where notifications should be sent (e.g., '#observability-test').
+- SLACK_WEBHOOK_URL: The Slack webhook URL used to send the notification.
+
+Functionality:
+1. The script uses the AWS SDK (boto3) to check for the existence of the backup file in the specified S3 bucket.
+2. If the file is found and is larger than 1MB, it logs that no action is needed.
+3. If the file is not found, or is 1MB or smaller, it sends a notification to the configured Slack channel.
+
+Dependencies:
+- boto3: For interacting with AWS S3.
+- requests: For sending HTTP POST requests to Slack.
+
+"""
+
+import os
+from datetime import datetime
+
+import boto3
+import botocore
+import requests
+
+
+def send_slack_notification(message):
+
+    print(f" └─ {message}")
+
+    # Create the payload for Slack
+    slack_data = {
+        "channel": os.getenv("SLACK_CHANNEL"),
+        "username": "SmartAPI",
+        "icon_emoji": ":thumbsdown:",
+        "text": message,
+    }
+
+    try:
+        print(" └─ Sending Slack notification.")
+        response = requests.post(os.getenv("SLACK_WEBHOOK_URL"), json=slack_data, timeout=10)
+        if response.status_code == 200:
+            print(" └─ Slack notification sent successfully.")
+        else:
+            print(f" └─ Failed to send message to Slack: {response.status_code}, {response.text}")
+    except requests.exceptions.Timeout as e:
+        print(" └─ Request to the Slack webhook URL timed out.")
+        raise e
+    except requests.exceptions.RequestException as e:
+        print(f" └─ Failed to send Slack notification. Error: {str(e)}")
+        raise e
+
+
+def check_backup_file():
+
+    # Create the expected file name
+    today_date = datetime.today().strftime("%Y%m%d")
+    expected_file = f"{os.getenv('S3_FOLDER')}smartapi_{today_date}.zip"
+
+    # Create the S3 client
+    s3_client = boto3.client("s3", region_name=os.getenv("AWS_REGION"))
+
+    # Try to fetch the file metadata
+    try:
+        response = s3_client.head_object(Bucket=os.getenv("BACKUP_BUCKET_NAME"), Key=expected_file)
+        print(f" └─ Backup file {expected_file} exists!")
+
+        # Get the file size in bytes
+        file_size = response["ContentLength"]
+
+        # Check if the file is larger than 1MB
+        if file_size > 1048576:  # 1MB in bytes
+            print(f" └─ Backup file is larger than 1MB! Size: {file_size} bytes.")
+            print(" └─ Nothing to do!")
+        else:
+            message = f":alert: The backup file {expected_file} is smaller than 1MB!"
+            send_slack_notification(message)
+
+    except botocore.exceptions.ClientError as e:
+        print(e)
+        message = f":alert: The backup file {expected_file} was NOT created today!"
+        send_slack_notification(message)
+
+
+if __name__ == "__main__":
+    check_backup_file()
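Before wiring the script into CI, it can be exercised locally with a small driver along these lines; every value below is a hypothetical placeholder rather than anything taken from this patch:

import os
import subprocess

# Overlay placeholder settings on the current environment.
env = dict(
    os.environ,
    AWS_ACCESS_KEY_ID="AKIA...",              # placeholder
    AWS_SECRET_ACCESS_KEY="...",              # placeholder
    AWS_REGION="us-west-2",                   # placeholder
    BACKUP_BUCKET_NAME="my-backup-bucket",    # placeholder
    S3_FOLDER="db_backup/",
    SLACK_CHANNEL="#observability-test",
    SLACK_WEBHOOK_URL="https://hooks.slack.com/services/T000/B000/XXXX",  # placeholder
)

# Run from the repository root so the script path resolves.
subprocess.run(["python", ".github/scripts/check_backup.py"], check=True, env=env)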
diff --git a/.github/workflows/check_backup.yml b/.github/workflows/check_backup.yml
new file mode 100644
index 0000000..28e86bf
--- /dev/null
+++ b/.github/workflows/check_backup.yml
@@ -0,0 +1,35 @@
+name: Check S3 Backup and Notify Slack
+
+on:
+  workflow_dispatch:  # Allows manual trigger from GitHub Actions UI
+  schedule:
+    - cron: '0 13 * * *'  # 13:00 UTC (5:00 AM PST / 6:00 AM PDT)
+
+jobs:
+  check-backup:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v2
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.x'
+
+      - name: Install boto3 and requests
+        run: |
+          python -m pip install --upgrade pip
+          pip install boto3 requests
+
+      - name: Check if backup exists in S3
+        run: python .github/scripts/check_backup.py
+        env:
+          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+          AWS_REGION: ${{ secrets.AWS_REGION }}
+          BACKUP_BUCKET_NAME: "${{ secrets.BACKUP_BUCKET_NAME }}"
+          S3_FOLDER: "db_backup/"
+          SLACK_CHANNEL: "#ncats-translator"
+          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
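The src/admin.py changes that follow serialize the backup dict as a single .json entry inside a deflated .zip, and restore by scanning the archive for the first .json member. A self-contained sketch of that round-trip (payload and entry name are illustrative):

import io
import json
import zipfile

mapping = {"example": {"_id": "abc123"}}  # illustrative payload

# Write: one JSON entry inside a deflated archive, as save_to_file()/save_to_s3() do.
buf = io.BytesIO()
with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zfile:
    zfile.writestr("smartapi_20241003.json", json.dumps(mapping, indent=2))

# Read: find the first .json member and load it, as restore_from_s3()/restore_from_file() do.
with zipfile.ZipFile(io.BytesIO(buf.getvalue())) as zfile:
    json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
    if json_file is None:
        raise ValueError("No JSON file found inside the ZIP archive.")
    with zfile.open(json_file) as fh:
        assert json.load(fh) == mapping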
diff --git a/src/admin.py b/src/admin.py
index e841933..e15ca47 100644
--- a/src/admin.py
+++ b/src/admin.py
@@ -26,6 +26,8 @@
 import logging
 import random
 import time
+import io
+import zipfile
 from datetime import datetime
 
 import boto3
@@ -37,20 +39,48 @@
 logging.basicConfig(level="INFO")
 
 
-def _default_filename():
-    return "smartapi_" + datetime.today().strftime("%Y%m%d") + ".json"
+def _default_filename(extension=".json"):
+    return "smartapi_" + datetime.today().strftime("%Y%m%d") + extension
 
 
-def save_to_file(mapping, filename=None):
-    filename = filename or _default_filename()
-    with open(filename, "w") as file:
-        json.dump(mapping, file, indent=2)
-
-
-def save_to_s3(mapping, filename=None, bucket="smartapi"):
-    filename = filename or _default_filename()
+def save_to_file(mapping, filename=None, format="zip"):
+    """
+    Save data to a file in either JSON or ZIP format.
+    :param mapping: Data to save
+    :param filename: File name
+    :param format: File format, either 'json' or 'zip'
+    """
+    if format == "zip":
+        filename = filename or _default_filename(".zip")
+        with zipfile.ZipFile(filename, "w", zipfile.ZIP_DEFLATED) as zfile:
+            json_data = json.dumps(mapping, indent=2)
+            zfile.writestr(filename.replace(".zip", ".json"), json_data)
+    else:
+        filename = filename or _default_filename(".json")
+        with open(filename, "w") as file:
+            json.dump(mapping, file, indent=2)
+
+
+def save_to_s3(data, filename=None, bucket="smartapi", format="zip"):
+    """
+    Save data to S3 in either JSON or ZIP format.
+    :param data: Data to save
+    :param filename: File name
+    :param bucket: S3 bucket name
+    :param format: File format, either 'json' or 'zip'
+    """
+    filename = filename or _default_filename(f".{format}")
     s3 = boto3.resource("s3")
-    s3.Bucket(bucket).put_object(Key="db_backup/{}".format(filename), Body=json.dumps(mapping, indent=2))
+
+    if format == "zip":
+        with zipfile.ZipFile(filename, "w", zipfile.ZIP_DEFLATED) as zfile:
+            json_data = json.dumps(data, indent=2)
+            zfile.writestr(filename.replace(".zip", ".json"), json_data)
+        logging.info(f"Uploading {filename} to AWS S3")
+        s3.Bucket(bucket).upload_file(Filename=filename, Key=f"db_backup/{filename}")
+    else:
+        logging.info(f"Uploading {filename} to AWS S3")
+        s3.Bucket(bucket).put_object(Key=f"db_backup/{filename}", Body=json.dumps(data, indent=2))
 
 
 def _backup():
@@ -69,14 +99,14 @@ def _backup():
     return smartapis
 
 
-def backup_to_file(filename=None):
+def backup_to_file(filename=None, format="zip"):
     smartapis = _backup()
-    save_to_file(smartapis, filename)
+    save_to_file(smartapis, filename, format)
 
 
-def backup_to_s3(filename=None, bucket="smartapi"):
+def backup_to_s3(filename=None, bucket="smartapi", format="zip"):
     smartapis = _backup()
-    save_to_s3(smartapis, filename, bucket)
+    save_to_s3(smartapis, filename, bucket, format)
 
 
 def _restore(smartapis):
@@ -99,7 +129,7 @@ def restore_from_s3(filename=None, bucket="smartapi"):
     s3 = boto3.client("s3")
 
     if not filename:
-        objects = s3.list_objects_v2(Bucket="smartapi", Prefix="db_backup")["Contents"]
+        objects = s3.list_objects_v2(Bucket=bucket, Prefix="db_backup")["Contents"]
         filename = max(objects, key=lambda x: x["LastModified"])["Key"]
 
     if not filename.startswith("db_backup/"):
@@ -108,14 +138,42 @@ def restore_from_s3(filename=None, bucket="smartapi"):
 
     logging.info("GET s3://%s/%s", bucket, filename)
     obj = s3.get_object(Bucket=bucket, Key=filename)
-    smartapis = json.loads(obj["Body"].read())
+
+    filename = filename.replace("db_backup/", "")
+
+    if filename.endswith(".zip"):
+        file_content = obj["Body"].read()
+        with zipfile.ZipFile(io.BytesIO(file_content)) as zfile:
+            # Search for a JSON file inside the ZIP
+            json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
+            if not json_file:
+                raise ValueError("No JSON file found inside the ZIP archive.")
+            with zfile.open(json_file) as json_data:
+                smartapis = json.load(json_data)
+    elif filename.endswith(".json"):
+        smartapis = json.loads(obj["Body"].read())
+    else:
+        raise ValueError("Unsupported backup file type!")
+
     _restore(smartapis)
 
 
 def restore_from_file(filename):
-    with open(filename) as file:
-        smartapis = json.load(file)
-    _restore(smartapis)
+    if filename.endswith(".zip"):
+        with zipfile.ZipFile(filename, "r") as zfile:
+            # Search for a JSON file inside the ZIP
+            json_file = next((f for f in zfile.namelist() if f.endswith(".json")), None)
+            if not json_file:
+                raise ValueError("No JSON file found inside the ZIP archive.")
+            with zfile.open(json_file) as json_data:
+                smartapis = json.load(json_data)
+    elif filename.endswith(".json"):
+        with open(filename) as file:
+            smartapis = json.load(file)
+    else:
+        raise ValueError("Unsupported backup file type!")
+
+    _restore(smartapis)
 
 
 def refresh_document():
@@ -226,7 +284,7 @@ def refresh_has_metakg():
 _lock = FileLock(".lock", timeout=0)
 
 
-def routine(no_backup=False):
+def routine(no_backup=False, format="zip"):
     logger = logging.getLogger("routine")
 
     # Add jitter: random delay between 100 and 500 milliseconds (adjust range as needed)
@@ -244,8 +302,8 @@ def routine(no_backup=False):
         if lock_acquired:
             logger.info("Schedule lock acquired successfully.")
             if not no_backup:
-                logger.info("backup_to_s3()")
-                backup_to_s3()
+                logger.info(f"backup_to_s3(format={format})")
+                backup_to_s3(format=format)
             logger.info("refresh_document()")
             refresh_document()
             logger.info("check_uptime()")
@@ -262,6 +320,7 @@ def routine(no_backup=False):
             logger.warning("Schedule lock acquired by another process. No need to run it in this process.")
     except Exception as e:
         logger.error(f"An error occurred during the routine: {e}")
+        logger.error("Stack trace:", exc_info=True)
     finally:
         if lock_acquired:
             _lock.release()
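With the patch applied, .zip becomes the default backup format and plain JSON stays available through the format parameter. A usage sketch (the bare admin import path and the restore file name are assumptions for illustration):

from admin import backup_to_file, backup_to_s3, restore_from_file  # assumed import path

backup_to_file()                 # writes smartapi_YYYYMMDD.zip in the working directory
backup_to_file(format="json")    # plain-JSON backup, as before this patch
backup_to_s3(format="zip")       # uploads db_backup/smartapi_YYYYMMDD.zip
restore_from_file("smartapi_20241003.zip")  # illustrative file name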