diff --git a/wikiteam3/uploader/uploader.py b/wikiteam3/uploader/uploader.py index f23515c6..94eb6dc0 100644 --- a/wikiteam3/uploader/uploader.py +++ b/wikiteam3/uploader/uploader.py @@ -2,9 +2,11 @@ from datetime import datetime import json import os +import random import re import shutil from dataclasses import dataclass +import sys import time import traceback from typing import Dict, List, Optional, Tuple, Union @@ -22,6 +24,7 @@ from wikiteam3.uploader.socketLock import NoLock, SocketLockServer from wikiteam3.utils import url2prefix_from_config, sha1sum from wikiteam3.uploader.compresser import ZstdCompressor, SevenZipCompressor +from wikiteam3.utils.ia_checker import ia_s3_tasks_load_avg from wikiteam3.utils.util import ALL_DUMPED_MARK, UPLOADED_MARK, mark_as_done, is_markfile_exists DEFAULT_COLLECTION = 'opensource' @@ -380,6 +383,26 @@ def upload(arg: Args): print("=== Preparing metadata ===") metadata, logo_url = prepare_item_metadata(wikidump_dir, config, arg) + print("=== Checking IA S3 load average (optional) ===") + + try: + avg_load = ia_s3_tasks_load_avg(session=item.session) # check IA load + print(f"IA S3 load: {avg_load * 100:.4f}%") + if avg_load > 0.99: + print("WARNING: IA S3 is heavily overloaded, upload may fail") + print("Deciding whether to continue even if IA S3 is heavily overloaded... 
def ia_s3_tasks_load_avg(session: "ArchiveSession") -> float:
    """Return the Internet Archive S3 task-queue load as a ratio.

    Sends a GET request to the S3 ``check_limit`` endpoint through
    ``session`` and divides ``detail.total_tasks_queued`` by
    ``detail.total_global_limit`` taken from the JSON reply.

    Args:
        session: Object whose ``get(url, timeout=...)`` performs the request.

    Returns:
        ``total_tasks_queued / total_global_limit`` as a float.

    Raises:
        ValueError: If the reported global limit is zero or negative, so no
            meaningful ratio can be computed.
        KeyError: If the reply lacks the expected ``detail`` fields.
        Exception: Whatever ``session.get``, ``raise_for_status`` or
            ``json`` raise is propagated unchanged.
    """
    api = "https://s3.us.archive.org/?check_limit=1"
    r = session.get(api, timeout=16)
    r.raise_for_status()
    detail = r.json()["detail"]
    total_tasks_queued = detail["total_tasks_queued"]
    total_global_limit = detail["total_global_limit"]
    # Lazy %-style arguments: the message is only formatted if INFO is enabled.
    logger.info("ia_s3_load_avg(): %s / %s", total_tasks_queued, total_global_limit)
    if total_global_limit <= 0:
        # Fail with an explicit message instead of a bare ZeroDivisionError.
        raise ValueError(
            f"IA S3 reported a non-positive total_global_limit: {total_global_limit!r}"
        )
    return total_tasks_queued / total_global_limit