diff --git a/ingestion_server/ingestion_server/cleanup.py b/ingestion_server/ingestion_server/cleanup.py
index 14ba9063d3a..0fe8141de9a 100644
--- a/ingestion_server/ingestion_server/cleanup.py
+++ b/ingestion_server/ingestion_server/cleanup.py
@@ -7,6 +7,8 @@
 import csv
 import logging as log
 import multiprocessing
+import pathlib
+import shutil
 import time
 import uuid
 from urllib.parse import urlparse
@@ -62,8 +64,12 @@
     "www.eol.org": True,
     ".digitaltmuseum.org": True,
     "collections.musee-mccord.qc.ca": False,
+    ".stocksnap.io": True,
+    "cdn.stocksnap.io": True,
 }
 
+TMP_DIR = pathlib.Path("/tmp/cleaned_data").resolve()
+
 
 def _tag_denylisted(tag):
     """Check if a tag is banned or contains a banned substring."""
@@ -106,9 +112,9 @@ def cleanup_url(url, tls_support):
                 log.debug(f"Tested domain {_tld}")
 
             if tls_supported:
-                return f"'https://{url}'"
+                return f"https://{url}"
             else:
-                return f"'http://{url}'"
+                return f"http://{url}"
         else:
             return None
 
@@ -141,6 +147,7 @@ def cleanup_tags(tags):
 
         if update_required:
             fragment = Json(tag_output)
+            log.debug(f"Tags fragment: {fragment}")
             return fragment
         else:
             return None
@@ -200,7 +207,7 @@ def test_tls_supported(cls, url):
         https = url.replace("http://", "https://")
         try:
             res = re.get(https, timeout=2)
-            log.info(f"{https}:{res.status_code}")
+            log.info(f"tls_test - {https}:{res.status_code}")
             return 200 <= res.status_code < 400
         except re.RequestException:
             return False
@@ -243,23 +250,27 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):
             if clean:
                 cleaned_data[update_field] = clean
                 log.debug(
-                    f"Updated {update_field} for {identifier} "
-                    f"from '{dirty_value}' to '{clean}'"
+                    f"Updated {update_field} for {identifier}\n\t"
+                    f"from '{dirty_value}' \n\tto '{clean}'"
                 )
         # Generate SQL update for all the fields we just cleaned
         update_field_expressions = []
         for field, clean_value in cleaned_data.items():
-            update_field_expressions.append(f"{field} = {clean_value}")
-            # Save cleaned values for later
-            # (except for tags, which take up too much space)
             if field == "tags":
+                # The `clean_value` for tags already includes the single quotes,
+                # so it's not necessary to add them, and they're omitted in
+                # `cleaned_values` to save in files later because they take up
+                # too much disk space.
+                update_field_expressions.append(f"{field} = {clean_value}")
                 continue
+            update_field_expressions.append(f"{field} = '{clean_value}'")
             cleaned_values[field].append((identifier, clean_value))
 
         if len(update_field_expressions) > 0:
             update_query = f"""UPDATE {temp_table} SET
             {', '.join(update_field_expressions)} WHERE id = {_id}
             """
+            log.debug(f"Executing update query: \n\t{update_query}")
             write_cur.execute(update_query)
     log.info(f"TLS cache: {TLS_CACHE}")
     log.info("Worker committing changes...")
@@ -273,18 +284,17 @@ def _clean_data_worker(rows, temp_table, sources_config, all_fields: list[str]):
 
 
 def save_cleaned_data(result: dict) -> dict[str, int]:
-    log.info("Saving cleaned data...")
     start_time = time.perf_counter()
 
     cleanup_counts = {field: len(items) for field, items in result.items()}
     for field, cleaned_items in result.items():
         # Skip the tag field because the file is too large and fills up the disk
-        if field == "tag":
+        if field == "tag" or not cleaned_items:
             continue
-        if cleaned_items:
-            with open(f"{field}.tsv", "a") as f:
-                csv_writer = csv.writer(f, delimiter="\t")
-                csv_writer.writerows(cleaned_items)
+
+        with open(TMP_DIR.joinpath(f"{field}.tsv"), "a", encoding="utf-8") as f:
+            csv_writer = csv.writer(f, delimiter="\t")
+            csv_writer.writerows(cleaned_items)
 
     end_time = time.perf_counter()
     total_time = end_time - start_time
@@ -300,6 +310,10 @@ def clean_image_data(table):
 
     :return: None
     """
+    # Recreate directory where cleaned data is stored
+    shutil.rmtree(TMP_DIR, ignore_errors=True)
+    TMP_DIR.mkdir(parents=True)
+
     # Map each table to the fields that need to be cleaned up. Then, map each
     # field to its cleanup function.
     log.info("Cleaning up data...")
diff --git a/ingestion_server/test/unit_tests/test_cleanup.py b/ingestion_server/test/unit_tests/test_cleanup.py
index 696a873c065..02cc6d047e2 100644
--- a/ingestion_server/test/unit_tests/test_cleanup.py
+++ b/ingestion_server/test/unit_tests/test_cleanup.py
@@ -47,12 +47,12 @@ def test_url_protocol_fix():
     tls_support_cache = {}
     pook.get("https://flickr.com").reply(200)
     result = CleanupFunctions.cleanup_url(bad_url, tls_support_cache)
-    expected = "'https://flickr.com'"
+    expected = "https://flickr.com"
 
     bad_http = "neverssl.com"
     pook.get("https://neverssl.com").reply(500)
     result_http = CleanupFunctions.cleanup_url(bad_http, tls_support_cache)
-    expected_http = "'http://neverssl.com'"
+    expected_http = "http://neverssl.com"
     assert result == expected
     assert result_http == expected_http
 