From f288a55f218661fbe8bcb0b7a6fad5ed13c5ec1b Mon Sep 17 00:00:00 2001 From: Bryan Hilbert Date: Wed, 30 Oct 2024 14:11:25 -0400 Subject: [PATCH] Filter rootnames/filenames before calling the filename_parser --- jwql/utils/constants.py | 1 + jwql/utils/utils.py | 13 +++- .../apps/jwql/archive_database_update.py | 72 ++++++++++++++++++- 3 files changed, 84 insertions(+), 2 deletions(-) diff --git a/jwql/utils/constants.py b/jwql/utils/constants.py index 37d390a74..37ee98757 100644 --- a/jwql/utils/constants.py +++ b/jwql/utils/constants.py @@ -381,6 +381,7 @@ FILE_PROG_ID_LEN = 5 FILE_SEG_LEN = 3 FILE_SOURCE_ID_LEN = 5 +FILE_SOURCE_ID_LONG_LEN = 9 FILE_TARG_ID_LEN = 3 FILE_VISIT_GRP_LEN = 2 FILE_VISIT_LEN = 3 diff --git a/jwql/utils/utils.py b/jwql/utils/utils.py index d96388d9a..49d060422 100644 --- a/jwql/utils/utils.py +++ b/jwql/utils/utils.py @@ -52,7 +52,7 @@ from jwql.utils.constants import FILE_AC_CAR_ID_LEN, FILE_AC_O_ID_LEN, FILE_ACT_LEN, \ FILE_DATETIME_LEN, FILE_EPOCH_LEN, FILE_GUIDESTAR_ATTMPT_LEN_MIN, \ FILE_GUIDESTAR_ATTMPT_LEN_MAX, FILE_OBS_LEN, FILE_PARALLEL_SEQ_ID_LEN, \ - FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SUFFIX_TYPES, \ + FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SOURCE_ID_LONG_LEN, FILE_SUFFIX_TYPES, \ FILE_TARG_ID_LEN, FILE_VISIT_GRP_LEN, FILE_VISIT_LEN, FILETYPE_WO_STANDARD_SUFFIX, \ JWST_INSTRUMENT_NAMES_SHORTHAND, ON_GITHUB_ACTIONS __location__ = os.path.dirname(os.path.dirname(os.path.dirname(__file__))) @@ -426,6 +426,17 @@ def filename_parser(filename): r"(?P\d{" + f"{FILE_VISIT_LEN}" + "})"\ r"(_.._msa.fits)" + # Stage 2 WFSS source-based files + # e.g. 
jw06434-c1021_s000001510_nircam_f444w-grismr + #stage_2_source = \ + # r"jw" \ + # r"(?P<program_id>\d{" + f"{FILE_PROG_ID_LEN}" + "})"\ + # r"-(?P<ac_id>(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\ + # r"_(?P<source_id>(s)\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + "})"\ + # r"_(?P<instrument>(nircam|niriss|nirspec|miri|fgs))"\ + # r"_(?P<optical_elements>((?!_)[\w-])+)"\ + # r"-" + # Stage 3 filenames with target ID # e.g. "jw80600-o009_t001_miri_f1130w_i2d.fits" stage_3_target_id = \ diff --git a/jwql/website/apps/jwql/archive_database_update.py b/jwql/website/apps/jwql/archive_database_update.py index 748cde5dc..674991f8b 100755 --- a/jwql/website/apps/jwql/archive_database_update.py +++ b/jwql/website/apps/jwql/archive_database_update.py @@ -43,13 +43,20 @@ import logging import os import argparse +import re import numpy as np import django from django.apps import apps from jwql.utils.protect_module import lock_module -from jwql.utils.constants import DEFAULT_MODEL_CHARFIELD +from jwql.utils.constants import (DEFAULT_MODEL_CHARFIELD, + FILE_PROG_ID_LEN, + FILE_AC_O_ID_LEN, + FILE_AC_CAR_ID_LEN, + FILE_SOURCE_ID_LONG_LEN, + FILE_TARG_ID_LEN + ) # These lines are needed in order to use the Django models in a standalone # script (as opposed to code run as a result of a webpage request). 
If these @@ -113,6 +120,11 @@ def get_updates(update_database): # Get set of unique rootnames all_rootnames = set(['_'.join(f.split('/')[-1].split('_')[:-1]) for f in filenames]) + + # Filter source-based level 2 files out of the rootnames and filenames + all_rootnames = filter_rootnames(all_rootnames) + filenames = filter_filenames(filenames, all_rootnames) + rootnames = [] for rootname in all_rootnames: filename_dict = filename_parser(rootname) @@ -510,6 +522,64 @@ def fill_empty_rootfileinfo(rootfileinfo_set): logging.info(f'\tSaved {saved_rootfileinfos} Root File Infos') +def filter_filenames(fnames, roots): + """Filter out filenames from ``fnames`` that don't match the names in ``roots`` + + Parameters + ---------- + fnames : list + List of filenames + + roots : list + List of rootnames + + Returns + ------- + filtered_fnames : list + Filtered list of filenames + """ + filtered_fnames = [] + for fname in fnames: + for root in roots: + if root in fname: + filtered_fnames.append(fname) + break + return filtered_fnames + + +def filter_rootnames(rootnames): + """Filter out rootnames that we know can't be parsed by the filename_parser. We use this + custom filter here rather than within the filename parser itself because in archive_database_update + we can end up providing thousands of unrecognized filenames (e.g. source-based WFSS files) to + the filename parser, which would result in thousands of logging statements and massive log files. + This way, we filter out the rootnames that obviously won't be parsed before calling the + filename_parser with the rest. 
jw06434-c1021_s000001510_nircam_f444w-grismr + jw06434-c1021_t000_nircam_clear-f090w_segm.fits + + Parameters + ---------- + rootnames : list + List of rootnames + + Returns + ------- + good_rootnames : list + List of rootnames that do not match the filters + """ + stage_2_source = \ + r"jw" \ + r"(?P<program_id>\d{" + f"{FILE_PROG_ID_LEN}" + "})"\ + r"-(?P<ac_id>(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\ + r"_(?P<target_id>(s\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + r"}|(t)\d{" + f"{FILE_TARG_ID_LEN}" + "}))"\ + r"_(?P<instrument>(nircam|niriss|miri))"\ + r"_(?P<optical_elements>((?!_)[\w-])+)"\ + r"-" + + elements = re.compile(stage_2_source) + good_rootnames = [e for e in rootnames if elements.match(e) is None] + return good_rootnames + + @lock_module def protected_code(update_database, fill_empty_list): """Protected code ensures only 1 instance of module will run at any given time