Skip to content

Commit

Permalink
Filter rootnames/filenames before calling the filename_parser
Browse files Browse the repository at this point in the history
  • Loading branch information
bhilbert4 committed Oct 30, 2024
1 parent 2d655d8 commit f288a55
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 2 deletions.
1 change: 1 addition & 0 deletions jwql/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@
FILE_PROG_ID_LEN = 5
FILE_SEG_LEN = 3
FILE_SOURCE_ID_LEN = 5
FILE_SOURCE_ID_LONG_LEN = 9
FILE_TARG_ID_LEN = 3
FILE_VISIT_GRP_LEN = 2
FILE_VISIT_LEN = 3
Expand Down
13 changes: 12 additions & 1 deletion jwql/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
from jwql.utils.constants import FILE_AC_CAR_ID_LEN, FILE_AC_O_ID_LEN, FILE_ACT_LEN, \
FILE_DATETIME_LEN, FILE_EPOCH_LEN, FILE_GUIDESTAR_ATTMPT_LEN_MIN, \
FILE_GUIDESTAR_ATTMPT_LEN_MAX, FILE_OBS_LEN, FILE_PARALLEL_SEQ_ID_LEN, \
FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SUFFIX_TYPES, \
FILE_PROG_ID_LEN, FILE_SEG_LEN, FILE_SOURCE_ID_LEN, FILE_SOURCE_ID_LONG_LEN, FILE_SUFFIX_TYPES, \
FILE_TARG_ID_LEN, FILE_VISIT_GRP_LEN, FILE_VISIT_LEN, FILETYPE_WO_STANDARD_SUFFIX, \
JWST_INSTRUMENT_NAMES_SHORTHAND, ON_GITHUB_ACTIONS
__location__ = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
Expand Down Expand Up @@ -426,6 +426,17 @@ def filename_parser(filename):
r"(?P<visit>\d{" + f"{FILE_VISIT_LEN}" + "})"\
r"(_.._msa.fits)"

# Stage 2 WFSS source-based files
# e.g. jw06434-c1021_s000001510_nircam_f444w-grismr
#stage_2_source = \
# r"jw" \
# r"(?P<program_id>\d{" + f"{FILE_PROG_ID_LEN}" + "})"\
# r"-(?P<ac_id>(o\d{" + f"{FILE_AC_O_ID_LEN}" + r"}|(c|a|r)\d{" + f"{FILE_AC_CAR_ID_LEN}" + "}))"\
# r"_(?P<target_id>(s)\d{" + f"{FILE_SOURCE_ID_LONG_LEN}" + "})"\
# r"_(?P<instrument>(nircam|niriss|nirspec|miri|fgs))"\
# r"_(?P<optical_elements>((?!_)[\w-])+)"\
# r"-"

# Stage 3 filenames with target ID
# e.g. "jw80600-o009_t001_miri_f1130w_i2d.fits"
stage_3_target_id = \
Expand Down
72 changes: 71 additions & 1 deletion jwql/website/apps/jwql/archive_database_update.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,13 +43,20 @@
import logging
import os
import argparse
import re

import numpy as np
import django

from django.apps import apps
from jwql.utils.protect_module import lock_module
from jwql.utils.constants import DEFAULT_MODEL_CHARFIELD
from jwql.utils.constants import (DEFAULT_MODEL_CHARFIELD,
FILE_PROG_ID_LEN,
FILE_AC_O_ID_LEN,
FILE_AC_CAR_ID_LEN,
FILE_SOURCE_ID_LONG_LEN,
FILE_TARG_ID_LEN
)

# These lines are needed in order to use the Django models in a standalone
# script (as opposed to code run as a result of a webpage request). If these
Expand Down Expand Up @@ -113,6 +120,11 @@ def get_updates(update_database):

# Get set of unique rootnames
all_rootnames = set(['_'.join(f.split('/')[-1].split('_')[:-1]) for f in filenames])

# Filter source-based level 2 files out of the rootnames and filenames
all_rootnames = filter_rootnames(all_rootnames)
filenames = filter_filenames(filenames, all_rootnames)

rootnames = []
for rootname in all_rootnames:
filename_dict = filename_parser(rootname)
Expand Down Expand Up @@ -510,6 +522,64 @@ def fill_empty_rootfileinfo(rootfileinfo_set):
logging.info(f'\tSaved {saved_rootfileinfos} Root File Infos')


def filter_filenames(fnames, roots):
    """Keep only the filenames that contain at least one of the given rootnames.

    The comparison is a plain substring test: a filename is retained as soon
    as any entry of ``roots`` occurs anywhere inside it.

    Parameters
    ----------
    fnames : list
        List of filenames

    roots : list
        List of rootnames

    Returns
    -------
    filtered_fnames : list
        Entries of ``fnames``, in their original order, that contain at
        least one entry of ``roots``
    """
    # any() stops at the first matching rootname, so each filename is
    # included at most once regardless of how many rootnames it contains.
    return [name for name in fnames if any(root in name for root in roots)]


def filter_rootnames(rootnames):
    """Remove rootnames that we know can't be parsed by the filename_parser.

    This custom filter lives here rather than within the filename parser
    itself because archive_database_update can end up providing thousands of
    unrecognized filenames (e.g. source-based WFSS files) to the filename
    parser, which would result in thousands of logging statements and massive
    log files. Rootnames that obviously won't be parsed are therefore dropped
    before the filename_parser is called with the rest.

    Examples of names that are filtered out::

        jw06434-c1021_s000001510_nircam_f444w-grismr
        jw06434-c1021_t000_nircam_clear-f090w_segm.fits

    Parameters
    ----------
    rootnames : list
        List of rootnames

    Returns
    -------
    good_rootnames : list
        List of rootnames that do not match the filters
    """
    # Assemble the pattern piece by piece; doubled braces in the raw
    # f-strings become the literal ``{n}`` regex repetition counts.
    pattern_parts = (
        r"jw",
        rf"(?P<program_id>\d{{{FILE_PROG_ID_LEN}}})",
        rf"-(?P<ac_id>(o\d{{{FILE_AC_O_ID_LEN}}}|(c|a|r)\d{{{FILE_AC_CAR_ID_LEN}}}))",
        rf"_(?P<target_id>(s\d{{{FILE_SOURCE_ID_LONG_LEN}}}|(t)\d{{{FILE_TARG_ID_LEN}}}))",
        r"_(?P<instrument>(nircam|niriss|miri))",
        r"_(?P<optical_elements>((?!_)[\w-])+)",
        r"-",
    )
    source_based = re.compile("".join(pattern_parts))

    good_rootnames = []
    for name in rootnames:
        # match() anchors at the start of the string only, so anything
        # following the hyphenated optical element is irrelevant.
        if source_based.match(name):
            continue
        good_rootnames.append(name)
    return good_rootnames


@lock_module
def protected_code(update_database, fill_empty_list):
"""Protected code ensures only 1 instance of module will run at any given time
Expand Down

0 comments on commit f288a55

Please sign in to comment.