From a27a5fd794f769a478a81d855968c4ea5db01c08 Mon Sep 17 00:00:00 2001 From: akremin Date: Sun, 28 Jul 2024 23:40:02 -0700 Subject: [PATCH 01/27] add proctable concatenation cache, update exptable cache, and add cross-night redshift dependencies --- py/desispec/scripts/proc_night.py | 19 +- py/desispec/scripts/submit_night.py | 4 +- py/desispec/scripts/tile_redshifts.py | 60 +++-- py/desispec/scripts/tile_redshifts_bash.py | 6 +- py/desispec/scripts/update_exptable.py | 2 +- py/desispec/scripts/zproc.py | 6 +- py/desispec/scripts/zprocdashboard.py | 6 +- py/desispec/test/test_proc_night.py | 2 +- py/desispec/workflow/processing.py | 123 ++++++---- py/desispec/workflow/redshifts.py | 251 +++++++++++++++++++-- py/desispec/workflow/science_selection.py | 4 +- py/desispec/workflow/tableio.py | 9 +- 12 files changed, 392 insertions(+), 100 deletions(-) diff --git a/py/desispec/scripts/proc_night.py b/py/desispec/scripts/proc_night.py index 36f514cc2..1f1928f5e 100644 --- a/py/desispec/scripts/proc_night.py +++ b/py/desispec/scripts/proc_night.py @@ -7,6 +7,8 @@ from desispec.scripts.link_calibnight import derive_include_exclude from desispec.workflow.calibration_selection import \ determine_calibrations_to_proc +from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ + read_minimal_tilenight_proctab_cols from desispec.workflow.science_selection import determine_science_to_proc, \ get_tiles_cumulative from desiutil.log import get_logger @@ -336,6 +338,15 @@ def proc_night(night=None, proc_obstypes=None, z_submit_types=None, etable, ptable = load_tables(tablenames=table_pathnames, tabletypes=table_types) full_etable = etable.copy() + ## Pre-populate exposure table and processing table caches of all nights + ## if doing cross-night redshifts + if 'cumulative' in z_submit_types: + ## this shouldn't need to change since we've already updated the exptab + read_minimal_science_exptab_cols() + ## this would become out of date for the current night except + ## write_table will keep it up to date + read_minimal_tilenight_proctab_cols() + ## Cut on OBSTYPES log.info(f"Processing the following obstypes: {proc_obstypes}") good_types = np.isin(np.array(etable['OBSTYPE']).astype(str), proc_obstypes) @@ -346,7 +357,7 @@ def proc_night(night=None, proc_obstypes=None, z_submit_types=None, if tableng > 0: ptable = update_from_queue(ptable, dry_run=dry_run_level) if dry_run_level < 3: - write_table(ptable, tablename=proc_table_pathname) + write_table(ptable, tablename=proc_table_pathname, tabletype='proctable') if any_jobs_failed(ptable['STATUS']): ## Try up to two times to resubmit failures, afterwards give up ## unless explicitly told to proceed with the failures @@ -472,7 +483,7 @@ def create_submit_add_and_save(prow, proctable, check_outputs=check_for_outputs, ## Add the processing row to the processing table proctable.add_row(prow) if len(proctable) > 0 and dry_run_level < 3: - write_table(proctable, tablename=proc_table_pathname) + write_table(proctable, tablename=proc_table_pathname, tabletype='proctable') sleep_and_report(sub_wait_time, message_suffix=f"to slow down the queue submission rate", dry_run=dry_run, logfunc=log.info) @@ -559,7 +570,7 @@ def create_submit_add_and_save(prow, proctable, check_outputs=check_for_outputs, extra_job_args=extra_job_args) if len(ptable) > 0 and dry_run_level < 3: - write_table(ptable, tablename=proc_table_pathname) + write_table(ptable, tablename=proc_table_pathname, tabletype='proctable') sleep_and_report(sub_wait_time, message_suffix=f"to slow down 
the queue submission rate", @@ -575,7 +586,7 @@ def create_submit_add_and_save(prow, proctable, check_outputs=check_for_outputs, ## All jobs now submitted, update information from job queue and save ptable = update_from_queue(ptable, dry_run=dry_run_level) if dry_run_level < 3: - write_table(ptable, tablename=proc_table_pathname) + write_table(ptable, tablename=proc_table_pathname, tabletype='proctable') ## Now that processing is complete, lets identify what we didn't process if len(ptable) > 0: processed = np.isin(full_etable['EXPID'], np.unique(np.concatenate(ptable['EXPID']))) diff --git a/py/desispec/scripts/submit_night.py b/py/desispec/scripts/submit_night.py index 2718e3eb0..9695a2a07 100644 --- a/py/desispec/scripts/submit_night.py +++ b/py/desispec/scripts/submit_night.py @@ -25,7 +25,7 @@ checkfor_and_submit_joint_job, submit_tilenight_and_redshifts from desispec.workflow.queue import update_from_queue, any_jobs_not_complete from desispec.workflow.desi_proc_funcs import get_desi_proc_batch_file_path -from desispec.workflow.redshifts import read_minimal_exptables_columns +from desispec.workflow.redshifts import read_minimal_science_exptab_cols from desispec.io.util import decode_camword, difference_camwords, create_camword def submit_night(night, proc_obstypes=None, z_submit_types=None, queue='realtime', @@ -245,7 +245,7 @@ def submit_night(night, proc_obstypes=None, z_submit_types=None, queue='realtime tiles_cumulative = list(tiles_this_night) log.info(f'Submitting cumulative redshifts for all tiles: {tiles_cumulative}') else: - allexp = read_minimal_exptables_columns(tileids=tiles_this_night) + allexp = read_minimal_science_exptab_cols(tileids=tiles_this_night) for tileid in tiles_this_night: nights_with_tile = allexp['NIGHT'][allexp['TILEID'] == tileid] if len(nights_with_tile) > 0 and night == np.max(nights_with_tile): diff --git a/py/desispec/scripts/tile_redshifts.py b/py/desispec/scripts/tile_redshifts.py index 8ddb54e1e..fbb203b60 100644 --- a/py/desispec/scripts/tile_redshifts.py +++ b/py/desispec/scripts/tile_redshifts.py @@ -11,7 +11,7 @@ from astropy.table import Table, vstack from desispec.io.util import parse_cameras -from desispec.workflow.redshifts import read_minimal_exptables_columns, \ +from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ create_desi_zproc_batch_script from desiutil.log import get_logger @@ -60,17 +60,6 @@ def main(args=None): num_error = len(failed_jobs) sys.exit(num_error) -# _allexp is cache of all exposure tables stacked so that we don't have to read all -# of them every time we call generate_tile_redshift_scripts() -_allexp = None - -def reset_allexp_cache(): - """ - Utility script to reset the _allexp cache to ensure it is re-read from disk - """ - global _allexp - _allexp = None - def generate_tile_redshift_scripts(group, nights=None, tileid=None, expids=None, explist=None, camword=None, max_gpuprocs=None, no_gpu=False, run_zmtl=False, no_afterburners=False, @@ -126,7 +115,7 @@ def generate_tile_redshift_scripts(group, nights=None, tileid=None, expids=None, else: log.info(f'Loading production exposure tables for all nights') - exptable = read_minimal_exptables_columns(nights) + exptable = read_minimal_science_exptab_cols(nights) else: log.info(f'Loading exposure list from {explist}') @@ -190,28 +179,49 @@ def generate_tile_redshift_scripts(group, nights=None, tileid=None, expids=None, # - NOTE: depending upon options, this might re-read all the exptables again # - NOTE: this may not scale well several years into the 
survey
     if group == 'cumulative':
+        if nights is not None:
+            lastnight = int(np.max(nights))
+        elif exptable is not None:
+            lastnight = int(np.max(exptable['NIGHT']))
+        else:
+            lastnight = None
         log.info(f'{len(tileids)} tiles; searching for exposures on prior nights')
-        global _allexp
-        if _allexp is None:
-            log.info(f'Reading all exposure_tables from all nights')
-            _allexp = read_minimal_exptables_columns()
-        keep = np.in1d(_allexp['TILEID'], tileids)
-        newexptable = _allexp[keep]
+        log.info(f'Reading all exposure_tables from all nights')
+        newexptable = read_minimal_science_exptab_cols(tileids=tileids)
+        newexptable = newexptable[['EXPID', 'NIGHT', 'TILEID']]
+
         if exptable is not None:
             expids = exptable['EXPID']
             missing_exps = np.in1d(expids, newexptable['EXPID'], invert=True)
             if np.any(missing_exps):
-                latest_exptable = read_minimal_exptables_columns(nights=np.unique(exptable['NIGHT'][missing_exps]))
-                keep = np.in1d(latest_exptable['EXPID'], expids[missing_exps])
-                latest_exptable = latest_exptable[keep]
-                newexptable = vstack([newexptable, latest_exptable])
+                log.warning(f'Identified {np.sum(missing_exps)} missing exposures ' +
+                            f'in the exposure cache. Resetting the cache to acquire' +
+                            f' them from all nights')
+                ## reset_cache will remove cache but it won't be repopulated
+                ## unless we request all nights. So let's request all nights
+                ## then subselect to the nights we want
+                latest_exptable = read_minimal_science_exptab_cols(tileids=tileids,
+                                                                   reset_cache=True)
+                latest_exptable = latest_exptable[['EXPID', 'NIGHT', 'TILEID']]
+                missing_exps = np.in1d(expids, latest_exptable['EXPID'], invert=True)
+                if np.any(missing_exps):
+                    log.error(f'Identified {np.sum(missing_exps)} missing exposures ' +
+                              f'in the exposure cache even after updating. Appending ' +
+                              f'the user-provided exposures, but this may ' +
+                              f'indicate a problem.')
+                    newexptable = vstack([latest_exptable, exptable[missing_exps]])
+                else:
+                    newexptable = latest_exptable
 
             newexptable.sort(['EXPID'])
             exptable = newexptable
 
+
         ## Ensure we only include data for nights up to and including specified nights
-        if nights is not None:
-            lastnight = int(np.max(nights))
+        if lastnight is not None:
+            log.info(f'Selecting only those exposures on nights before or ' +
+                     f'during the latest night provided: {lastnight}')
             exptable = exptable[exptable['NIGHT'] <= lastnight]
+
         #expids = np.array(exptable['EXPID'])
         tileids = np.unique(np.array(exptable['TILEID']))
 
diff --git a/py/desispec/scripts/tile_redshifts_bash.py b/py/desispec/scripts/tile_redshifts_bash.py
index 4561dc5ef..41ae5f160 100644
--- a/py/desispec/scripts/tile_redshifts_bash.py
+++ b/py/desispec/scripts/tile_redshifts_bash.py
@@ -10,7 +10,7 @@
 import numpy as np
 from astropy.table import Table, vstack
 
-from desispec.workflow.redshifts import read_minimal_exptables_columns, \
+from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \
     get_ztile_script_pathname, get_ztile_relpath, \
     get_ztile_script_suffix
 from desiutil.log import get_logger
@@ -599,7 +599,7 @@ def generate_tile_redshift_scripts(group, night=None, tileid=None, expid=None, e
         else:
             log.info(f'Loading production exposure tables for all nights')
-            exptable = read_minimal_exptables_columns(night)
+            exptable = read_minimal_science_exptab_cols(night)
 
     else:
         log.info(f'Loading exposure list from {explist}')
@@ -656,7 +656,7 @@ def generate_tile_redshift_scripts(group, night=None, tileid=None, expid=None, e
     # - NOTE: this may not scale well several years into the survey
     if group == 'cumulative':
log.info(f'{len(tileids)} tiles; searching for exposures on prior nights') - allexp = read_minimal_exptables_columns() + allexp = read_minimal_science_exptab_cols() keep = np.in1d(allexp['TILEID'], tileids) exptable = allexp[keep] ## Ensure we only include data for nights up to and including specified nights diff --git a/py/desispec/scripts/update_exptable.py b/py/desispec/scripts/update_exptable.py index 56d9d5224..961a9512a 100644 --- a/py/desispec/scripts/update_exptable.py +++ b/py/desispec/scripts/update_exptable.py @@ -180,7 +180,7 @@ def update_exposure_table(night=None, specprod=None, exp_table_pathname=None, ## Only write out the table at the end and only if dry_run_level dictates if dry_run_level < 3: - write_table(etable, tablename=exp_table_pathname) + write_table(etable, tablename=exp_table_pathname, tabletype='exptable') else: log.info(f"{dry_run_level=}, so not saving exposure table.\n{etable=}") diff --git a/py/desispec/scripts/zproc.py b/py/desispec/scripts/zproc.py index 8311ffcd9..f23760b70 100644 --- a/py/desispec/scripts/zproc.py +++ b/py/desispec/scripts/zproc.py @@ -29,7 +29,7 @@ import desiutil.iers from desispec.io.meta import get_nights_up_to_date -from desispec.workflow.redshifts import read_minimal_exptables_columns, \ +from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ create_desi_zproc_batch_script #- internal desispec imports @@ -357,8 +357,8 @@ def main(args=None, comm=None): ## Get list of only nights up to date of thrunight nights = get_nights_up_to_date(args.thrunight) - exposure_table = read_minimal_exptables_columns(nights=nights, - tileids=[tileid]) + exposure_table = read_minimal_science_exptab_cols(nights=nights, + tileids=[tileid]) if args.expids is not None: exposure_table = exposure_table[np.isin(exposure_table['EXPID'], args.expids)] diff --git a/py/desispec/scripts/zprocdashboard.py b/py/desispec/scripts/zprocdashboard.py index 13fd83f77..6f00ff398 100644 --- a/py/desispec/scripts/zprocdashboard.py +++ b/py/desispec/scripts/zprocdashboard.py @@ -27,7 +27,7 @@ from desispec.workflow.proctable import get_processing_table_pathname, \ erow_to_prow, instantiate_processing_table from desispec.workflow.tableio import load_table -from desispec.workflow.redshifts import read_minimal_exptables_columns +from desispec.workflow.redshifts import read_minimal_science_exptab_cols from desispec.io.meta import specprod_root, rawdata_root, findfile from desispec.io.util import decode_camword, camword_to_spectros, \ difference_camwords, parse_badamps, create_camword, camword_union, \ @@ -127,7 +127,7 @@ def main(args=None): log.info(f'Searching {prod_dir} for: {nights}') ## Get all the exposure tables for cross-night dependencies - all_exptabs = read_minimal_exptables_columns(nights=None) + all_exptabs = read_minimal_science_exptab_cols(nights=None) ## We don't want future days mixing in all_exptabs = all_exptabs[all_exptabs['NIGHT'] <= np.max(nights)] ## Restrict to only the exptabs relevant to the current dashboard @@ -199,7 +199,7 @@ def populate_night_zinfo(night, doem=True, doqso=True, dotileqa=True, skipd_tileids (list): List of tileids that should be skipped and not listed in the output dashboard. all_exptabs (astropy.table.Table): A stacked exposure table with minimal - columns returned from read_minimal_exptables_columns(). Used for + columns returned from read_minimal_science_exptab_cols(). Used for cumulative redshifts jobs to identify tile data from previous nights. 
    Returns dict:
 
diff --git a/py/desispec/test/test_proc_night.py b/py/desispec/test/test_proc_night.py
index 1c0ad4a51..1ec6a638e 100644
--- a/py/desispec/test/test_proc_night.py
+++ b/py/desispec/test/test_proc_night.py
@@ -403,7 +403,7 @@ def test_proc_night_daily(self):
         while True:
             num_newlinks = link_rawdata(self.real_rawnight_dir, self.test_rawnight_dir, numexp=10)
-            desispec.scripts.tile_redshifts.reset_allexp_cache()
+            desispec.workflow.redshifts.reset_science_etab_cache()
             if num_newlinks == 0:
                 break
             else:
diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py
index 67c6372c7..bbd302f35 100644
--- a/py/desispec/workflow/processing.py
+++ b/py/desispec/workflow/processing.py
@@ -16,8 +16,9 @@
 from desispec.scripts.link_calibnight import derive_include_exclude
 from desispec.scripts.tile_redshifts import generate_tile_redshift_scripts
 from desispec.workflow.redshifts import get_ztile_script_pathname, \
-    get_ztile_relpath, \
-    get_ztile_script_suffix
+    get_ztile_relpath, \
+    get_ztile_script_suffix, read_minimal_tilenight_proctab_cols, \
+    read_minimal_science_exptab_cols
 from desispec.workflow.queue import get_resubmission_states, update_from_queue, \
     queue_info_from_qids, get_queue_states_from_qids, update_queue_state_cache
 from desispec.workflow.timing import what_night_is_it
@@ -1546,23 +1547,59 @@ def submit_redshifts(ptable, prows, tnight, internal_id, queue, reservation,
 
     if len(zprows) > 0:
         for zsubtype in z_submit_types:
+            log.info(" ")
+            log.info(f"Submitting joint redshift fits of type {zsubtype} for TILEID {zprows[0]['TILEID']}.")
             if zsubtype == 'perexp':
                 for zprow in zprows:
-                    log.info(" ")
-                    log.info(f"Submitting redshift fit of type {zsubtype} for TILEID {zprow['TILEID']} and EXPID {zprow['EXPID']}.\n")
+                    log.info(f"EXPID: {zprow['EXPID']}.\n")
                     redshift_prow = make_redshift_prow([zprow], tnight, descriptor=zsubtype, internal_id=internal_id)
                     internal_id += 1
                     redshift_prow = create_and_submit(redshift_prow, queue=queue, reservation=reservation, joint=True, dry_run=dry_run,
                                                       strictly_successful=strictly_successful, check_for_outputs=check_for_outputs,
                                                       resubmit_partial_complete=resubmit_partial_complete, system_name=system_name)
                     ptable.add_row(redshift_prow)
-            else:
-                log.info(" ")
-                log.info(f"Submitting joint redshift fits of type {zsubtype} for TILEID {zprows[0]['TILEID']}.")
+            elif zsubtype == 'cumulative':
+                tileids = np.unique([prow['TILEID'] for prow in zprows])
+                if len(tileids) > 1:
+                    msg = f"Error, more than one tileid provided for cumulative redshift job: {tileids}"
+                    log.critical(msg)
+                    raise ValueError(msg)
+                nights = np.unique([prow['NIGHT'] for prow in zprows])
+                if len(nights) > 1:
+                    msg = f"Error, more than one night provided for cumulative redshift job: {nights}"
+                    log.critical(msg)
+                    raise ValueError(msg)
+                tileid, night = tileids[0], nights[0]
+                ## For cumulative redshifts, get any existing processing rows for tile
+                matched_prows = read_minimal_tilenight_proctab_cols(tileids=tileids)
+                matched_prows = matched_prows[matched_prows['NIGHT']<=night]
+                ## Identify the processing rows that should be assigned as dependencies
+                ## tnight should be first such that the new job inherits the other metadata from it
+                tnights = [tnight]
+                for prow in matched_prows:
+                    if prow['INTID'] != tnight['INTID']:
+                        tnights.append(prow)
+                log.info(f"Internal Processing IDs: {[prow['INTID'] for prow in tnights]}.\n")
+                ## Identify all exposures that should go into the fit
                 expids = [prow['EXPID'][0] for prow in zprows]
+                ## note we can actually 
get the full list of exposures, but for now
+                ## we'll stay consistent with old processing where we only list exposures
+                ## from the current night
+                ## For cumulative redshifts, get valid expids from exptables
+                #matched_erows = read_minimal_science_exptab_cols(tileids=tileids)
+                #matched_erows = matched_erows[matched_erows['NIGHT']<=night]
+                #expids = list(set([prow['EXPID'][0] for prow in zprows]) | set(matched_erows['EXPID']))
                 log.info(f"Expids: {expids}.\n")
-                redshift_prow = make_redshift_prow(zprows, tnight, descriptor=zsubtype, internal_id=internal_id)
-                internal_id += 1
+                redshift_prow, internal_id = make_joint_prow(tnights, descriptor=zsubtype, internal_id=internal_id)
+                redshift_prow['EXPID'] = expids
+                redshift_prow = create_and_submit(redshift_prow, queue=queue, reservation=reservation, joint=True, dry_run=dry_run,
+                                                  strictly_successful=strictly_successful, check_for_outputs=check_for_outputs,
+                                                  resubmit_partial_complete=resubmit_partial_complete, system_name=system_name)
+                ptable.add_row(redshift_prow)
+            else: # pernight
+                expids = [prow['EXPID'][0] for prow in zprows]
+                log.info(f"Expids: {expids}.\n")
+                redshift_prow = make_redshift_prow(zprows, tnight, descriptor=zsubtype, internal_id=internal_id)
+                internal_id += 1
                 redshift_prow = create_and_submit(redshift_prow, queue=queue, reservation=reservation, joint=True, dry_run=dry_run,
                                                   strictly_successful=strictly_successful, check_for_outputs=check_for_outputs,
                                                   resubmit_partial_complete=resubmit_partial_complete, system_name=system_name)
@@ -1700,6 +1737,7 @@ def make_joint_prow(prows, descriptor, internal_id):
         dict: Row of a processing table corresponding to the joint fit job.
         internal_id, int, the next internal id to be used for assignment (already incremented up from the last used id number used). 
""" + log = get_logger() first_row = table_row_to_dict(prows[0]) joint_prow = first_row.copy() @@ -1711,38 +1749,41 @@ def make_joint_prow(prows, descriptor, internal_id): joint_prow['SUBMIT_DATE'] = -99 joint_prow['STATUS'] = 'U' joint_prow['SCRIPTNAME'] = '' - joint_prow['EXPID'] = np.array([currow['EXPID'][0] for currow in prows], dtype=int) + joint_prow['EXPID'] = np.unique(np.concatenate([currow['EXPID'] for currow in prows])).astype(int) ## Assign the PROCCAMWORD based on the descriptor and the input exposures - if descriptor == 'stdstarfit': - pcamwords = [prow['PROCCAMWORD'] for prow in prows] + ## UPDATE 2024-04-24: badamps are now included in arc/flat joint fits, + ## so grab all PROCCAMWORDs instead of filtering out BADAMP cameras + ## For flats we want any camera that exists in all 12 exposures + ## For arcs we want any camera that exists in at least 3 exposures + pcamwords = [prow['PROCCAMWORD'] for prow in prows] + if descriptor in 'stdstarfit': joint_prow['PROCCAMWORD'] = camword_union(pcamwords, full_spectros_only=True) + elif descriptor in ['pernight', 'cumulative']: + joint_prow['PROCCAMWORD'] = camword_union(pcamwords, + full_spectros_only=False) + elif descriptor == 'nightlyflat': + joint_prow['PROCCAMWORD'] = camword_intersection(pcamwords, + full_spectros_only=False) + elif descriptor == 'psfnight': + ## Count number of exposures each camera is present for + camcheck = {} + for camword in pcamwords: + for cam in decode_camword(camword): + if cam in camcheck: + camcheck[cam] += 1 + else: + camcheck[cam] = 1 + ## if exists in 3 or more exposures, then include it + goodcams = [] + for cam,camcount in camcheck.items(): + if camcount >= 3: + goodcams.append(cam) + joint_prow['PROCCAMWORD'] = create_camword(goodcams) else: - ## UPDATE 2024-04-24: badamps are now included in arc/flat joint fits, - ## so grab all PROCCAMWORDs instead of filtering out BADAMP cameras - pcamwords = [prow['PROCCAMWORD'] for prow in prows] - - ## For flats we want any camera that exists in all 12 exposures - ## For arcs we want any camera that exists in at least 3 exposures - if descriptor == 'nightlyflat': - joint_prow['PROCCAMWORD'] = camword_intersection(pcamwords, - full_spectros_only=False) - elif descriptor == 'psfnight': - ## Count number of exposures each camera is present for - camcheck = {} - for camword in pcamwords: - for cam in decode_camword(camword): - if cam in camcheck: - camcheck[cam] += 1 - else: - camcheck[cam] = 1 - ## if exists in 3 or more exposures, then include it - goodcams = [] - for cam,camcount in camcheck.items(): - if camcount >= 3: - goodcams.append(cam) - joint_prow['PROCCAMWORD'] = create_camword(goodcams) + log.warning("Warning asked to produce joint proc table row for unknown" + + f" job description {descriptor}") joint_prow = assign_dependency(joint_prow, dependency=prows) return joint_prow, internal_id @@ -1787,11 +1828,11 @@ def make_tnight_prow(prows, calibjobs, internal_id): joint_prow['SCRIPTNAME'] = '' joint_prow['EXPID'] = np.array([currow['EXPID'][0] for currow in prows], dtype=int) - joint_prow = define_and_assign_dependency(joint_prow,calibjobs,use_tilenight=True) + joint_prow = define_and_assign_dependency(joint_prow, calibjobs, use_tilenight=True) return joint_prow -def make_redshift_prow(prows, tnight, descriptor, internal_id): +def make_redshift_prow(prows, tnights, descriptor, internal_id): """ Given an input list or array of processing table rows and a descriptor, this creates a joint fit processing job row. 
It starts by copying the first input row, overwrites relevant columns, and defines the new dependencies (based on the
@@ -1800,11 +1841,11 @@ def make_redshift_prow(prows, tnight, descriptor, internal_id):
     Args:
         prows, list or array of dicts. Unsubmitted rows corresponding to the individual prestdstar jobs that are the first steps of tilenight.
-        tnight, Table.Row object. Row corresponding to the tilenight job on which the redshift job depends.
+        tnights, list or array of Table.Row objects. Rows corresponding to the tilenight jobs on which the redshift job depends.
         internal_id, int, the next internal id to be used for assignment (already incremented up from the last used id number used).
 
     Returns:
-        dict: Row of a processing table corresponding to the tilenight job.
+        dict: Row of a processing table corresponding to the redshift job.
     """
     first_row = table_row_to_dict(prows[0])
     redshift_prow = first_row.copy()
@@ -1818,7 +1859,7 @@
     redshift_prow['SCRIPTNAME'] = ''
     redshift_prow['EXPID'] = np.array([currow['EXPID'][0] for currow in prows], dtype=int)
 
-    redshift_prow = assign_dependency(redshift_prow,dependency=tnight)
+    redshift_prow = assign_dependency(redshift_prow, dependency=tnights)
 
     return redshift_prow
 
diff --git a/py/desispec/workflow/redshifts.py b/py/desispec/workflow/redshifts.py
index 0bae1f731..87cdba43d 100644
--- a/py/desispec/workflow/redshifts.py
+++ b/py/desispec/workflow/redshifts.py
@@ -10,6 +10,7 @@
 import numpy as np
 from astropy.table import Table, vstack, Column
 
+from desispec.io import findfile
 from desispec.io.util import parse_cameras, decode_camword
 from desispec.workflow.desi_proc_funcs import determine_resources
 from desiutil.log import get_logger
@@ -21,7 +22,10 @@
 from desispec.workflow import batch
 from desispec.util import parse_int_args
 
-
+# processing table row cache for tilenight selection
+_tilenight_ptab_cache = None
+# exposure table row cache for science exposure selection
+_science_etab_cache = None
 
 def get_ztile_relpath(tileid,group,night=None,expid=None):
     """
@@ -360,13 +364,14 @@ def create_desi_zproc_batch_script(group,
 
     return scriptfile
 
-def read_minimal_exptables_columns(nights=None, tileids=None):
+def read_minimal_science_exptab_cols(nights=None, tileids=None, reset_cache=False):
     """
     Read exposure tables while handling evolving formats
 
     Args:
         nights (list of int): nights to include (default all nights found)
         tileids (list of int): tileids to include (default all tiles found)
+        reset_cache (bool): If true, global cache is cleared
 
     Returns exptable with just columns TILEID, NIGHT, EXPID, CAMWORD, BADCAMWORD,
         filtered by science
 
    Note: the returned table is the full pipeline exposures table. 
It is trimmed to science exposures that have LASTSTEP=='all'
     """
+    global _science_etab_cache
     log = get_logger()
+
+    ## If requested reset the science exposure table cache
+    if reset_cache:
+        reset_science_etab_cache()
+
+    ## If the cache exists, use it to speed up the search over tiles and nights
+    if _science_etab_cache is not None:
+        log.info(f'Using cached exposure table rows for science selection')
+        t = _science_etab_cache.copy()
+        if nights is not None:
+            t = t[np.isin(t['NIGHT'], nights)]
+        if tileids is not None:
+            t = t[np.isin(t['TILEID'], tileids)]
+        return t
+
+    ## If not cached, then find all the relevant exposure tables and load them
     if nights is None:
-        exptab_path = get_exposure_table_path(night=None)
-        monthglob = '202???'
-        globname = get_exposure_table_name(night='202?????')
-        etab_files = glob.glob(os.path.join(exptab_path, monthglob, globname))
+        etab_path = findfile('exptable', night='99999999', readonly=True)
+        etab_files = glob.glob(etab_path.replace('99999999', '202?????'))
     else:
         etab_files = list()
         for night in nights:
@@ -393,23 +413,23 @@
             # - these are expected for the daily run, ok
             log.debug(f"Exposure table missing for night {night}")
 
+    ## Load each relevant exposure table file, subselect valid science exposures and
+    ## append to the full set
     etab_files = sorted(etab_files)
     exptables = list()
     for etab_file in etab_files:
         ## correct way but slower and we don't need multivalue columns
         #t = load_table(etab_file, tabletype='etable')
         t = Table.read(etab_file, format='ascii.csv')
+
+        ## Subselect only valid science exposures
+        t = _select_sciences_from_etab(t)
+
         ## For backwards compatibility if BADCAMWORD column does not
         ## exist then add a blank one
         if 'BADCAMWORD' not in t.colnames:
             t.add_column(Table.Column(['' for i in range(len(t))], dtype='S36', name='BADCAMWORD'))
-        keep = (t['OBSTYPE'] == 'science') & (t['TILEID'] >= 0)
-        if 'LASTSTEP' in t.colnames:
-            keep &= (t['LASTSTEP'] == 'all')
-        if tileids is not None:
-            # Default false
-            keep &= np.isin(t['TILEID'], tileids)
-        t = t[keep]
+
         ## Need to ensure that the string columns are consistent
         for col in ['CAMWORD', 'BADCAMWORD']:
             ## Masked arrays need special handling
@@ -425,4 +445,207 @@
             t[col] = Table.Column(t[col], dtype='S36', name=col)
         exptables.append(t['TILEID', 'NIGHT', 'EXPID', 'CAMWORD', 'BADCAMWORD'])
 
-    return vstack(exptables)
+    outtable = vstack(exptables)
+
+    ## If we've loaded all nights, then cache the result
+    if nights is None:
+        log.info(f'Caching exposure table rows for science selection')
+        set_science_etab_cache(outtable.copy())
+
+    ## If requested specific tileids, then subselect that
+    if tileids is not None:
+        outtable = outtable[np.isin(outtable['TILEID'], tileids)]
+
+    return outtable
+
+def _select_sciences_from_etab(etab):
+    """
+    takes an exposure table or combination of exposure tables and subselects
+    valid science exposures. Those that pass selection are returned as a table. 
+    """
+    t = etab.copy()
+    t = t[((t['OBSTYPE'] == 'science') & (t['TILEID'] >= 0))]
+    if 'LASTSTEP' in t.colnames:
+        t = t[t['LASTSTEP'] == 'all']
+    return t
+
+def reset_science_etab_cache():
+    """
+    reset the global cache of science exposure tables stored in var _science_etab_cache
+    """
+    global _science_etab_cache
+    log = get_logger()
+    log.info(f'Resetting science exposure table row cache')
+    _science_etab_cache = None
+
+def set_science_etab_cache(etab):
+    """
+    sets the global cache of science exposure tables stored in var _science_etab_cache
+    """
+    global _science_etab_cache
+    log = get_logger()
+    log.info(f'Assigning science exposure table row cache to new table')
+    _science_etab_cache = _select_sciences_from_etab(etab)
+
+def update_science_etab_cache(etab):
+    """
+    updates the global cache of science exposure tables stored in var
+    _science_etab_cache.
+
+    Notes: this will remove all current entries for any night in the input
+    """
+    global _science_etab_cache
+    log = get_logger()
+    ## If the cache doesn't exist, don't update it.
+    if _science_etab_cache is None:
+        log.debug(f'Science exptab cache does not exist, so not updating')
+        return
+    cleaned_etab = _select_sciences_from_etab(etab)
+    new_nights = np.unique(cleaned_etab['NIGHT'])
+    log.info(f'Removing all current entries in science exposure ' +
+             f'table row cache for nights {new_nights}')
+    conflicting_entries = np.isin(_science_etab_cache['NIGHT'], new_nights)
+    log.info(f"Removing {np.sum(conflicting_entries)} rows and adding {len(cleaned_etab)} rows " +
+             f"to science exposure table row cache.")
+    keep = np.bitwise_not(conflicting_entries)
+    _science_etab_cache = _science_etab_cache[keep]
+    _science_etab_cache = vstack([_science_etab_cache, cleaned_etab])
+
+
+def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, reset_cache=False):
+    """
+    Read processing tables while handling evolving formats
+
+    Args:
+        nights (list of int): nights to include (default all nights found)
+        tileids (list of int): tileids to include (default all tiles found)
+        reset_cache (bool): If true, global cache is cleared
+
+    Returns exptable with just columns EXPID, TILEID, NIGHT, PROCCAMWORD,
+        INTID, LATEST_QID
+    """
+    global _tilenight_ptab_cache
+    log = get_logger()
+
+    ## If requested reset the tilenight processing table cache
+    if reset_cache:
+        reset_tilenight_ptab_cache()
+
+    ## If the cache exists, use it to speed up the search over tiles and nights
+    if _tilenight_ptab_cache is not None:
+        log.info(f'Using cached processing table rows for tilenight selection')
+        t = _tilenight_ptab_cache.copy()
+        if nights is not None:
+            t = t[np.isin(t['NIGHT'], nights)]
+        if tileids is not None:
+            t = t[np.isin(t['TILEID'], tileids)]
+        return t
+
+    ## If not cached, then find all the relevant processing tables and load them
+    if nights is None:
+        ptab_path = findfile('proctable', night='99999999', readonly=True)
+        ptab_files = glob.glob(ptab_path.replace('99999999', '202?????'))
+    else:
+        ptab_files = list()
+        for night in nights:
+            ptab_file = findfile('proctable', night=night)
+            if os.path.exists(ptab_file):
+                ptab_files.append(ptab_file)
+            elif night >= 20201201:
+                log.error(f"Processing table missing for night {night}")
+            else:
+                # - these are expected for the daily run, ok
+                log.debug(f"Processing table missing for night {night}")
+
+    ## Load each relevant processing table file, subselect valid tilenight jobs and
+    ## append to the full set
+    ptab_files = sorted(ptab_files)
+    ptables = list()
+    for ptab_file in ptab_files:
+        ## use the correct 
but slower load_table here, since proctables have multivalue columns
+        t = load_table(tablename=ptab_file, tabletype='proctable')
+        t = _select_tilenights_from_ptab(t)
+
+        ## Need to ensure that the string columns are consistent
+        for col in ['PROCCAMWORD']:
+            ## Masked arrays need special handling
+            ## else just reassign with consistent dtype
+            if isinstance(t[col], Table.MaskedColumn):
+                ## If completely empty it's loaded as type int
+                ## otherwise fill masked with ''
+                if t[col].dtype == int:
+                    t[col] = Table.Column(['' for i in range(len(t))], dtype='S36', name=col)
+                else:
+                    t[col] = Table.Column(t[col].filled(fill_value=''), dtype='S36', name=col)
+            else:
+                t[col] = Table.Column(t[col], dtype='S36', name=col)
+        ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD',
+                         'INTID', 'LATEST_QID'])
+
+    outtable = vstack(ptables)
+    ## If we've loaded all nights, then cache the result
+    if nights is None:
+        log.info(f'Caching processing table rows for tilenight selection')
+        set_tilenight_ptab_cache(outtable)
+
+    ## If requested specific tileids, then subselect that
+    if tileids is not None:
+        outtable = outtable[np.isin(outtable['TILEID'], tileids)]
+
+    return outtable
+
+def _select_tilenights_from_ptab(ptab):
+    """
+    takes a processing table or combination of processing tables and subselects
+    valid tilenight jobs. Those that pass selection are returned as a table.
+    """
+    t = ptab.copy()
+    t = t[((t['OBSTYPE'] == 'science') & (t['JOBDESC'] == 'tilenight'))]
+    if 'LASTSTEP' in t.colnames:
+        t = t[t['LASTSTEP'] == 'all']
+    return t
+
+def reset_tilenight_ptab_cache():
+    """
+    reset the global cache of tilenight processing tables stored in var _tilenight_ptab_cache
+    """
+    global _tilenight_ptab_cache
+    log = get_logger()
+    log.info(f'Resetting processing table row cache for tilenight selection')
+    _tilenight_ptab_cache = None
+
+def set_tilenight_ptab_cache(ptab):
+    """
+    sets the global cache of tilenight processing tables stored in var _tilenight_ptab_cache
+    """
+    global _tilenight_ptab_cache
+    log = get_logger()
+    log.info(f'Assigning processing table row cache for tilenight selection to new table')
+    _tilenight_ptab_cache = _select_tilenights_from_ptab(ptab)
+    _tilenight_ptab_cache.sort(['INTID'])
+
+def update_tilenight_ptab_cache(ptab):
+    """
+    updates the global cache of tilenight processing tables stored in var
+    _tilenight_ptab_cache.
+
+    Notes: this will remove all current entries for any night in the input
+    """
+    global _tilenight_ptab_cache
+    log = get_logger()
+    ## If the cache doesn't exist, don't update it. 
+    if _tilenight_ptab_cache is None:
+        log.debug(f'Tilenight proctab cache does not exist, so not updating')
+        return
+    cleaned_ptab = _select_tilenights_from_ptab(ptab)
+    new_nights = np.unique(cleaned_ptab['NIGHT'])
+    log.info(f'Removing all current entries in processing table tilenight ' +
+             f'selection cache for nights {new_nights}')
+    conflicting_entries = np.isin(_tilenight_ptab_cache['NIGHT'], new_nights)
+    log.info(f"Removing {np.sum(conflicting_entries)} rows and adding " +
+             f"{len(cleaned_ptab)} rows " +
+             f"to processing table tilenight cache.")
+    keep = np.bitwise_not(conflicting_entries)
+    _tilenight_ptab_cache = _tilenight_ptab_cache[keep]
+    _tilenight_ptab_cache = vstack([_tilenight_ptab_cache, cleaned_ptab])
+    _tilenight_ptab_cache.sort(['INTID'])
\ No newline at end of file
diff --git a/py/desispec/workflow/science_selection.py b/py/desispec/workflow/science_selection.py
index e16b39cc6..04a555796 100644
--- a/py/desispec/workflow/science_selection.py
+++ b/py/desispec/workflow/science_selection.py
@@ -17,7 +17,7 @@
 from desispec.scripts.tile_redshifts import generate_tile_redshift_scripts
 from desispec.workflow.redshifts import get_ztile_script_pathname, \
     get_ztile_relpath, \
-    get_ztile_script_suffix, read_minimal_exptables_columns
+    get_ztile_script_suffix, read_minimal_science_exptab_cols
 from desispec.workflow.queue import get_resubmission_states, update_from_queue, queue_info_from_qids
 from desispec.workflow.timing import what_night_is_it
 from desispec.workflow.desi_proc_funcs import get_desi_proc_batch_file_pathname, \
@@ -237,7 +237,7 @@ def get_tiles_cumulative(sci_etable, z_submit_types, all_cumulatives, night):
         tiles_cumulative = list(tiles_this_night)
         log.info(f'Submitting cumulative redshifts for all tiles: {tiles_cumulative}')
     else:
-        allexp = read_minimal_exptables_columns(tileids=tiles_this_night)
+        allexp = read_minimal_science_exptab_cols(tileids=tiles_this_night)
         for tileid in tiles_this_night:
             nights_with_tile = allexp['NIGHT'][allexp['TILEID'] == tileid]
             if len(nights_with_tile) > 0 and night == np.max(nights_with_tile):
diff --git a/py/desispec/workflow/tableio.py b/py/desispec/workflow/tableio.py
index 042dca835..89b6a0065 100644
--- a/py/desispec/workflow/tableio.py
+++ b/py/desispec/workflow/tableio.py
@@ -7,7 +7,8 @@
 import numpy as np
 from astropy.table import Table
-
+from desispec.workflow.redshifts import update_science_etab_cache, \
+    update_tilenight_ptab_cache
 ###################################################
 ################ Table Functions #################
 ###################################################
@@ -203,6 +204,12 @@ def write_table(origtable, tablename=None, tabletype=None, joinsymb='|', overwri
     if verbose:
         log.info("Written table: ", table.info)
 
+    if tabletype is not None:
+        if tabletype == 'exptable':
+            update_science_etab_cache(origtable)
+        elif tabletype == 'proctable':
+            update_tilenight_ptab_cache(origtable)
+
 def standardize_tabletype(tabletype):
     """
     Given the user defined type of table it returns the proper 'tabletype' expected by the pipeline

From 11f40b7dc342c5fe60066364c5bfabaa47eaff81 Mon Sep 17 00:00:00 2001
From: akremin
Date: Tue, 30 Jul 2024 15:59:56 -0700
Subject: [PATCH 02/27] add end-to-end cross-night zproc unit test

---
 py/desispec/scripts/proc_night.py             |  2 +-
 .../202309/exposure_table_20230913.csv        | 95 +++++++++++++++++++
 py/desispec/test/test_proc_night.py           | 39 +++++++-
 py/desispec/workflow/processing.py            |  9 +-
 py/desispec/workflow/redshifts.py             | 36 +++++--
 py/desispec/workflow/tableio.py               |  5 +-
 6 files 
changed, 168 insertions(+), 18 deletions(-) create mode 100644 py/desispec/test/data/exposure_tables/202309/exposure_table_20230913.csv diff --git a/py/desispec/scripts/proc_night.py b/py/desispec/scripts/proc_night.py index 1f1928f5e..1db9d22b0 100644 --- a/py/desispec/scripts/proc_night.py +++ b/py/desispec/scripts/proc_night.py @@ -340,7 +340,7 @@ def proc_night(night=None, proc_obstypes=None, z_submit_types=None, ## Pre-populate exposure table and processing table caches of all nights ## if doing cross-night redshifts - if 'cumulative' in z_submit_types: + if z_submit_types is not None and 'cumulative' in z_submit_types: ## this shouldn't need to change since we've already updated the exptab read_minimal_science_exptab_cols() ## this would become out of date for the current night except diff --git a/py/desispec/test/data/exposure_tables/202309/exposure_table_20230913.csv b/py/desispec/test/data/exposure_tables/202309/exposure_table_20230913.csv new file mode 100644 index 000000000..57644895a --- /dev/null +++ b/py/desispec/test/data/exposure_tables/202309/exposure_table_20230913.csv @@ -0,0 +1,95 @@ +EXPID,OBSTYPE,TILEID,LASTSTEP,CAMWORD,BADCAMWORD,BADAMPS,EXPTIME,EFFTIME_ETC,SURVEY,FA_SURV,FAPRGRM,GOALTIME,GOALTYPE,EBVFAC,AIRMASS,SPEED,TARGTRA,TARGTDEC,SEQNUM,SEQTOT,PROGRAM,PURPOSE,MJD-OBS,NIGHT,HEADERERR,EXPFLAG,COMMENTS +195972,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,1,25,calib zeros for nightly bias,main survey,60200.808129882,20230913,|,|,| +195973,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,2,25,calib zeros for nightly bias,main survey,60200.808906154,20230913,|,|,| +195974,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,3,25,calib zeros for nightly bias,main survey,60200.809692809,20230913,|,|,| +195975,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,4,25,calib zeros for nightly bias,main survey,60200.810469491,20230913,|,|,| +195976,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,5,25,calib zeros for nightly bias,main survey,60200.811248611,20230913,|,|,| +195977,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,6,25,calib zeros for nightly bias,main survey,60200.812042202,20230913,|,|,| +195978,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,7,25,calib zeros for nightly bias,main survey,60200.812818175,20230913,|,|,| +195979,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,8,25,calib zeros for nightly bias,main survey,60200.813606913,20230913,|,|,| +195980,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,9,25,calib zeros for nightly bias,main survey,60200.814401309,20230913,|,|,| +195981,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,10,25,calib zeros for nightly bias,main survey,60200.815184343,20230913,|,|,| +195982,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,11,25,calib zeros for nightly bias,main survey,60200.81595719,20230913,|,|,| 
+195983,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,12,25,calib zeros for nightly bias,main survey,60200.816743438,20230913,|,|,| +195984,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,13,25,calib zeros for nightly bias,main survey,60200.817533,20230913,|,|,| +195985,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,14,25,calib zeros for nightly bias,main survey,60200.818305598,20230913,|,|,| +195986,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,15,25,calib zeros for nightly bias,main survey,60200.819099186,20230913,|,|,| +195987,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,179.350056,-17.0,16,25,calib zeros for nightly bias,main survey,60200.819877863,20230913,|,|,| +195988,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.217392,-99.0,179.350056,-17.0,17,25,calib zeros for nightly bias,main survey,60200.820659605,20230913,|,|,| +195989,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,3.504877,-99.0,179.350056,-17.0,18,25,calib zeros for nightly bias,main survey,60200.821446053,20230913,|,|,| +195990,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427182,-99.0,179.350056,-17.0,19,25,calib zeros for nightly bias,main survey,60200.822226695,20230913,|,|,| +195991,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427197,-99.0,179.350056,-17.0,20,25,calib zeros for nightly bias,main survey,60200.823012683,20230913,|,|,| +195992,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427215,-99.0,179.350056,-17.0,21,25,calib zeros for nightly bias,main survey,60200.823794792,20230913,|,|,| +195993,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427291,-99.0,179.350056,-17.0,22,25,calib zeros for nightly bias,main survey,60200.824572969,20230913,|,|,| +195994,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427384,-99.0,179.350056,-17.0,23,25,calib zeros for nightly bias,main survey,60200.825361952,20230913,|,|,| +195995,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427397,-99.0,179.350056,-17.0,24,25,calib zeros for nightly bias,main survey,60200.826156065,20230913,|,|,| +195996,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427378,-99.0,179.350056,-17.0,25,25,calib zeros for nightly bias,main survey,60200.82693914,20230913,|,|,| +195998,dark,-99,all,a0123456789,,,300.0574,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,4.427361,-99.0,179.350056,-17.0,1,1,calib dark 5min,main survey,60200.827721146,20230913,|,|,| +196003,arc,-99,all,a0123456789,,,5.0095,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,5,calib short arcs all,main survey,60201.035380872,20230913,|,|,| +196004,arc,-99,all,a0123456789,,,5.0098,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,2,5,calib short arcs all,main survey,60201.036218171,20230913,|,|,| +196005,arc,-99,all,a0123456789,,,5.0093,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,5,calib short arcs all,main survey,60201.037055921,20230913,|,|,| 
+196006,arc,-99,all,a0123456789,,,5.0092,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,4,5,calib short arcs all,main survey,60201.037902204,20230913,|,|,| +196007,arc,-99,all,a0123456789,,,5.0095,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,5,5,calib short arcs all,main survey,60201.038737319,20230913,|,|,| +196010,arc,-99,all,a0123456789,,,30.0562,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,5,calib long arcs cd+xe,main survey,60201.040510029,20230913,|,|,| +196011,arc,-99,all,a0123456789,,,30.0567,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,2,5,calib long arcs cd+xe,main survey,60201.04163265,20230913,|,|,| +196012,arc,-99,all,a0123456789,,,30.0577,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,5,calib long arcs cd+xe,main survey,60201.042759972,20230913,|,|,| +196013,arc,-99,all,a0123456789,,,30.0569,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,4,5,calib long arcs cd+xe,main survey,60201.043882699,20230913,|,|,| +196014,arc,-99,all,a0123456789,,,30.0584,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.52119,-99.0,173.483903,31.963305,5,5,calib long arcs cd+xe,main survey,60201.045010477,20230913,|,|,| +196019,flat,-99,all,a0123456789,,,120.0271,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,3,calib desi-calib-00 leds only,main survey,60201.047447777,20230913,|,|,| +196020,flat,-99,all,a0123456789,,,120.0256,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,2,3,calib desi-calib-00 leds only,main survey,60201.049617614,20230913,|,|,| +196021,flat,-99,all,a0123456789,,,120.0284,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,3,calib desi-calib-00 leds only,main survey,60201.051777903,20230913,|,|,| +196024,flat,-99,all,a0123456789,,,120.026,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,3,calib desi-calib-01 leds only,main survey,60201.054569618,20230913,|,|,| +196025,flat,-99,all,a0123456789,,,120.0263,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,2,3,calib desi-calib-01 leds only,main survey,60201.056733695,20230913,|,|,| +196026,flat,-99,all,a0123456789,,,120.0273,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,3,calib desi-calib-01 leds only,main survey,60201.058898244,20230913,|,|,| +196029,flat,-99,all,a0123456789,,,120.0273,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,3,calib desi-calib-02 leds only,main survey,60201.061722526,20230913,|,|,| +196030,flat,-99,all,a0123456789,,,120.0232,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.52119,-99.0,173.483903,31.963305,2,3,calib desi-calib-02 leds only,main survey,60201.063890883,20230913,|,|,| +196031,flat,-99,all,a0123456789,,,120.0268,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,3,calib desi-calib-02 leds only,main survey,60201.06605901,20230913,|,|,| +196034,flat,-99,all,a0123456789,,,120.0254,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,3,calib desi-calib-03 leds only,main survey,60201.068883331,20230913,|,|,| +196035,flat,-99,all,a0123456789,,,120.0256,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,2,3,calib 
desi-calib-03 leds only,main survey,60201.071047905,20230913,|,|,| +196036,flat,-99,all,a0123456789,,,120.0242,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,3,3,calib desi-calib-03 leds only,main survey,60201.07321184,20230913,|,|,| +196037,flat,-99,all,a0123456789,,,1.002,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.521189,-99.0,173.483903,31.963305,1,1,led03 flat for cte check,main survey,60201.075376003,20230913,SEQTOT:->1|,metadata_missing|,| +196045,science,22287,all,a0123456789,,,273.1084,214.148911,main,main,bright,180.0,bright,1.02207370122672,1.223074,1.165165251593235,243.625721,41.69028,1,1,bright,main survey,60201.125351612,20230913,|,|,| +196046,science,7668,all,a0123456789,,,993.2377,1038.237427,main,main,dark,1000.0,dark,1.12896467978169,1.099196,1.572119985902638,261.433021,24.79212,1,1,dark,main survey,60201.13040784,20230913,|,|,| +196047,science,9708,all,a0123456789,,,1318.2757,1028.784058,main,main,dark,1000.0,dark,1.10338294694736,1.100976,1.1242988206742046,264.576579,27.7876,1,1,dark,main survey,60201.143115096,20230913,|,|,| +196048,science,7567,all,a0123456789,,,584.6919,110.997185,main,main,dark,1000.0,dark,1.17479352617935,1.239758,0.3816344625013154,308.97465,-3.09878,1,1,dark,main survey,60201.160820184,20230913,|,|,| +196049,science,40156,all,a0123456789,,,561.5724,15.750249,main,main,backup,60.0,backup,1.49633502183324,1.021889,0.06522226369509196,306.076542,20.45885,1,1,backup,main survey,60201.170580523,20230913,|,|,| +196050,science,40203,all,a0123456789,,,317.7902,71.675003,main,main,backup,60.0,backup,1.2817482313747,1.020025,0.3836203183080122,320.574908,26.41986,1,1,backup,main survey,60201.188479989,20230913,|,|,| +196051,science,40211,all,a0123456789,,,206.8821,78.008102,main,main,backup,60.0,backup,1.16832223329501,1.088465,0.5969904701117856,312.19615,8.70255,1,1,backup,main survey,60201.193868769,20230913,|,|,| +196052,science,42039,all,a0123456789,,,216.1297,77.466599,main,main,backup,60.0,backup,1.21884644057295,1.118092,0.6473419041078877,317.918921,5.86084,1,1,backup,main survey,60201.197564982,20230913,|,|,| +196053,science,9675,all,a0123456789,,,1515.2998,1026.115356,main,main,dark,1000.0,dark,1.11020146486029,1.226639,1.1933162596140143,322.319292,-2.54256,1,1,dark,main survey,60201.201468867,20230913,|,|,| +196054,science,9653,all,a0123456789,,,1153.4999,411.587982,main,main,dark,1000.0,dark,1.06765254870705,1.273025,0.6205397955644159,326.91395,-5.83964,1,1,dark,main survey,60201.220256053,20230913,|,|,| +196055,science,9653,all,a0123456789,,,1241.9995,602.435547,main,main,dark,1000.0,dark,1.06765254870705,1.265193,0.8344943809384916,326.91395,-5.83964,1,1,dark,main survey,60201.23444903,20230913,|,|,| +196056,science,6571,all,a0123456789,,,1464.1436,309.545837,main,main,dark,1000.0,dark,1.10588210768744,1.328267,0.4249211787818334,335.579521,-9.05382,1,1,dark,main survey,60201.250167218,20230913,|,|,| +196057,science,6571,all,a0123456789,,,1057.8747,284.82724,main,main,dark,1000.0,dark,1.10588210768744,1.325601,0.5392465736115808,335.579521,-9.05382,1,1,dark,main survey,60201.26795283,20230913,|,|,| +196058,science,6571,all,a0123456789,,,1090.9948,426.343719,main,main,dark,1000.0,dark,1.10588210768744,1.335487,0.7929106757259986,335.579521,-9.05382,1,1,dark,main survey,60201.281038287,20230913,|,|,| +196059,science,7784,all,a0123456789,,,1585.4423,1030.913696,main,main,dark,1000.0,dark,1.08463116076279,1.268702,1.1601532626891766,348.267742,-6.05568,1,1,dark,main 
survey,60201.295164878,20230913,|,|,| +196060,science,1532,all,a0123456789,,,1377.1362,1027.701782,main,main,dark,1000.0,dark,1.06655282703018,1.295326,1.3351140077456667,351.830992,-7.44723,1,1,dark,main survey,60201.314740243,20230913,|,|,| +196061,science,4686,all,a0123456789,,,550.5909,369.394928,main,main,dark,1000.0,dark,1.07475944177841,1.202082,1.0694712190638314,1.454008,-1.77624,1,1,dark,main survey,60201.33216591,20230913,|,|,| +196062,science,9357,all,a0123456789,,,1181.929,1036.699829,main,main,dark,1000.0,dark,1.08313204043255,1.230533,1.4794069611609242,357.726492,-3.2498,1,1,dark,main survey,60201.339775,20230913,|,|,| +196063,science,7427,all,a0123456789,,,1057.3861,1039.174438,main,main,dark,1000.0,dark,1.06227824929815,1.212675,1.5541186755164602,6.266192,-2.40946,1,1,dark,main survey,60201.354781134,20230913,|,|,| +196064,science,5178,all,a0123456789,,,1040.6573,1042.881836,main,main,dark,1000.0,dark,1.11207696143075,1.096866,1.4570210003544364,6.426508,8.76649,1,1,dark,main survey,60201.36852137,20230913,|,|,| +196065,science,2006,all,a0123456789,,,1007.3224,1043.452026,main,main,dark,1000.0,dark,1.15407900790316,1.064437,1.5389872154574937,12.393708,12.8374,1,1,dark,main survey,60201.381833522,20230913,|,|,| +196066,science,8297,all,a0123456789,,,902.9169,1044.519409,main,main,dark,1000.0,dark,1.10503018070504,1.063582,1.573498106651222,15.6925,13.40049,1,1,dark,main survey,60201.394682898,20230913,|,|,| +196067,science,8250,all,a0123456789,,,838.6014,1062.376953,main,main,dark,1000.0,dark,1.07372324038185,1.067938,1.6385637188342717,19.142392,12.9984,1,1,dark,main survey,60201.406336121,20230913,|,|,| +196068,science,2010,all,a0123456789,,,912.5388,1045.723267,main,main,dark,1000.0,dark,1.13876220131243,1.079696,1.6994528900066272,22.544271,11.47607,1,1,dark,main survey,60201.41721519,20230913,|,|,| +196069,science,2012,all,a0123456789,,,894.9935,1048.256958,main,main,dark,1000.0,dark,1.15405100905561,1.08539,1.8004168162472984,25.917121,11.03695,1,1,dark,main survey,60201.429001954,20230913,|,|,| +196070,science,7976,all,a0123456789,,,879.9489,1047.328613,main,main,dark,1000.0,dark,1.10480292501459,1.138147,1.821974410880971,28.293708,5.66763,1,1,dark,main survey,60201.440615547,20230913,|,|,| +196071,science,9113,all,a0123456789,,,274.3794,182.60141,main,main,dark,1000.0,dark,1.05823873483025,1.51258,1.537542225358108,34.091179,-15.68103,1,1,dark,main survey,60201.452492859,20230913,|,|,| +196072,science,1684,all,a0123456789,,,885.5378,1043.556885,main,main,dark,1000.0,dark,1.04996835540577,1.391675,2.3166084648643017,34.77705,-10.80484,1,1,dark,main survey,60201.456935414,20230913,|,|,| +196073,science,4810,all,a0123456789,,,902.6006,1043.129395,main,main,dark,1000.0,dark,1.05643341960371,1.318582,2.0927354386207546,38.679279,-7.24039,1,1,dark,main survey,60201.468408981,20230913,|,|,| +196074,science,9479,all,a0123456789,,,909.684,1039.709717,main,main,dark,1000.0,dark,1.06693258459948,1.236514,1.8864435796441235,41.101179,-1.81056,1,1,dark,main survey,60201.480107356,20230913,|,|,| +196075,science,9505,all,a0123456789,,,926.5479,922.025818,main,main,dark,1000.0,dark,1.14099337034299,1.251384,1.9181165573626417,44.3799,-2.44682,1,1,dark,main survey,60201.491866222,20230913,|,|,| +196076,science,23818,all,a0123456789,,,232.9723,116.082283,main,main,bright,180.0,bright,1.06110852448436,1.400322,1.011300230925507,42.007042,-7.73316,1,1,bright,main survey,60201.503871194,20230913,|,|,| 
+196077,science,23826,all,a0123456789,,,526.9221,112.438942,main,main,bright,180.0,bright,1.14264458409689,1.321973,0.45407980947773674,47.3472,-5.54103,1,1,bright,main survey,60201.507839155,20230913,|,|,| +196086,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,1,10,zeros for morning darks,main survey,60201.519140506,20230913,|,|,| +196087,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,2,10,zeros for morning darks,main survey,60201.519929937,20230913,|,|,| +196088,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,3,10,zeros for morning darks,main survey,60201.520707462,20230913,|,|,| +196089,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,4,10,zeros for morning darks,main survey,60201.521480444,20230913,|,|,| +196090,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,5,10,zeros for morning darks,main survey,60201.522253294,20230913,|,|,| +196091,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,6,10,zeros for morning darks,main survey,60201.523026907,20230913,|,|,| +196092,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,7,10,zeros for morning darks,main survey,60201.523806704,20230913,|,|,| +196093,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,8,10,zeros for morning darks,main survey,60201.524586461,20230913,|,|,| +196094,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,9,10,zeros for morning darks,main survey,60201.525367783,20230913,|,|,| +196095,zero,-99,all,a0123456789,,,0.0,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,10,10,zeros for morning darks,main survey,60201.52614057,20230913,|,|,| +196096,dark,-99,all,a0123456789,,,1200.0632,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,1,2,morning darks,main survey,60201.526913478,20230913,|,|,| +196097,dark,-99,all,a0123456789,,,1200.0611,-99.0,unknown,unknown,unknown,-99.0,unknown,1.0,1.0,-99.0,67.736611,31.9633,2,2,morning darks,main survey,60201.541584593,20230913,|,|,| diff --git a/py/desispec/test/test_proc_night.py b/py/desispec/test/test_proc_night.py index 1ec6a638e..084602bf0 100644 --- a/py/desispec/test/test_proc_night.py +++ b/py/desispec/test/test_proc_night.py @@ -33,7 +33,9 @@ class TestProcNight(unittest.TestCase): @classmethod def setUpClass(cls): + cls.prenight = 20230913 cls.night = 20230914 + cls.repeat_tiles = [7567, 23826] cls.dailynight = _dailynight cls.basicnight = 20211129 #- early data without 1s CTE flat or end-of-night zeros/darks @@ -64,6 +66,7 @@ def setUpClass(cls): cls.override_file = findfile('override', cls.night) # these are created in function def tearDown(self): + desispec.workflow.redshifts.reset_tilenight_ptab_cache() # remove everything from prod except exposure_table for self.night for path in glob.glob(self.proddir+'/*'): if os.path.basename(path) == 'exposure_tables': @@ -155,8 +158,41 @@ def test_proc_night_noz(self): proctiles = proctable['TILEID'][proctable['OBSTYPE'] == 'science'] self.assertEqual(len(np.unique(proctiles)), ntiles) + def test_proc_night_cross_night_redshifts(self): + """Test if crossnight redshifts 
are submitted properly.""" + proctable1, unproctable1 = proc_night(self.prenight, sub_wait_time=0.0, dry_run_level=1) + desispec.workflow.redshifts.reset_science_etab_cache() + desispec.workflow.redshifts.reset_tilenight_ptab_cache() + proctable2, unproctable2 = proc_night(self.night, sub_wait_time=0.0, + dry_run_level=1, z_submit_types=['cumulative']) + + ## Test that cumulative redshift has dependency on previous night's job + ## as well as the tilenight job from the second night + for tileid in self.repeat_tiles: + tilematches1 = proctable1[proctable1['TILEID'] == tileid] + tilenight1 = tilematches1[tilematches1['JOBDESC']=='tilenight'][0] + tilematches2 = proctable2[proctable2['TILEID'] == tileid] + tilenight2 = tilematches2[tilematches2['JOBDESC']=='tilenight'][0] + cumulative2 = tilematches2[tilematches2['JOBDESC'] == 'cumulative'][0] + + self.assertTrue(len(cumulative2['INT_DEP_IDS']) == 2) + self.assertTrue(tilenight1['INTID'] in cumulative2['INT_DEP_IDS']) + self.assertTrue(tilenight2['INTID'] in cumulative2['INT_DEP_IDS']) + + scriptpath = get_ztile_script_pathname(tileid, group='cumulative', + night=self.night) + with open(scriptpath, 'r') as fil: + for line in fil.readlines(): + if 'desi_zproc' in line: + self.assertTrue(str(self.prenight) in line) + self.assertTrue(str(tilenight1['EXPID'][0]) in line) + self.assertTrue(str(self.night) in line) + self.assertTrue(str(tilenight2['EXPID'][0]) in line) + def _override_write_run_delete(self, override_dict, night=None, **kwargs): """Write override, run proc_night, remove override file, and return outputs""" + desispec.workflow.redshifts.reset_tilenight_ptab_cache() + if night is None: night = self.night @@ -204,7 +240,6 @@ def test_proc_night_linking_and_ccdcalib(self): for job in ['nightlybias', 'psfnight']: self.assertTrue(job not in proctable['JOBDESC']) - ## Test link fiberflatnight testdict = base_override_dict.copy() testdict['calibration']['linkcal']['include'] = 'fiberflatnight' @@ -392,6 +427,7 @@ def test_proc_night_override_flag_setting(self): if 'desi_proc_joint_fit' in line: self.assertFalse(flag in line) + @unittest.skipIf('SKIP_PROC_NIGHT_DAILY_TEST' in os.environ, 'Skipping test_proc_night_daily because $SKIP_PROC_NIGHT_DAILY_TEST is set') @unittest.skipUnless(os.path.isdir(_real_rawnight_dir), f'{_real_rawnight_dir} not available') def test_proc_night_daily(self): @@ -404,6 +440,7 @@ def test_proc_night_daily(self): while True: num_newlinks = link_rawdata(self.real_rawnight_dir, self.test_rawnight_dir, numexp=10) desispec.workflow.redshifts.reset_science_etab_cache() + desispec.workflow.redshifts.reset_tilenight_ptab_cache() if num_newlinks == 0: break else: diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py index bbd302f35..93d8cad69 100644 --- a/py/desispec/workflow/processing.py +++ b/py/desispec/workflow/processing.py @@ -1572,13 +1572,14 @@ def submit_redshifts(ptable, prows, tnight, internal_id, queue, reservation, tileid, night = tileids[0], nights[0] ## For cumulative redshifts, get any existing processing rows for tile matched_prows = read_minimal_tilenight_proctab_cols(tileids=tileids) - matched_prows = matched_prows[matched_prows['NIGHT']<=night] ## Identify the processing rows that should be assigned as dependecies ## tnight should be first such that the new job inherits the other metadata from it tnights = [tnight] - for prow in matched_prows: - if matched_prows['INTID'] != tnight['INTID']: - tnights.append(prow) + if matched_prows is not None: + matched_prows = 
matched_prows[matched_prows['NIGHT'] <= night]
+        for prow in matched_prows:
+            if prow['INTID'] != tnight['INTID']:
+                tnights.append(prow)
     log.info(f"Internal Processing IDs: {[prow['INTID'] for prow in tnights]}.\n")
     ## Identify all exposures that should go into the fit
     expids = [prow['EXPID'][0] for prow in zprows]
diff --git a/py/desispec/workflow/redshifts.py b/py/desispec/workflow/redshifts.py
index 87cdba43d..cfa2ea25a 100644
--- a/py/desispec/workflow/redshifts.py
+++ b/py/desispec/workflow/redshifts.py
@@ -400,7 +400,8 @@ def read_minimal_science_exptab_cols(nights=None, tileids=None, reset_cache=Fals
     ## If not cached, then find all the relevant exposure tables and load them
     if nights is None:
         etab_path = findfile('exptable', night='99999999', readonly=True)
-        etab_files = glob.glob(etab_path.replace('99999999', '202?????'))
+        glob_path = etab_path.replace('99999999', '202?????').replace('999999', '202???')
+        etab_files = glob.glob(glob_path)
     else:
         etab_files = list()
         for night in nights:
@@ -417,6 +418,7 @@ def read_minimal_science_exptab_cols(nights=None, tileids=None, reset_cache=Fals
     ## append to the full set
     etab_files = sorted(etab_files)
     exptables = list()
+
     for etab_file in etab_files:
         ## correct way but slower and we don't need multivalue columns
         #t = load_table(etab_file, tabletype='etable')
@@ -450,7 +452,7 @@ def read_minimal_science_exptab_cols(nights=None, tileids=None, reset_cache=Fals
     ## If we've loaded all nights, then cache the result
     if nights is None:
         log.info(f'Caching exposure table rows for science selection')
-        set_science_etab_cache(outtable.copy())
+        _set_science_etab_cache(outtable.copy())
 
     ## If requeted specific tileids, then subselect that
@@ -467,6 +469,7 @@ def _select_sciences_from_etab(etab):
     t = t[((t['OBSTYPE'] == 'science') & (t['TILEID'] >= 0))]
     if 'LASTSTEP' in t.colnames:
         t = t[t['LASTSTEP'] == 'all']
+
     return t
 
 def reset_science_etab_cache():
@@ -478,14 +481,18 @@ def reset_science_etab_cache():
     log.info(f'Resetting science exposure table row cache')
     _science_etab_cache = None
 
-def set_science_etab_cache(etab):
+def _set_science_etab_cache(etab):
     """
     sets the global cache of science exposure tables stored in var _science_etab_cache
     """
     global _science_etab_cache
     log = get_logger()
     log.info(f'Assigning science exposure table row cache to new table')
-    _science_etab_cache = _select_sciences_from_etab(etab)
+    if 'OBSTYPE' in etab.colnames:
+        _science_etab_cache = _select_sciences_from_etab(etab)
+    else:
+        _science_etab_cache = etab
+    _science_etab_cache.sort(['EXPID'])
 
 def update_science_etab_cache(etab):
     """
@@ -521,8 +528,9 @@ def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, reset_cache=F
         tileids (list of int): tileids to include (default all tiles found)
         reset_cache (bool): If true, global cache is cleared
 
-    Returns exptable with just columns EXPID, TILEID, NIGHT, PROCCAMWORD,
-    INTID, LATEST_QID
+    Returns None if no processing tables exist, or an exptable with columns
+    EXPID, TILEID, NIGHT, PROCCAMWORD, INTID, LATEST_QID and rows matching
+    the input selection criteria
     """
     global _tilenight_ptab_cache
     log = get_logger()
@@ -582,11 +590,16 @@ def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, reset_cache=F
         ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD',
                          'INTID', 'LATEST_QID'])
 
-    outtable = vstack(ptables)
+    if len(ptables) > 0:
+        outtable = vstack(ptables)
+    else:
+        log.info(f"No processing tables found. Returning None.")
+        return None
+
     ## If we've loaded all nights, then cache the result
     if nights is None:
         log.info(f'Caching processing table rows for tilenight selection')
-        set_tilenight_ptab_cache(outtable)
+        _set_tilenight_ptab_cache(outtable)
 
     ## If requested specific tileids, then subselect that
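[The hunk above changes read_minimal_tilenight_proctab_cols from always returning a table to returning None when no processing tables exist, which is why submit_redshifts now guards matched_prows. A minimal sketch of the caller contract, illustrative only and not part of the patch; as of this patch the function lives in desispec.workflow.redshifts, and a later patch in this series moves it to desispec.workflow.proctable:

    from desispec.workflow.redshifts import read_minimal_tilenight_proctab_cols

    # tile 23826 is one of the repeat tiles exercised by the unit tests
    ptab = read_minimal_tilenight_proctab_cols(tileids=[23826])
    if ptab is None:
        # no processing tables on disk yet, e.g. the first night of a production
        prior_intids = []
    else:
        prior_intids = list(ptab['INTID'])
]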
Returning None.") + return None + ## If we've loaded all nights, then cache the result if nights is None: log.info(f'Caching processing table rows for tilenight selection') - set_tilenight_ptab_cache(outtable) + _set_tilenight_ptab_cache(outtable) ## If requested specific tileids, then subselect that if tileids is not None: @@ -614,14 +627,17 @@ def reset_tilenight_ptab_cache(): log.info(f'Resetting processing table row cache for tilenight selection') _tilenight_ptab_cache = None -def set_tilenight_ptab_cache(ptab): +def _set_tilenight_ptab_cache(ptab): """ sets the global cache of tilenight processing tables stored in var _tilenight_ptab_cache """ global _tilenight_ptab_cache log = get_logger() log.info(f'Asigning processing table row cache for tilenight selection to new table') - _tilenight_ptab_cache = _select_tilenights_from_ptab(ptab) + if 'OBSTYPE' in ptab.colnames: + _tilenight_ptab_cache = _select_tilenights_from_ptab(ptab) + else: + _tilenight_ptab_cache = ptab _tilenight_ptab_cache.sort(['INTID']) def update_tilenight_ptab_cache(ptab): diff --git a/py/desispec/workflow/tableio.py b/py/desispec/workflow/tableio.py index 89b6a0065..872f4b266 100644 --- a/py/desispec/workflow/tableio.py +++ b/py/desispec/workflow/tableio.py @@ -7,8 +7,6 @@ import numpy as np from astropy.table import Table -from desispec.workflow.redshifts import update_science_etab_cache, \ - update_tilenight_ptab_cache ################################################### ################ Table Functions ################# ################################################### @@ -138,6 +136,9 @@ def write_table(origtable, tablename=None, tabletype=None, joinsymb='|', overwri If True and tablename not specified and tabletype is exposure table, this looks for the table in the SPECPROD rather than the exptab repository. Default is True. 
""" + ## avoid circular import by importing only in this function that uses it + from desispec.workflow.redshifts import update_science_etab_cache, \ + update_tilenight_ptab_cache log = get_logger() if tablename is None and tabletype is None: log.error("Pathname or type of table is required to save the table") From 9a36693212d761bae01beac2ea9617d4a09363c1 Mon Sep 17 00:00:00 2001 From: akremin Date: Fri, 2 Aug 2024 16:42:17 -0700 Subject: [PATCH 03/27] add cross-night zproc resubmissions and unit test --- bin/desi_resubmit_queue_failures | 22 +- py/desispec/scripts/daily_processing.py | 6 +- py/desispec/scripts/proc_night.py | 22 +- py/desispec/scripts/submit_night.py | 4 +- py/desispec/scripts/tile_redshifts.py | 4 +- py/desispec/scripts/tile_redshifts_bash.py | 4 +- py/desispec/scripts/zproc.py | 6 +- py/desispec/scripts/zprocdashboard.py | 3 +- py/desispec/test/test_proc_night.py | 68 +++- py/desispec/test/test_workflow_queue.py | 16 +- py/desispec/workflow/exptable.py | 173 +++++++++- py/desispec/workflow/processing.py | 93 ++++-- py/desispec/workflow/proctable.py | 288 +++++++++++++++++ py/desispec/workflow/queue.py | 33 +- py/desispec/workflow/redshifts.py | 357 ++------------------- py/desispec/workflow/science_selection.py | 3 +- py/desispec/workflow/tableio.py | 89 ++--- 17 files changed, 724 insertions(+), 467 deletions(-) diff --git a/bin/desi_resubmit_queue_failures b/bin/desi_resubmit_queue_failures index f69b6fd4b..92a2b7f47 100755 --- a/bin/desi_resubmit_queue_failures +++ b/bin/desi_resubmit_queue_failures @@ -11,9 +11,10 @@ from astropy.table import Table import glob ## Import some helper functions, you can see their definitions by uncomenting the bash shell command +from desiutil.log import get_logger from desispec.workflow.tableio import load_table, write_table from desispec.workflow.proctable import get_processing_table_pathname -from desispec.workflow.processing import update_and_recurvsively_submit +from desispec.workflow.processing import update_and_recursively_submit from desispec.workflow.queue import get_resubmission_states def parse_args(): # options=None): @@ -49,6 +50,7 @@ def parse_args(): # options=None): if __name__ == '__main__': args = parse_args() + log = get_logger() ptable_pathname = args.proc_table_pathname if ptable_pathname is None: if args.night is None: @@ -66,21 +68,25 @@ if __name__ == '__main__': if not args.dont_resub_failed: resub_states.append('FAILED') - print(f"Resubmitting the following Slurm states: {resub_states}") + log.info(f"Resubmitting the following Slurm states: {resub_states}") + + if args.dry_run > 0 and args.dry_run < 3: + log.warning(f"{args.dry_run=} will be run with limited simulation " + f"because we don't want to write out incorrect queue information.") ## Combine the table names and types for easier passing to io functions table_type = 'proctable' ## Load in the files defined above ptable = load_table(tablename=ptable_pathname, tabletype=table_type) - print(f"Identified ptable with {len(ptable)} entries.") - ptable, nsubmits = update_and_recurvsively_submit(ptable, submits=0, - resubmission_states=resub_states, - ptab_name=ptable_pathname, dry_run=args.dry_run, - reservation=args.reservation) + log.info(f"Identified ptable with {len(ptable)} entries.") + ptable, nsubmits = update_and_recursively_submit(ptable, submits=0, + resubmission_states=resub_states, + ptab_name=ptable_pathname, dry_run=args.dry_run, + reservation=args.reservation) if not args.dry_run: write_table(ptable, tablename=ptable_pathname) - print("Completed 
all necessary queue resubmissions from processing " + log.info("Completed all necessary queue resubmissions from processing " + f"table: {ptable_pathname}") diff --git a/py/desispec/scripts/daily_processing.py b/py/desispec/scripts/daily_processing.py index 27a63ce3a..532aad91e 100644 --- a/py/desispec/scripts/daily_processing.py +++ b/py/desispec/scripts/daily_processing.py @@ -23,7 +23,7 @@ erow_to_prow, default_prow from desispec.workflow.processing import parse_previous_tables, flat_joint_fit, arc_joint_fit, get_type_and_tile, \ science_joint_fit, define_and_assign_dependency, create_and_submit, \ - update_and_recurvsively_submit, checkfor_and_submit_joint_job, \ + update_and_recursively_submit, checkfor_and_submit_joint_job, \ submit_tilenight_and_redshifts from desispec.workflow.queue import update_from_queue, any_jobs_not_complete from desispec.io.util import difference_camwords, parse_badamps, validate_badamps @@ -445,7 +445,7 @@ def daily_processing_manager(specprod=None, exp_table_path=None, proc_table_path if len(ptable) > 0: ptable = update_from_queue(ptable, dry_run=dry_run_level) - # ptable, nsubmits = update_and_recurvsively_submit(ptable, + # ptable, nsubmits = update_and_recursively_submit(ptable, # ptab_name=proc_table_pathname, dry_run=dry_run_level) ## Exposure table doesn't change in the interim, so no need to re-write it to disk @@ -500,7 +500,7 @@ def daily_processing_manager(specprod=None, exp_table_path=None, proc_table_path # ii,nsubmits = 0, 0 # while ii < 4 and any_jobs_not_complete(ptable['STATUS']): # print(f"Starting iteration {ii} of queue updating and resubmissions of failures.") - # ptable, nsubmits = update_and_recurvsively_submit(ptable, submits=nsubmits, + # ptable, nsubmits = update_and_recursively_submit(ptable, submits=nsubmits, # ptab_name=proc_table_pathname, dry_run=dry_run_level) # if dry_run_level < 3: # write_table(ptable, tablename=proc_table_pathname) diff --git a/py/desispec/scripts/proc_night.py b/py/desispec/scripts/proc_night.py index 1db9d22b0..9a54ca3f3 100644 --- a/py/desispec/scripts/proc_night.py +++ b/py/desispec/scripts/proc_night.py @@ -7,8 +7,6 @@ from desispec.scripts.link_calibnight import derive_include_exclude from desispec.workflow.calibration_selection import \ determine_calibrations_to_proc -from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ - read_minimal_tilenight_proctab_cols from desispec.workflow.science_selection import determine_science_to_proc, \ get_tiles_cumulative from desiutil.log import get_logger @@ -26,9 +24,10 @@ from desispec.workflow.utils import sleep_and_report, \ verify_variable_with_environment, load_override_file from desispec.workflow.timing import what_night_is_it, during_operating_hours -from desispec.workflow.exptable import get_last_step_options +from desispec.workflow.exptable import get_last_step_options, \ + read_minimal_science_exptab_cols from desispec.workflow.proctable import default_obstypes_for_proctable, \ - erow_to_prow, default_prow + erow_to_prow, default_prow, read_minimal_tilenight_proctab_cols from desispec.workflow.processing import define_and_assign_dependency, \ create_and_submit, \ submit_tilenight_and_redshifts, \ @@ -36,7 +35,7 @@ night_to_starting_iid, make_joint_prow, \ set_calibrator_flag, make_exposure_prow, \ all_calibs_submitted, \ - update_and_recurvsively_submit, update_accounted_for_with_linking + update_and_recursively_submit, update_accounted_for_with_linking from desispec.workflow.queue import update_from_queue, any_jobs_failed from 
desispec.io.util import decode_camword, difference_camwords, \ create_camword, replace_prefix, erow_to_goodcamword, camword_union @@ -366,10 +365,10 @@ def proc_night(night=None, proc_obstypes=None, z_submit_types=None, if np.max([len(qids) for qids in ptable['ALL_QIDS']]) < 3: log.info("Job failures were detected. Resubmitting those jobs " + "before continuing with new submissions.") - ptable, nsubmits = update_and_recurvsively_submit(ptable, - ptab_name=proc_table_pathname, - dry_run=dry_run, - reservation=reservation) + ptable, nsubmits = update_and_recursively_submit(ptable, + ptab_name=proc_table_pathname, + dry_run=dry_run, + reservation=reservation) elif not ignore_proc_table_failures: err = "Some jobs have an incomplete job status. This script " \ + "will not fix them. You should remedy those first. " @@ -584,7 +583,10 @@ def create_submit_add_and_save(prow, proctable, check_outputs=check_for_outputs, unproc_table = None if len(ptable) > 0: ## All jobs now submitted, update information from job queue and save - ptable = update_from_queue(ptable, dry_run=dry_run_level) + ## But only if actually submitting or fully simulating, don't simulate + ## outputs that will be written to disk (levels 1 and 2) + if dry_run_level < 1 or dry_run_level > 2: + ptable = update_from_queue(ptable, dry_run=dry_run_level) if dry_run_level < 3: write_table(ptable, tablename=proc_table_pathname, tabletype='proctable') ## Now that processing is complete, lets identify what we didn't process diff --git a/py/desispec/scripts/submit_night.py b/py/desispec/scripts/submit_night.py index 9695a2a07..8b126ea84 100644 --- a/py/desispec/scripts/submit_night.py +++ b/py/desispec/scripts/submit_night.py @@ -16,7 +16,8 @@ from desispec.workflow.utils import pathjoin, sleep_and_report from desispec.workflow.timing import what_night_is_it from desispec.workflow.exptable import get_exposure_table_path, \ - get_exposure_table_name, get_last_step_options + get_exposure_table_name, get_last_step_options, \ + read_minimal_science_exptab_cols from desispec.workflow.proctable import default_obstypes_for_proctable, get_processing_table_path, \ get_processing_table_name, erow_to_prow, table_row_to_dict, \ default_prow @@ -25,7 +26,6 @@ checkfor_and_submit_joint_job, submit_tilenight_and_redshifts from desispec.workflow.queue import update_from_queue, any_jobs_not_complete from desispec.workflow.desi_proc_funcs import get_desi_proc_batch_file_path -from desispec.workflow.redshifts import read_minimal_science_exptab_cols from desispec.io.util import decode_camword, difference_camwords, create_camword def submit_night(night, proc_obstypes=None, z_submit_types=None, queue='realtime', diff --git a/py/desispec/scripts/tile_redshifts.py b/py/desispec/scripts/tile_redshifts.py index fbb203b60..b953e4419 100644 --- a/py/desispec/scripts/tile_redshifts.py +++ b/py/desispec/scripts/tile_redshifts.py @@ -11,8 +11,8 @@ from astropy.table import Table, vstack from desispec.io.util import parse_cameras -from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ - create_desi_zproc_batch_script +from desispec.workflow.redshifts import create_desi_zproc_batch_script +from desispec.workflow.exptable import read_minimal_science_exptab_cols from desiutil.log import get_logger from desispec.workflow import batch diff --git a/py/desispec/scripts/tile_redshifts_bash.py b/py/desispec/scripts/tile_redshifts_bash.py index 41ae5f160..6f85647b4 100644 --- a/py/desispec/scripts/tile_redshifts_bash.py +++ 
b/py/desispec/scripts/tile_redshifts_bash.py @@ -10,9 +10,9 @@ import numpy as np from astropy.table import Table, vstack -from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ - get_ztile_script_pathname, get_ztile_relpath, \ +from desispec.workflow.redshifts import get_ztile_script_pathname, get_ztile_relpath, \ get_ztile_script_suffix +from desispec.workflow.exptable import read_minimal_science_exptab_cols from desiutil.log import get_logger from desispec.workflow import batch diff --git a/py/desispec/scripts/zproc.py b/py/desispec/scripts/zproc.py index f23760b70..8434f1e11 100644 --- a/py/desispec/scripts/zproc.py +++ b/py/desispec/scripts/zproc.py @@ -29,8 +29,7 @@ import desiutil.iers from desispec.io.meta import get_nights_up_to_date -from desispec.workflow.redshifts import read_minimal_science_exptab_cols, \ - create_desi_zproc_batch_script +from desispec.workflow.redshifts import create_desi_zproc_batch_script #- internal desispec imports import desispec.io @@ -42,7 +41,8 @@ from desispec.scripts import group_spectra from desispec.parallel import stdouterr_redirected from desispec.workflow import batch -from desispec.workflow.exptable import get_exposure_table_pathname +from desispec.workflow.exptable import get_exposure_table_pathname, \ + read_minimal_science_exptab_cols from desispec.workflow.desi_proc_funcs import assign_mpi, update_args_with_headers, log_timer from desispec.workflow.desi_proc_funcs import determine_resources, create_desi_proc_batch_script diff --git a/py/desispec/scripts/zprocdashboard.py b/py/desispec/scripts/zprocdashboard.py index 6f00ff398..589ab1d04 100644 --- a/py/desispec/scripts/zprocdashboard.py +++ b/py/desispec/scripts/zprocdashboard.py @@ -19,7 +19,7 @@ from desispec.workflow.exptable import get_exposure_table_pathname, \ default_obstypes_for_exptable, \ get_exposure_table_column_types, \ - get_exposure_table_column_defaults + get_exposure_table_column_defaults, read_minimal_science_exptab_cols from desispec.workflow.proc_dashboard_funcs import get_skipped_ids, \ return_color_profile, find_new_exps, _hyperlink, _str_frac, \ get_output_dir, get_nights_dict, make_html_page, read_json, write_json, \ @@ -27,7 +27,6 @@ from desispec.workflow.proctable import get_processing_table_pathname, \ erow_to_prow, instantiate_processing_table from desispec.workflow.tableio import load_table -from desispec.workflow.redshifts import read_minimal_science_exptab_cols from desispec.io.meta import specprod_root, rawdata_root, findfile from desispec.io.util import decode_camword, camword_to_spectros, \ difference_camwords, parse_badamps, create_camword, camword_union, \ diff --git a/py/desispec/test/test_proc_night.py b/py/desispec/test/test_proc_night.py index 084602bf0..94934bbde 100644 --- a/py/desispec/test/test_proc_night.py +++ b/py/desispec/test/test_proc_night.py @@ -13,6 +13,9 @@ import numpy as np +import desispec.workflow.exptable +import desispec.workflow.proctable +from desispec.workflow.processing import update_and_recursively_submit from desispec.workflow.tableio import load_table, write_table from desispec.workflow.redshifts import get_ztile_script_pathname from desispec.workflow.desi_proc_funcs import \ @@ -66,7 +69,7 @@ def setUpClass(cls): cls.override_file = findfile('override', cls.night) # these are created in function def tearDown(self): - desispec.workflow.redshifts.reset_tilenight_ptab_cache() + desispec.workflow.proctable.reset_tilenight_ptab_cache() # remove everything from prod except exposure_table for self.night for 
path in glob.glob(self.proddir+'/*'): if os.path.basename(path) == 'exposure_tables': @@ -161,8 +164,8 @@ def test_proc_night_noz(self): def test_proc_night_cross_night_redshifts(self): """Test if crossnight redshifts are submitted properly.""" proctable1, unproctable1 = proc_night(self.prenight, sub_wait_time=0.0, dry_run_level=1) - desispec.workflow.redshifts.reset_science_etab_cache() - desispec.workflow.redshifts.reset_tilenight_ptab_cache() + desispec.workflow.exptable.reset_science_etab_cache() + desispec.workflow.proctable.reset_tilenight_ptab_cache() proctable2, unproctable2 = proc_night(self.night, sub_wait_time=0.0, dry_run_level=1, z_submit_types=['cumulative']) @@ -189,9 +192,62 @@ def test_proc_night_cross_night_redshifts(self): self.assertTrue(str(self.night) in line) self.assertTrue(str(tilenight2['EXPID'][0]) in line) + def test_proc_night_resubmit_queue_failures(self): + """Test if crossnight redshifts work properly with desi_resubmit_queue_failures.""" + proctable1, unproctable1 = proc_night(self.prenight, sub_wait_time=0.0, dry_run_level=1) + desispec.workflow.exptable.reset_science_etab_cache() + desispec.workflow.proctable.reset_tilenight_ptab_cache() + proctable2, unproctable2 = proc_night(self.night, sub_wait_time=0.0, + dry_run_level=1, z_submit_types=['cumulative']) + desispec.workflow.exptable.reset_science_etab_cache() + desispec.workflow.proctable.reset_tilenight_ptab_cache() + + ## test that the code runs + updatedtable2, nsubmits = update_and_recursively_submit(proctable2, submits=0, dry_run=3) + self.assertFalse(np.any(np.in1d(updatedtable2['STATUS'], [b'DEP_NOT_SUBD', b'TIMEOUT'])), + msg='No TIMEOUTs in nominal resubmission') + + ## now test that the resubmission works by forcing the failure in redshift job + for tileid in self.repeat_tiles: + tilematches2 = proctable2[proctable2['TILEID'] == tileid] + cumulative2 = tilematches2[tilematches2['JOBDESC'] == 'cumulative'][0] + proctable2['STATUS'][proctable2['INTID']==cumulative2['INTID']] = 'TIMEOUT' + updatedtable2, nsubmits = update_and_recursively_submit(proctable2, + submits=0, + dry_run=1) + self.assertFalse(np.any(np.in1d(updatedtable2['STATUS'], [b'DEP_NOT_SUBD', b'TIMEOUT'])), + msg='Cross night resubmission should leave no TIMEOUTs') + + ## now set the tilenight from the earlier night as bad + ## now resubmission should refuse to proceed + ## Set earlier tilenight as TIMEOUT, along with redshift job as TIMEOUT + for tileid in self.repeat_tiles: + tilematches1 = proctable1[proctable1['TILEID'] == tileid] + tilenight1 = tilematches1[tilematches1['JOBDESC'] == 'tilenight'][0] + proctable1['STATUS'][proctable1['INTID'] == tilenight1['INTID']] = 'TIMEOUT' + tilematches2 = proctable2[proctable2['TILEID'] == tileid] + cumulative2 = tilematches2[tilematches2['JOBDESC'] == 'cumulative'][0] + proctable2['STATUS'][proctable2['INTID']==cumulative2['INTID']] = 'TIMEOUT' + + ## Save the updated proctable so that the resubmission code finds it + tablename = findfile('proctable', night=self.prenight) + write_table(proctable1, tablename=tablename, tabletype='proctable') + desispec.workflow.proctable.reset_full_ptab_cache() + + ## Run resubmission code + updatedtable2, nsubmits = update_and_recursively_submit(proctable2, + submits=0, + dry_run=1) + self.assertTrue(np.any(np.in1d(updatedtable2['STATUS'], [b'DEP_NOT_SUBD', b'TIMEOUT'])), + msg='Cross night resubmission should leave two TIMEOUTs') + self.assertTrue(np.sum(updatedtable2['STATUS'] == 'DEP_NOT_SUBD')==2, + msg='Cross night resubmission should have 2 
DEP_NOT_SUBD' \
+                            + ' after forcing failed previous night jobs.')
+
+
     def _override_write_run_delete(self, override_dict, night=None, **kwargs):
         """Write override, run proc_night, remove override file, and return outputs"""
-        desispec.workflow.redshifts.reset_tilenight_ptab_cache()
+        desispec.workflow.proctable.reset_tilenight_ptab_cache()
 
         if night is None:
             night = self.night
@@ -439,8 +495,8 @@ def test_proc_night_daily(self):
         while True:
             num_newlinks = link_rawdata(self.real_rawnight_dir, self.test_rawnight_dir, numexp=10)
-            desispec.workflow.redshifts.reset_science_etab_cache()
-            desispec.workflow.redshifts.reset_tilenight_ptab_cache()
+            desispec.workflow.exptable.reset_science_etab_cache()
+            desispec.workflow.proctable.reset_tilenight_ptab_cache()
             if num_newlinks == 0:
                 break
             else:
diff --git a/py/desispec/test/test_workflow_queue.py b/py/desispec/test/test_workflow_queue.py
index 2d263ab68..cea9447b8 100644
--- a/py/desispec/test/test_workflow_queue.py
+++ b/py/desispec/test/test_workflow_queue.py
@@ -20,30 +20,33 @@ def setUp(self):
     def test_queue_info_from_qids(self):
         """Test queue_info_from_qids"""
         qids = [1,10,2,5]
-        qinfo = queue.queue_info_from_qids(qids, dry_run=True)
+        qinfo = queue.queue_info_from_qids(qids, dry_run=3)
         self.assertEqual(list(qinfo['JOBID']), qids)
 
     def test_queue_state_cache(self):
         """Test queue state cache"""
         # Query qids to get state into cache
         qids = [1,10,2,5]
-        qinfo = queue.queue_info_from_qids(qids, dry_run=True)
+        qinfo = queue.queue_info_from_qids(qids, dry_run=3)
 
         # check cache matches state
-        qstates = queue.get_queue_states_from_qids(qids, use_cache=True)
+        qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=3)
         self.assertEqual(list(qinfo['STATE']), list(qstates.values()))
+        # should be ['COMPLETED', 'COMPLETED', 'COMPLETED', 'COMPLETED']
 
         # update all states and check
         qinfo['STATE'] = 'FAILED'
         qinfo['STATE'][0] = 'PENDING'
         queue.update_queue_state_cache_from_table(qinfo)
-        qstates = queue.get_queue_states_from_qids(qids, use_cache=True)
+        qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=3)
         self.assertEqual(list(qinfo['STATE']), list(qstates.values()))
+        # should be ['PENDING', 'FAILED', 'FAILED', 'FAILED']
 
         # update state of just one qid
         queue.update_queue_state_cache(10, 'COMPLETED')
-        qstates = queue.get_queue_states_from_qids(qids, use_cache=True)
+        qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=3)
+        # should be ['PENDING', 'COMPLETED', 'FAILED', 'FAILED']
         self.assertEqual(qstates[1], 'PENDING')
         self.assertEqual(qstates[10], 'COMPLETED')
         self.assertEqual(qstates[2], 'FAILED')
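[As a side note on the semantics these tests pin down, the cache re-query behavior can be summarized in a short sketch; this is a hypothetical session, not part of the patch, and it relies on dry_run=3 making queue_info_from_qids fabricate COMPLETED rows instead of calling sacct:

    from desispec.workflow import queue

    queue.queue_info_from_qids([1, 2], dry_run=3)   # seeds the state cache, all COMPLETED
    queue.update_queue_state_cache(2, 'TIMEOUT')    # cache now {1: COMPLETED, 2: TIMEOUT}
    # qid 99 is not cached, so all three are re-queried; under dry_run=3 the
    # simulated sacct answer resets every state back to COMPLETED
    states = queue.get_queue_states_from_qids([1, 2, 99], use_cache=True, dry_run=3)
]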
@@ -52,7 +55,8 @@
         # Asking for qids not in the cache should requery sacct for all of them.
         # Since this is dry run, that will also reset all back to COMPLETED.
         qids.append(100)
-        qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=True)
+        qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=3)
+        # should be ['COMPLETED', 'COMPLETED', 'COMPLETED', 'COMPLETED', 'COMPLETED']
         for qid, state in qstates.items():
             self.assertEqual(state, 'COMPLETED', f'{qid=} {state=} not COMPLETED')
 
diff --git a/py/desispec/workflow/exptable.py b/py/desispec/workflow/exptable.py
index 13e24fb6d..ad6bf43b8 100644
--- a/py/desispec/workflow/exptable.py
+++ b/py/desispec/workflow/exptable.py
@@ -6,8 +6,10 @@
 import os
 import glob
 import numpy as np
-from astropy.table import Table
+from astropy.table import Table, vstack
 from astropy.io import fits
+
+from desispec.io import findfile
 ## Import some helper functions, you can see their definitions by uncomenting the bash shell command
 from desispec.workflow.utils import define_variable_from_environment, get_json_dict
 from desispec.workflow.desi_proc_funcs import load_raw_data_header, cameras_from_raw_data
@@ -916,3 +918,172 @@ def airfac_to_aircorr(airfac):
     https://desi.lbl.gov/trac/wiki/SurveyOps/SurveySpeed
     """
     return airmass_to_aircorr(airfac_to_airmass(airfac))
+
+
+_science_etab_cache = None
+
+
+def read_minimal_science_exptab_cols(nights=None, tileids=None,
+                                     reset_cache=False, readonly=True):
+    """
+    Read exposure tables while handling evolving formats
+
+    Args:
+        nights (list of int): nights to include (default all nights found)
+        tileids (list of int): tileids to include (default all tiles found)
+        reset_cache (bool): If true, global cache is cleared
+        readonly (bool): If true, use readonly path to tables for loading
+
+    Returns exptable with just columns TILEID, NIGHT, EXPID, 'CAMWORD',
+    'BADCAMWORD', filtered by science
+    exposures with LASTSTEP='all' and TILEID>=0
+
+    Note: the returned table is the full pipeline exposures table. It is trimmed
+    to science exposures that have LASTSTEP=='all'
+    """
+    global _science_etab_cache
+    log = get_logger()
+
+    ## If requested reset the science exposure table cache
+    if reset_cache:
+        reset_science_etab_cache()
+
+    ## If the cache exists, use it to speed up the search over tiles and nights
+    if _science_etab_cache is not None:
+        log.info(f'Using cached exposure table rows for science selection')
+        t = _science_etab_cache.copy()
+        if nights is not None:
+            t = t[np.isin(t['NIGHT'], nights)]
+        if tileids is not None:
+            t = t[np.isin(t['TILEID'], tileids)]
+        return t
+
+    ## If not cached, then find all the relevant exposure tables and load them
+    if nights is None:
+        etab_path = findfile('exptable', night='99999999', readonly=readonly)
+        glob_path = etab_path.replace('99999999', '202?????').replace('999999',
+                                                                      '202???')
+        etab_files = glob.glob(glob_path)
+    else:
+        etab_files = list()
+        for night in nights:
+            etab_file = get_exposure_table_pathname(night)
+            if os.path.exists(etab_file):
+                etab_files.append(etab_file)
+            elif night >= 20201201:
+                log.error(f"Exposure table missing for night {night}")
+            else:
+                # - these are expected for the daily run, ok
+                log.debug(f"Exposure table missing for night {night}")
+
+    ## Load each relevant exposure table file, subselect valid science exposures,
+    ## and append to the full set
+    etab_files = sorted(etab_files)
+    exptables = list()
+
+    for etab_file in etab_files:
+        ## correct way but slower and we don't need multivalue columns
+        # t = load_table(etab_file, tabletype='etable')
+        t = Table.read(etab_file, format='ascii.csv')
+
+        ## Subselect only valid science exposures
+        t = _select_sciences_from_etab(t)
+
+        ## For backwards compatibility if BADCAMWORD column does not
+        ## exist then add a blank one
+        if 'BADCAMWORD' not in t.colnames:
+            t.add_column(Table.Column(['' for i in range(len(t))], dtype='S36',
+                                      name='BADCAMWORD'))
+
+        ## Need to ensure that the string columns are consistent
+        for col in ['CAMWORD', 'BADCAMWORD']:
+            ## Masked arrays need special handling
+            ## else just reassign with consistent dtype
+            if isinstance(t[col], Table.MaskedColumn):
+                ## If completely empty it's loaded as type int
+                ## otherwise fill masked with ''
+                if t[col].dtype == int:
+                    t[col] = Table.Column(['' for i in range(len(t))],
+                                          dtype='S36', name=col)
+                else:
+                    t[col] = Table.Column(t[col].filled(fill_value=''),
+                                          dtype='S36', name=col)
+            else:
+                t[col] = Table.Column(t[col], dtype='S36', name=col)
+        exptables.append(t['TILEID', 'NIGHT', 'EXPID', 'CAMWORD', 'BADCAMWORD'])
+
+    outtable = vstack(exptables)
+
+    ## If we've loaded all nights, then cache the result
+    if nights is None:
+        log.info(f'Caching exposure table rows for science selection')
+        _set_science_etab_cache(outtable.copy())
+
+    ## If requested specific tileids, then subselect that
+    if tileids is not None:
+        outtable = outtable[np.isin(outtable['TILEID'], tileids)]
+
+    return outtable
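[For orientation, something like the following illustrates the intended call pattern; the night and tile values are taken from the unit tests in this patch and the snippet itself is a sketch, not part of the diff. The first all-nights call warms the module-level cache, and later filtered calls are served from it in memory:

    from desispec.workflow.exptable import read_minimal_science_exptab_cols

    full = read_minimal_science_exptab_cols()                  # reads every night once, caches
    tile = read_minimal_science_exptab_cols(tileids=[7567])    # served from the cache
    night = read_minimal_science_exptab_cols(nights=[20230913])
]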
+ """ + t = etab.copy() + t = t[((t['OBSTYPE'] == 'science') & (t['TILEID'] >= 0))] + if 'LASTSTEP' in t.colnames: + t = t[t['LASTSTEP'] == 'all'] + + return t + + +def reset_science_etab_cache(): + """ + reset the global cache of science exposure tables stored in var _science_etab_cache + """ + global _science_etab_cache + log = get_logger() + log.info(f'Resetting science exposure table row cache') + _science_etab_cache = None + + +def _set_science_etab_cache(etab): + """ + sets the global cache of science exposure tables stored in var _science_etab_cache + """ + global _science_etab_cache + log = get_logger() + log.info(f'Assigning science exposure table row cache to new table') + if 'OBSTYPE' in etab.colnames: + _science_etab_cache = _select_sciences_from_etab(etab) + else: + _science_etab_cache = etab + _science_etab_cache.sort(['EXPID']) + + +def update_science_etab_cache(etab): + """ + updates the global cache of science exposure tables stored in var + _science_etab_cache. + + Notes: this will remove all current entries for any night in the input + """ + global _science_etab_cache + log = get_logger() + ## If the cache doesn't exist, don't update it. + if _science_etab_cache is None: + log.debug(f'Science exptab cache does not exist, so not updating') + return + cleaned_etab = _select_sciences_from_etab(etab) + new_nights = np.unique(cleaned_etab['NIGHT']) + log.info(f'Removing all current entries in science exposure ' + + f'table row cache for nights {new_nights}') + conflicting_entries = np.isin(_science_etab_cache['NIGHT'], new_nights) + log.info( + f"Removing {len(conflicting_entries)} rows and adding {len(cleaned_etab)} rows " + + f"to science exposure table row cache.") + keep = np.bitwise_not(conflicting_entries) + _science_etab_cache = _science_etab_cache[keep] + _science_etab_cache = vstack([_science_etab_cache, cleaned_etab]) diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py index 93d8cad69..4693c81b2 100644 --- a/py/desispec/workflow/processing.py +++ b/py/desispec/workflow/processing.py @@ -17,8 +17,8 @@ from desispec.scripts.tile_redshifts import generate_tile_redshift_scripts from desispec.workflow.redshifts import get_ztile_script_pathname, \ get_ztile_relpath, \ - get_ztile_script_suffix, read_minimal_tilenight_proctab_cols, \ - read_minimal_science_exptab_cols + get_ztile_script_suffix +from desispec.workflow.exptable import read_minimal_science_exptab_cols from desispec.workflow.queue import get_resubmission_states, update_from_queue, \ queue_info_from_qids, get_queue_states_from_qids, update_queue_state_cache from desispec.workflow.timing import what_night_is_it @@ -31,7 +31,9 @@ from desispec.workflow.utils import pathjoin, sleep_and_report, \ load_override_file from desispec.workflow.tableio import write_table, load_table -from desispec.workflow.proctable import table_row_to_dict, erow_to_prow +from desispec.workflow.proctable import table_row_to_dict, erow_to_prow, \ + read_minimal_tilenight_proctab_cols, read_minimal_full_proctab_cols, \ + update_full_ptab_cache from desiutil.log import get_logger from desispec.io import findfile, specprod_root @@ -631,7 +633,7 @@ def submit_batch_script(prow, dry_run=0, reservation=None, strictly_successful=F # workaround for sbatch --dependency bug not tracking jobs correctly # see NERSC TICKET INC0203024 if len(dep_qids) > 0 and not dry_run: - state_dict = get_queue_states_from_qids(dep_qids, dry_run=0, use_cache=True) + state_dict = get_queue_states_from_qids(dep_qids, dry_run=dry_run, 
         still_depids = []
         for depid in dep_qids:
             if depid in state_dict.keys() and state_dict[int(depid)] == 'COMPLETED':
@@ -1122,8 +1124,8 @@ def all_calibs_submitted(accounted_for, do_cte_flats):
     return np.all(list(test_dict.values()))
 
-def update_and_recurvsively_submit(proc_table, submits=0, resubmission_states=None,
-                                   ptab_name=None, dry_run=0,reservation=None):
+def update_and_recursively_submit(proc_table, submits=0, resubmission_states=None,
+                                  ptab_name=None, dry_run=0, reservation=None):
     """
     Given an processing table, this loops over job rows and resubmits failed jobs (as defined by resubmission_states).
     Before submitting a job, it checks the dependencies for failures. If a dependency needs to be resubmitted, it recursively
@@ -1157,7 +1159,7 @@
     if resubmission_states is None:
         resubmission_states = get_resubmission_states()
     log.info(f"Resubmitting jobs with current states in the following: {resubmission_states}")
-    proc_table = update_from_queue(proc_table, dry_run=False)
+    proc_table = update_from_queue(proc_table, dry_run=dry_run)
     log.info("Updated processing table queue information:")
     cols = ['INTID', 'INT_DEP_IDS', 'EXPID', 'TILEID', 'OBSTYPE', 'JOBDESC',
             'LATEST_QID', 'STATUS']
@@ -1172,7 +1174,7 @@
                                                          id_to_row_map, ptab_name,
                                                          resubmission_states,
                                                          reservation, dry_run)
-    proc_table = update_from_queue(proc_table)
+    proc_table = update_from_queue(proc_table, dry_run=dry_run)
     return proc_table, submits
 
 def recursive_submit_failed(rown, proc_table, submits, id_to_row_map, ptab_name=None,
@@ -1220,15 +1222,49 @@
         proc_table['LATEST_DEP_QID'][rown] = np.ndarray(shape=0).astype(int)
     else:
         all_valid_states = list(resubmission_states.copy())
-        all_valid_states.extend(['RUNNING','PENDING','SUBMITTED','COMPLETED'])
+        good_states = ['RUNNING','PENDING','SUBMITTED','COMPLETED']
+        all_valid_states.extend(good_states)
+        othernight_idep_qid_lookup = {}
         for idep in np.sort(np.atleast_1d(ideps)):
-            if idep not in id_to_row_map and idep // 1000 != row['INTID'] // 1000:
-                log.warning(f"Internal ID: {idep} not in id_to_row_map. "
-                            + "This is expected since it's from another day. "
-                            + f" This dependency will not be checked or "
-                            + "resubmitted")
+            if idep not in id_to_row_map:
+                if idep // 1000 != row['INTID'] // 1000:
+                    log.info(f"Internal ID: {idep} not in id_to_row_map. "
+                             + "This is expected since it's from another day. ")
+                    reference_night = 20000000 + (idep // 1000)
+                    log.info(f"Checking the processing table for "
+                             + f"night {reference_night} for the dependency.")
+                    reftab = read_minimal_full_proctab_cols(nights=[reference_night])
+                    if reftab is None:
+                        msg = f"The dependency is from night={reference_night}" \
+                              + f" but read_minimal_full_proctab_cols couldn't" \
+                              + f" locate that processing table, this is a " \
+                              + f"fatal error."
+                        log.critical(msg)
+                        raise ValueError(msg)
+                    reftab = update_from_queue(reftab, dry_run=dry_run)
+                    entry = reftab[reftab['INTID'] == idep][0]
+                    if entry['STATUS'] not in good_states:
+                        msg = f"Internal ID: {idep} not in id_to_row_map. " \
+                              + f"Since the dependency is from night={reference_night} " \
+                              + f"and that job isn't in a good state, this is an " \
+                              + f"error we can't overcome."
+                        log.error(msg)
+                        proc_table['STATUS'][rown] = "DEP_NOT_SUBD"
+                        return proc_table, submits
+                    else:
+                        ## otherwise all is good, just update the cache to use this
+                        ## in the next stage
+                        othernight_idep_qid_lookup[idep] = entry['LATEST_QID']
+                        update_full_ptab_cache(reftab)
+                else:
+                    msg = f"Internal ID: {idep} not in id_to_row_map. " \
+                          + f"Since the dependency is from the same night" \
+                          + f" and we can't find it, this is a fatal error."
+                    log.critical(msg)
+                    raise ValueError(msg)
             elif proc_table['STATUS'][id_to_row_map[idep]] not in all_valid_states:
-                log.warning(f"Proc INTID: {proc_table['INTID'][rown]} depended on" +
+                log.error(f"Proc INTID: {proc_table['INTID'][rown]} depended on" +
                           f" INTID {proc_table['INTID'][id_to_row_map[idep]]}" +
                           f" but that exposure has state" +
                           f" {proc_table['STATUS'][id_to_row_map[idep]]} that" +
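[The reference_night arithmetic above appears to invert the internal-ID convention used by night_to_starting_iid, under which a job's INTID embeds its night; a quick worked example with illustrative numbers only:

    idep = 230914005                        # a job submitted on night 20230914
    assert idep // 1000 == 230914           # night-code prefix of the internal ID
    reference_night = 20000000 + (idep // 1000)
    assert reference_night == 20230914
    # same-night dependencies share the row's own idep // 1000 prefix, so only
    # genuine cross-night dependencies trigger the processing-table lookup
]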
@@ -1238,19 +1274,20 @@
             return proc_table, submits
     qdeps = []
     for idep in np.sort(np.atleast_1d(ideps)):
-        if idep not in id_to_row_map and idep // 1000 != row['INTID'] // 1000:
-            log.warning(f"Internal ID: {idep} not in id_to_row_map. "
-                        + "This is expected since it's from another day. "
-                        + f" This dependency will not be checked or "
-                        + "resubmitted")
-            continue
-        elif proc_table['STATUS'][id_to_row_map[idep]] in resubmission_states:
-            proc_table, submits = recursive_submit_failed(id_to_row_map[idep],
-                                                          proc_table, submits,
-                                                          id_to_row_map,
-                                                          reservation=reservation,
-                                                          dry_run=dry_run)
-        qdeps.append(proc_table['LATEST_QID'][id_to_row_map[idep]])
+        if idep in id_to_row_map:
+            if proc_table['STATUS'][id_to_row_map[idep]] in resubmission_states:
+                proc_table, submits = recursive_submit_failed(id_to_row_map[idep],
+                                                              proc_table, submits,
+                                                              id_to_row_map,
+                                                              reservation=reservation,
+                                                              dry_run=dry_run)
+            ## Now that we've resubmitted the dependency if necessary,
+            ## add the most recent QID to the list
+            qdeps.append(proc_table['LATEST_QID'][id_to_row_map[idep]])
+        else:
+            ## Since we verified above that the cross night QID is still
+            ## either pending or successful, add that to the list of QID's
+            qdeps.append(othernight_idep_qid_lookup[idep])
 
     qdeps = np.atleast_1d(qdeps)
     if len(qdeps) > 0:
diff --git a/py/desispec/workflow/proctable.py b/py/desispec/workflow/proctable.py
index f1e81c9a9..aef37d1fe 100644
--- a/py/desispec/workflow/proctable.py
+++ b/py/desispec/workflow/proctable.py
@@ -3,6 +3,7 @@
 ===========================
 
 """
+import glob
 import numpy as np
 import os
 
@@ -10,13 +11,18 @@
 from astropy.table import Table, vstack
 from collections import OrderedDict
 
+from desispec.io import findfile
 ## Import some helper functions, you can see their definitions by uncomenting the bash shell command
 from desispec.workflow.exptable import default_obstypes_for_exptable
+from desispec.workflow.tableio import load_table
 from desispec.workflow.utils import define_variable_from_environment, pathjoin
 from desispec.io.util import difference_camwords, parse_badamps, create_camword, decode_camword
 from desiutil.log import get_logger
 
+_full_ptab_cache = dict()
+_tilenight_ptab_cache = None
+
 ###############################################
 ##### Processing Table Column Definitions #####
 ###############################################
@@ -394,3 +400,285 @@ def table_row_to_dict(table_row):
         typ = type(table_row)
         log.error(f"Received table_row of type {typ}, can't convert to a dictionary. Exiting.")
         raise TypeError(f"Received table_row of type {typ}, can't convert to a dictionary. Exiting.")
Exiting.") + + +def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, + reset_cache=False, readonly=True): + """ + Read processing tables while handling evolving formats + + Args: + nights (list of int): nights to include (default all nights found) + tileids (list of int): tileids to include (default all tiles found) + reset_cache (bool): If true, global cache is cleared + readonly (bool): If true, use readonly path to tables for laoding + + Returns None if not proc tables exist or exptable with columns EXPID, + TILEID, NIGHT, PROCCAMWORD, INTID, LATEST_QID and rows matching + the input selection criteria + """ + global _tilenight_ptab_cache + global _full_ptab_cache + log = get_logger() + + ## If requested reset the tilenight processing table cache + if reset_cache: + reset_tilenight_ptab_cache() + + if _tilenight_ptab_cache is not None: + log.info(f'Using cached processing table rows for tilenight selection') + t = _tilenight_ptab_cache.copy() + if nights is not None: + t = t[np.isin(t['NIGHT'], nights)] + if tileids is not None: + t = t[np.isin(t['TILEID'], tileids)] + return t + + ## If not cached, then find all the relevant processing tables and load them + if nights is None: + ptab_path = findfile('proctable', night='99999999', readonly=readonly) + ptab_files = glob.glob(ptab_path.replace('99999999', '202?????')) + else: + ptab_files = list() + for night in nights: + ptab_file = findfile('proctable', night=night) + if os.path.exists(ptab_file): + ptab_files.append(ptab_file) + elif night >= 20201201: + log.error(f"Processing table missing for night {night}") + else: + # - these are expected for the daily run, ok + log.debug(f"Processing table missing for night {night}") + + ## Load each relevant processing table file, subselect valid tilenight's and + ## append to the full set + ptab_files = sorted(ptab_files) + ptables = list() + for ptab_file in ptab_files: + ## correct way but slower and we don't need multivalue columns + t = load_table(tablename=ptab_file, tabletype='proctable') + t = _select_tilenights_from_ptab(t) + + ## Need to ensure that the string columns are consistent + for col in ['PROCCAMWORD']: + ## Masked arrays need special handling + ## else just reassign with consistent dtype + if isinstance(t[col], Table.MaskedColumn): + ## If compeltely empty it's loaded as type int + ## otherwise fill masked with '' + if t[col].dtype == int: + t[col] = Table.Column(['' for i in range(len(t))], + dtype='S36', name=col) + else: + t[col] = Table.Column(t[col].filled(fill_value=''), + dtype='S36', name=col) + else: + t[col] = Table.Column(t[col], dtype='S36', name=col) + ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', + 'INTID', 'LATEST_QID']) + + if len(ptables) > 0: + outtable = vstack(ptables) + else: + log.info(f"No processing tables found. Returning None.") + return None + + ## If we've loaded all nights, then cache the result + if nights is None: + log.info(f'Caching processing table rows for tilenight selection') + _set_tilenight_ptab_cache(outtable) + + ## If requested specific tileids, then subselect that + if tileids is not None: + outtable = outtable[np.isin(outtable['TILEID'], tileids)] + + return outtable + + +def _select_tilenights_from_ptab(ptab): + """ + takes a processing table or combination of processing tables and subselects + valid tilenight jobs. Those that pass selection are returned as a table. 
+ """ + t = ptab.copy() + t = t[((t['OBSTYPE'] == 'science') & (t['JOBDESC'] == 'tilenight'))] + if 'LASTSTEP' in t.colnames: + t = t[t['LASTSTEP'] == 'all'] + return t + + +def reset_tilenight_ptab_cache(): + """ + reset the global cache of tilenight processing tables stored in var _tilenight_ptab_cache + """ + global _tilenight_ptab_cache + log = get_logger() + log.info(f'Resetting processing table row cache for tilenight selection') + _tilenight_ptab_cache = None + + +def _set_tilenight_ptab_cache(ptab): + """ + sets the global cache of tilenight processing tables stored in var _tilenight_ptab_cache + """ + global _tilenight_ptab_cache + log = get_logger() + log.info( + f'Asigning processing table row cache for tilenight selection to new table') + if 'OBSTYPE' in ptab.colnames: + t = _select_tilenights_from_ptab(ptab) + else: + t = ptab + _tilenight_ptab_cache = t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', + 'INTID', 'LATEST_QID'] + + _tilenight_ptab_cache.sort(['INTID']) + + +def update_tilenight_ptab_cache(ptab): + """ + updates the global cache of tilenight processing tables stored in var + _tilenight_ptab_cache. + + Notes: this will remove all current entries for any night in the input + """ + global _tilenight_ptab_cache + log = get_logger() + ## If the cache doesn't exist, don't update it. + if _tilenight_ptab_cache is None: + log.debug(f'Science exptab cache does not exist, so not updating') + return + cleaned_ptab = _select_tilenights_from_ptab(ptab) + new_nights = np.unique(cleaned_ptab['NIGHT']) + log.info(f'Removing all current entries in processing table tilenight ' + + f'selection cache for nights {new_nights}') + conflicting_entries = np.isin(_tilenight_ptab_cache['NIGHT'], new_nights) + log.info(f"Removing {len(conflicting_entries)} rows and adding " + + f"{len(cleaned_ptab)} rows " + + f"to processing table tilenight cache.") + keep = np.bitwise_not(conflicting_entries) + _tilenight_ptab_cache = _tilenight_ptab_cache[keep] + _tilenight_ptab_cache = vstack([_tilenight_ptab_cache, cleaned_ptab]) + _tilenight_ptab_cache.sort(['INTID']) + + +def read_minimal_full_proctab_cols(nights=None, tileids=None, + reset_cache=False, readonly=True): + """ + Read processing tables and cache if applicable + + Args: + nights (list of int): nights to include (default all nights found) + tileids (list of int): tileids to include (default all tiles found) + reset_cache (bool): If true, global cache is cleared + readonly (bool): If true, use readonly path to tables for laoding + + Returns None if not proc tables exist or exptable with columns EXPID, + TILEID, NIGHT, PROCCAMWORD, INTID, LATEST_QID, STATUS and rows matching + the input selection criteria + """ + global _full_ptab_cache + log = get_logger() + + ## If requested reset the full processing table cache + if reset_cache: + reset_full_ptab_cache() + + ## If the cache exists, use it speed up the search over tiles and nights + if nights is not None and np.all( + np.in1d(nights, list(_full_ptab_cache.keys()))): + log.info(f'Using cached processing table rows') + tablist = [] + for night in nights: + tablist.append(_full_ptab_cache[night]) + t = vstack(tablist) + if tileids is not None: + t = t[np.isin(t['TILEID'], tileids)] + return t + + ## If not cached, then find all the relevant processing tables and load them + if nights is None: + ptab_path = findfile('proctable', night='99999999', readonly=readonly) + ptab_files = glob.glob(ptab_path.replace('99999999', '202?????')) + else: + ptab_files = list() + for night in nights: + ptab_file = 
+            if os.path.exists(ptab_file):
+                ptab_files.append(ptab_file)
+            elif night >= 20201201:
+                log.error(f"Processing table missing for night {night}")
+            else:
+                # - these are expected for the daily run, ok
+                log.debug(f"Processing table missing for night {night}")
+
+    ## Load each relevant processing table file and append to the full set
+    ptab_files = sorted(ptab_files)
+    ptables = list()
+    for ptab_file in ptab_files:
+        ## correct way but slower and we don't need multivalue columns
+        t = load_table(tablename=ptab_file, tabletype='proctable')
+
+        ## Need to ensure that the string columns are consistent
+        for col in ['PROCCAMWORD']:
+            ## Masked arrays need special handling
+            ## else just reassign with consistent dtype
+            if isinstance(t[col], Table.MaskedColumn):
+                ## If completely empty it's loaded as type int
+                ## otherwise fill masked with ''
+                if t[col].dtype == int:
+                    t[col] = Table.Column(['' for i in range(len(t))],
+                                          dtype='S36', name=col)
+                else:
+                    t[col] = Table.Column(t[col].filled(fill_value=''),
+                                          dtype='S36', name=col)
+            else:
+                t[col] = Table.Column(t[col], dtype='S36', name=col)
+        ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', 'OBSTYPE',
+                         'JOBDESC', 'INTID', 'LATEST_QID', 'STATUS'])
+
+    if len(ptables) > 0:
+        outtable = vstack(ptables)
+    else:
+        log.info(f"No processing tables found. Returning None.")
+        return None
+
+    ## Cache the result
+    log.info(f'Caching processing table rows for full cache')
+    update_full_ptab_cache(outtable)
+
+    ## If requested specific tileids, then subselect that
+    if tileids is not None:
+        outtable = outtable[np.isin(outtable['TILEID'], tileids)]
+
+    return outtable
+
+
+def reset_full_ptab_cache():
+    """
+    reset the global cache of full processing tables stored in var _full_ptab_cache
+    """
+    global _full_ptab_cache
+    log = get_logger()
+    log.info(f'Resetting full processing table row cache')
+    _full_ptab_cache = dict()
+
+
+def update_full_ptab_cache(ptab):
+    """
+    updates the global cache of all processing tables stored in var
+    _full_ptab_cache.
+ + Notes: this will remove all current entries for any night in the input + """ + global _full_ptab_cache + log = get_logger() + + t = ptab['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', 'OBSTYPE', + 'JOBDESC', 'INTID', 'LATEST_QID', 'STATUS'] + new_nights = np.unique(t['NIGHT']) + log.info(f'Replacing all current entries in processing table ' + + f'cache for nights {new_nights.data}') + for night in new_nights: + _full_ptab_cache[night] = t[t['NIGHT'] == night] diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py index ba9c0b7d3..ccff9c704 100644 --- a/py/desispec/workflow/queue.py +++ b/py/desispec/workflow/queue.py @@ -235,12 +235,21 @@ def queue_info_from_qids(qids, columns='jobid,jobname,partition,submit,'+ if dry_run: log.info("Dry run, would have otherwise queried Slurm with the" +f" following: {' '.join(cmd_as_list)}") - string = 'JobID,JobName,Partition,Submit,Eligible,Start,End,State,ExitCode' - for jobid, expid in zip(qids, 100000+np.arange(len(qids))): - string += f'\n{jobid},arc-20211102-{expid:08d}-a0123456789,realtime,2021-11-02'\ - +'T18:31:14,2021-11-02T18:36:33,2021-11-02T18:36:33,2021-11-02T'\ - +'18:48:32,COMPLETED,0:0' - + ### Set a random 5% of jobs as TIMEOUT, set seed for reproducibility + # np.random.seed(qids[0]) + states = np.array(['COMPLETED'] * len(qids)) + #states[np.random.random(len(qids)) < 0.05] = 'TIMEOUT' + ## Try two different column configurations, otherwise give up trying to simulate + string = 'JobID,JobName,Partition,Submit,Eligible,Start,End,Elapsed,State,ExitCode' + if columns.lower() == string.lower(): + for jobid, expid, state in zip(qids, 100000+np.arange(len(qids)), states): + string += f'\n{jobid},arc-20211102-{expid:08d}-a0123456789,realtime,2021-11-02'\ + +'T18:31:14,2021-11-02T18:36:33,2021-11-02T18:36:33,2021-11-02T'\ + +f'18:48:32,00:11:59,{state},0:0' + elif columns.lower() == 'jobid,state': + string = 'JobID,State' + for jobid, state in zip(qids, states): + string += f'\n{jobid},{state}' # create command to run to exercise subprocess -> stdout parsing cmd_as_list = ['echo', string] else: @@ -305,9 +314,10 @@ def get_queue_states_from_qids(qids, dry_run=0, use_cache=False): for qid in qids: outdict[qid] = _cached_slurm_states[qid] else: - outtable = queue_info_from_qids(qids, columns='jobid,state', dry_run=dry_run) - for row in outtable: - outdict[int(row['JOBID'])] = row['STATE'] + if dry_run > 2 or dry_run < 1: + outtable = queue_info_from_qids(qids, columns='jobid,state', dry_run=dry_run) + for row in outtable: + outdict[int(row['JOBID'])] = row['STATE'] return outdict def update_queue_state_cache_from_table(queue_info_table): @@ -406,7 +416,10 @@ def update_from_queue(ptable, qtable=None, dry_run=0, ignore_scriptnames=False): + f" but the jobname in the queue was " + f"{row['JOBNAME']}.") state = str(row['STATE']).split(' ')[0] - ptable['STATUS'][ind] = state + ## Since dry run 1 and 2 save proc tables, don't alter the + ## states for these when simulating + if dry_run > 2 or dry_run < 1: + ptable['STATUS'][ind] = state return ptable diff --git a/py/desispec/workflow/redshifts.py b/py/desispec/workflow/redshifts.py index cfa2ea25a..1eb5c3af2 100644 --- a/py/desispec/workflow/redshifts.py +++ b/py/desispec/workflow/redshifts.py @@ -3,12 +3,8 @@ =========================== """ -import sys, os, glob -import re -import subprocess -import argparse +import sys, os import numpy as np -from astropy.table import Table, vstack, Column from desispec.io import findfile from desispec.io.util import parse_cameras, 
decode_camword @@ -16,18 +12,14 @@ from desiutil.log import get_logger import desispec.io -from desispec.workflow.exptable import get_exposure_table_path, get_exposure_table_name, \ - get_exposure_table_pathname -from desispec.workflow.tableio import load_table from desispec.workflow import batch -from desispec.util import parse_int_args +# full processing table row cache # processing table row cache for tilenight selection -_tilenight_ptab_cache = None # exposure table row cache for science exposure selection -_science_etab_cache = None -def get_ztile_relpath(tileid,group,night=None,expid=None): + +def get_ztile_relpath(tileid, group, night=None, expid=None): """ Determine the relative output directory of the tile redshift batch script for spectra+coadd+redshifts for a tile @@ -52,10 +44,12 @@ def get_ztile_relpath(tileid,group,night=None,expid=None): outdir = f'tiles/{tileid}/{night}' else: outdir = f'tiles/{group}/{tileid}' - log.warning(f'Non-standard tile group={group}; writing outputs to {outdir}/*') + log.warning( + f'Non-standard tile group={group}; writing outputs to {outdir}/*') return outdir -def get_ztile_script_pathname(tileid,group,night=None,expid=None): + +def get_ztile_script_pathname(tileid, group, night=None, expid=None): """ Generate the pathname of the tile redshift batch script for spectra+coadd+redshifts for a tile @@ -69,13 +63,14 @@ def get_ztile_script_pathname(tileid,group,night=None,expid=None): (str): the pathname of the tile redshift batch script """ reduxdir = desispec.io.specprod_root() - outdir = get_ztile_relpath(tileid,group,night=night,expid=expid) + outdir = get_ztile_relpath(tileid, group, night=night, expid=expid) scriptdir = f'{reduxdir}/run/scripts/{outdir}' - suffix = get_ztile_script_suffix(tileid,group,night=night,expid=expid) + suffix = get_ztile_script_suffix(tileid, group, night=night, expid=expid) batchscript = f'ztile-{suffix}.slurm' return os.path.join(scriptdir, batchscript) -def get_ztile_script_suffix(tileid,group,night=None,expid=None): + +def get_ztile_script_suffix(tileid, group, night=None, expid=None): """ Generate the suffix of the tile redshift batch script for spectra+coadd+redshifts for a tile @@ -99,9 +94,11 @@ def get_ztile_script_suffix(tileid,group,night=None,expid=None): suffix = f'{tileid}-{night}' else: suffix = f'{tileid}-{group}' - log.warning(f'Non-standard tile group={group}; writing outputs to {suffix}.*') + log.warning( + f'Non-standard tile group={group}; writing outputs to {suffix}.*') return suffix + def get_zpix_redshift_script_pathname(healpix, survey, program): """Return healpix-based coadd+redshift+afterburner script pathname @@ -114,7 +111,7 @@ def get_zpix_redshift_script_pathname(healpix, survey, program): zpix_script_pathname """ if np.isscalar(healpix): - healpix = [healpix,] + healpix = [healpix, ] hpixmin = np.min(healpix) hpixmax = np.max(healpix) @@ -125,7 +122,8 @@ def get_zpix_redshift_script_pathname(healpix, survey, program): reduxdir = desispec.io.specprod_root() return os.path.join(reduxdir, 'run', 'scripts', 'healpix', - survey, program, str(hpixmin//100), scriptname) + survey, program, str(hpixmin // 100), scriptname) + def create_desi_zproc_batch_script(group, tileid=None, cameras=None, @@ -134,7 +132,8 @@ def create_desi_zproc_batch_script(group, queue='regular', batch_opts=None, runtime=None, timingfile=None, batchdir=None, jobname=None, cmdline=None, system_name=None, - max_gpuprocs=None, no_gpu=False, run_zmtl=False, + max_gpuprocs=None, no_gpu=False, + run_zmtl=False, 
no_afterburners=False): """ Generate a SLURM batch script to be submitted to the slurm scheduler to run desi_proc. @@ -201,7 +200,7 @@ def create_desi_zproc_batch_script(group, scriptpath = get_zpix_redshift_script_pathname(healpix, survey, program) else: scriptpath = get_ztile_script_pathname(tileid, group=group, - night=night, expid=expid) + night=night, expid=expid) if cameras is None: cameras = decode_camword('a0123456789') @@ -291,8 +290,8 @@ def create_desi_zproc_batch_script(group, cmd += ' --mpi' ncores, nodes, runtime = determine_resources( - ncameras, group.upper(), queue=queue, nexps=nexps, - forced_runtime=runtime, system_name=system_name) + ncameras, group.upper(), queue=queue, nexps=nexps, + forced_runtime=runtime, system_name=system_name) runtime_hh = int(runtime // 60) runtime_mm = int(runtime % 60) @@ -312,7 +311,8 @@ def create_desi_zproc_batch_script(group, fx.write('#SBATCH --account desi\n') fx.write('#SBATCH --job-name {}\n'.format(jobname)) fx.write('#SBATCH --output {}/{}-%j.log\n'.format(batchdir, jobname)) - fx.write('#SBATCH --time={:02d}:{:02d}:00\n'.format(runtime_hh, runtime_mm)) + fx.write( + '#SBATCH --time={:02d}:{:02d}:00\n'.format(runtime_hh, runtime_mm)) fx.write('#SBATCH --exclusive\n') fx.write('\n') @@ -322,9 +322,9 @@ def create_desi_zproc_batch_script(group, # fx.write("export OMP_NUM_THREADS={}\n".format(threads_per_core)) fx.write("export OMP_NUM_THREADS=1\n") - #- Special case CFS readonly mount at NERSC - #- SB 2023-01-27: disable this since Perlmutter might deprecate /dvs_ro; - #- inherit it from the environment but don't hardcode into script itself + # - Special case CFS readonly mount at NERSC + # - SB 2023-01-27: disable this since Perlmutter might deprecate /dvs_ro; + # - inherit it from the environment but don't hardcode into script itself # if 'DESI_ROOT_READONLY' in os.environ: # readonlydir = os.environ['DESI_ROOT_READONLY'] # elif os.environ['DESI_ROOT'].startswith('/global/cfs/cdirs'): @@ -364,304 +364,3 @@ def create_desi_zproc_batch_script(group, return scriptfile -def read_minimal_science_exptab_cols(nights=None, tileids=None, reset_cache=False): - """ - Read exposure tables while handling evolving formats - - Args: - nights (list of int): nights to include (default all nights found) - tileids (list of int): tileids to include (default all tiles found) - reset_cache (bool): If true, global cache is cleared - - Returns exptable with just columns TILEID, NIGHT, EXPID, 'CAMWORD', - 'BADCAMWORD', filtered by science - exposures with LASTSTEP='all' and TILEID>=0 - - Note: the returned table is the full pipeline exposures table. 
It is trimmed - to science exposures that have LASTSTEP=='all' - """ - global _science_etab_cache - log = get_logger() - - ## If requested reset the science exposure table cache - if reset_cache: - reset_science_etab_cache() - - ## If the cache exists, use it speed up the search over tiles and nights - if _science_etab_cache is not None: - log.info(f'Using cached exposure table rows for science selection') - t = _science_etab_cache.copy() - if nights is not None: - t = t[np.isin(t['NIGHT'], nights)] - if tileids is not None: - t = t[np.isin(t['TILEID'], tileids)] - return t - - ## If not cached, then find all the relevant exposure tables and load them - if nights is None: - etab_path = findfile('exptable', night='99999999', readonly=True) - glob_path = etab_path.replace('99999999', '202?????').replace('999999', '202???') - etab_files = glob.glob(glob_path) - else: - etab_files = list() - for night in nights: - etab_file = get_exposure_table_pathname(night) - if os.path.exists(etab_file): - etab_files.append(etab_file) - elif night >= 20201201: - log.error(f"Exposure table missing for night {night}") - else: - # - these are expected for the daily run, ok - log.debug(f"Exposure table missing for night {night}") - - ## Load each relevant exposure table file, subselect valid science's and - ## append to the full set - etab_files = sorted(etab_files) - exptables = list() - - for etab_file in etab_files: - ## correct way but slower and we don't need multivalue columns - #t = load_table(etab_file, tabletype='etable') - t = Table.read(etab_file, format='ascii.csv') - - ## Subselect only valid science exposures - t = _select_sciences_from_etab(t) - - ## For backwards compatibility if BADCAMWORD column does not - ## exist then add a blank one - if 'BADCAMWORD' not in t.colnames: - t.add_column(Table.Column(['' for i in range(len(t))], dtype='S36', name='BADCAMWORD')) - - ## Need to ensure that the string columns are consistent - for col in ['CAMWORD', 'BADCAMWORD']: - ## Masked arrays need special handling - ## else just reassign with consistent dtype - if isinstance(t[col], Table.MaskedColumn): - ## If compeltely empty it's loaded as type int - ## otherwise fill masked with '' - if t[col].dtype == int: - t[col] = Table.Column(['' for i in range(len(t))], dtype='S36', name=col) - else: - t[col] = Table.Column(t[col].filled(fill_value=''), dtype='S36', name=col) - else: - t[col] = Table.Column(t[col], dtype='S36', name=col) - exptables.append(t['TILEID', 'NIGHT', 'EXPID', 'CAMWORD', 'BADCAMWORD']) - - outtable = vstack(exptables) - - ## If we've loaded all nights, then cache the result - if nights is None: - log.info(f'Caching exposure table rows for science selection') - _set_science_etab_cache(outtable.copy()) - - ## If requeted specific tileids, then subselect that - if tileids is not None: - outtable = outtable[np.isin(outtable['TILEID'], tileids)] - - return outtable - -def _select_sciences_from_etab(etab): - """ - takes an exposure table or combination of exposure tables and subselects - valid science jobs. Those that pass selection are returned as a table. 
- """ - t = etab.copy() - t = t[((t['OBSTYPE'] == 'science') & (t['TILEID'] >= 0))] - if 'LASTSTEP' in t.colnames: - t = t[t['LASTSTEP'] == 'all'] - - return t - -def reset_science_etab_cache(): - """ - reset the global cache of science exposure tables stored in var _science_etab_cache - """ - global _science_etab_cache - log = get_logger() - log.info(f'Resetting science exposure table row cache') - _science_etab_cache = None - -def _set_science_etab_cache(etab): - """ - sets the global cache of science exposure tables stored in var _science_etab_cache - """ - global _science_etab_cache - log = get_logger() - log.info(f'Assigning science exposure table row cache to new table') - if 'OBSTYPE' in etab.colnames: - _science_etab_cache = _select_sciences_from_etab(etab) - else: - _science_etab_cache = etab - _science_etab_cache.sort(['EXPID']) - -def update_science_etab_cache(etab): - """ - updates the global cache of science exposure tables stored in var - _science_etab_cache. - - Notes: this will remove all current entries for any night in the input - """ - global _science_etab_cache - log = get_logger() - ## If the cache doesn't exist, don't update it. - if _science_etab_cache is None: - log.debug(f'Science exptab cache does not exist, so not updating') - return - cleaned_etab = _select_sciences_from_etab(etab) - new_nights = np.unique(cleaned_etab['NIGHT']) - log.info(f'Removing all current entries in science exposure ' - + f'table row cache for nights {new_nights}') - conflicting_entries = np.isin(_science_etab_cache['NIGHT'], new_nights) - log.info(f"Removing {len(conflicting_entries)} rows and adding {len(cleaned_etab)} rows " - + f"to science exposure table row cache.") - keep = np.bitwise_not(conflicting_entries) - _science_etab_cache = _science_etab_cache[keep] - _science_etab_cache = vstack([_science_etab_cache, cleaned_etab]) - - -def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, reset_cache=False): - """ - Read processing tables while handling evolving formats - - Args: - nights (list of int): nights to include (default all nights found) - tileids (list of int): tileids to include (default all tiles found) - reset_cache (bool): If true, global cache is cleared - - Returns None if not proc tables exist or exptable with columns EXPID, - TILEID, NIGHT, PROCCAMWORD, INTID, LATEST_QID and rows matching - the input selection criteria - """ - global _tilenight_ptab_cache - log = get_logger() - - ## If requested reset the tilenight processing table cache - if reset_cache: - reset_tilenight_ptab_cache() - - ## If the cache exists, use it speed up the search over tiles and nights - if _tilenight_ptab_cache is not None: - log.info(f'Using cached processing table rows for tilenight selection') - t = _tilenight_ptab_cache.copy() - if nights is not None: - t = t[np.isin(t['NIGHT'], nights)] - if tileids is not None: - t = t[np.isin(t['TILEID'], tileids)] - return t - - ## If not cached, then find all the relevant processing tables and load them - if nights is None: - ptab_path = findfile('proctable', night='99999999', readonly=True) - ptab_files = glob.glob(ptab_path.replace('99999999', '202?????')) - else: - ptab_files = list() - for night in nights: - ptab_file = findfile('proctable', night=night) - if os.path.exists(ptab_file): - ptab_files.append(ptab_file) - elif night >= 20201201: - log.error(f"Processing table missing for night {night}") - else: - # - these are expected for the daily run, ok - log.debug(f"Processing table missing for night {night}") - - ## Load each 
relevant processing table file, subselect valid tilenight's and - ## append to the full set - ptab_files = sorted(ptab_files) - ptables = list() - for ptab_file in ptab_files: - ## correct way but slower and we don't need multivalue columns - t = load_table(tablename=ptab_file, tabletype='proctable') - t = _select_tilenights_from_ptab(t) - - ## Need to ensure that the string columns are consistent - for col in ['PROCCAMWORD']: - ## Masked arrays need special handling - ## else just reassign with consistent dtype - if isinstance(t[col], Table.MaskedColumn): - ## If compeltely empty it's loaded as type int - ## otherwise fill masked with '' - if t[col].dtype == int: - t[col] = Table.Column(['' for i in range(len(t))], dtype='S36', name=col) - else: - t[col] = Table.Column(t[col].filled(fill_value=''), dtype='S36', name=col) - else: - t[col] = Table.Column(t[col], dtype='S36', name=col) - ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', - 'INTID', 'LATEST_QID']) - - if len(ptables) > 0: - outtable = vstack(ptables) - else: - log.info(f"No processing tables found. Returning None.") - return None - - ## If we've loaded all nights, then cache the result - if nights is None: - log.info(f'Caching processing table rows for tilenight selection') - _set_tilenight_ptab_cache(outtable) - - ## If requested specific tileids, then subselect that - if tileids is not None: - outtable = outtable[np.isin(outtable['TILEID'], tileids)] - - return outtable - -def _select_tilenights_from_ptab(ptab): - """ - takes a processing table or combination of processing tables and subselects - valid tilenight jobs. Those that pass selection are returned as a table. - """ - t = ptab.copy() - t = t[((t['OBSTYPE'] == 'science') & (t['JOBDESC'] == 'tilenight'))] - if 'LASTSTEP' in t.colnames: - t = t[t['LASTSTEP'] == 'all'] - return t - -def reset_tilenight_ptab_cache(): - """ - reset the global cache of tilenight processing tables stored in var _tilenight_ptab_cache - """ - global _tilenight_ptab_cache - log = get_logger() - log.info(f'Resetting processing table row cache for tilenight selection') - _tilenight_ptab_cache = None - -def _set_tilenight_ptab_cache(ptab): - """ - sets the global cache of tilenight processing tables stored in var _tilenight_ptab_cache - """ - global _tilenight_ptab_cache - log = get_logger() - log.info(f'Asigning processing table row cache for tilenight selection to new table') - if 'OBSTYPE' in ptab.colnames: - _tilenight_ptab_cache = _select_tilenights_from_ptab(ptab) - else: - _tilenight_ptab_cache = ptab - _tilenight_ptab_cache.sort(['INTID']) - -def update_tilenight_ptab_cache(ptab): - """ - updates the global cache of tilenight processing tables stored in var - _tilenight_ptab_cache. - - Notes: this will remove all current entries for any night in the input - """ - global _tilenight_ptab_cache - log = get_logger() - ## If the cache doesn't exist, don't update it. 
- if _tilenight_ptab_cache is None: - log.debug(f'Science exptab cache does not exist, so not updating') - return - cleaned_ptab = _select_tilenights_from_ptab(ptab) - new_nights = np.unique(cleaned_ptab['NIGHT']) - log.info(f'Removing all current entries in processing table tilenight ' - + f'selection cache for nights {new_nights}') - conflicting_entries = np.isin(_tilenight_ptab_cache['NIGHT'], new_nights) - log.info(f"Removing {len(conflicting_entries)} rows and adding " - + f"{len(cleaned_ptab)} rows " - + f"to processing table tilenight cache.") - keep = np.bitwise_not(conflicting_entries) - _tilenight_ptab_cache = _tilenight_ptab_cache[keep] - _tilenight_ptab_cache = vstack([_tilenight_ptab_cache, cleaned_ptab]) - _tilenight_ptab_cache.sort(['INTID']) \ No newline at end of file diff --git a/py/desispec/workflow/science_selection.py b/py/desispec/workflow/science_selection.py index 04a555796..366fbe03c 100644 --- a/py/desispec/workflow/science_selection.py +++ b/py/desispec/workflow/science_selection.py @@ -17,7 +17,8 @@ from desispec.scripts.tile_redshifts import generate_tile_redshift_scripts from desispec.workflow.redshifts import get_ztile_script_pathname, \ get_ztile_relpath, \ - get_ztile_script_suffix, read_minimal_science_exptab_cols + get_ztile_script_suffix +from desispec.workflow.exptable import read_minimal_science_exptab_cols from desispec.workflow.queue import get_resubmission_states, update_from_queue, queue_info_from_qids from desispec.workflow.timing import what_night_is_it from desispec.workflow.desi_proc_funcs import get_desi_proc_batch_file_pathname, \ diff --git a/py/desispec/workflow/tableio.py b/py/desispec/workflow/tableio.py index 872f4b266..6d6ecf1cf 100644 --- a/py/desispec/workflow/tableio.py +++ b/py/desispec/workflow/tableio.py @@ -7,6 +7,7 @@ import numpy as np from astropy.table import Table +from desispec.io import findfile ################################################### ################ Table Functions ################# ################################################### @@ -137,8 +138,8 @@ def write_table(origtable, tablename=None, tabletype=None, joinsymb='|', overwri table in the SPECPROD rather than the exptab repository. Default is True. """ ## avoid circular import by importing only in this function that uses it - from desispec.workflow.redshifts import update_science_etab_cache, \ - update_tilenight_ptab_cache + from desispec.workflow.exptable import update_science_etab_cache + from desispec.workflow.proctable import update_tilenight_ptab_cache log = get_logger() if tablename is None and tabletype is None: log.error("Pathname or type of table is required to save the table") @@ -148,7 +149,14 @@ def write_table(origtable, tablename=None, tabletype=None, joinsymb='|', overwri tabletype = standardize_tabletype(tabletype) if tablename is None: - tablename = translate_type_to_pathname(tabletype, use_specprod=use_specprod) + night = None + if 'NIGHT' in origtable.colnames: + night = np.unique(origtable['NIGHT'].data)[0] + else: + msg = f'NIGHT must be in table if tablename is not provided.' 
+ log.critical(msg) + raise ValueError(msg) + tablename = findfile(tabletype, night=night) if not write_empty and len(origtable) == 0: log.warning(f'NOT writing zero length table to {tablename}') @@ -234,39 +242,8 @@ def standardize_tabletype(tabletype): tabletype = 'unproctable' return tabletype -def translate_type_to_pathname(tabletype, use_specprod=True): - """ - Given the type of table it returns the proper file pathname - - Parameters - ---------- - tabletype : str - Allows for a flexible number of input options, but should refer to either the 'exposure', - 'processing', or 'unprocessed' table types. - use_specprod : bool - If True and tablename not specified and tabletype is exposure table, this looks for the - table in the SPECPROD rather than the exptab repository. Default is True. - Returns - ------- - tablename : str - Full pathname including extension of the table type. Uses environment variables to determine - the location. - """ - from desispec.workflow.exptable import get_exposure_table_path, get_exposure_table_pathname, get_exposure_table_name - from desispec.workflow.proctable import get_processing_table_path, get_processing_table_pathname, get_processing_table_name - tabletype = standardize_tabletype(tabletype) - if tabletype == 'exptable': - tablename = get_exposure_table_pathname(night=None,usespecprod=use_specprod) - elif tabletype == 'proctable': - tablename = get_processing_table_pathname() - elif tabletype == 'unproctable': - tablepath = get_processing_table_path() - tablename = get_processing_table_name().replace("processing", 'unprocessed') - tablename = pathjoin(tablepath, tablename) - return tablename - -def load_table(tablename=None, tabletype=None, joinsymb='|', verbose=False, +def load_table(tablename=None, tabletype=None, night=None, joinsymb='|', verbose=False, process_mixins=True, use_specprod=True, suppress_logging=False): """ Workflow function to read in exposure, processing, and unprocessed tables. It allows for multi-valued table cells, which are @@ -283,6 +260,8 @@ def load_table(tablename=None, tabletype=None, joinsymb='|', verbose=False, tabletype : str 'exptable', 'proctable', or 'unproctable'. Used if tablename is None to get the default name for the type of table. Also used to get the column datatypes and defaults. + night : int or None + Must be provided if tablename is not provided. The night of the table you want to open. joinsymb : str The symbol used to join values in a list/array when saving. Should not be a comma. 
verbose : bool @@ -313,29 +292,31 @@ def load_table(tablename=None, tabletype=None, joinsymb='|', verbose=False, if tabletype is not None: tabletype = standardize_tabletype(tabletype) + if tablename is None and night is None: + log.error("Must specify either tablename or night in load_table()") + return None + if tabletype is None and tablename is None: + log.error("Must specify either tablename or tabletype in load_table()") + return None if tablename is None: + tablename = findfile(tabletype, night=night) + + if tabletype is None: + if not suppress_logging: + log.info("tabletype not given in load_table(), trying to guess based on filename") + filename = os.path.split(tablename)[-1] + if 'exp' in filename or 'etable' in filename: + tabletype = 'exptable' + elif 'unproc' in filename: + tabletype = 'unproctable' + elif 'proc' in filename or 'ptable' in filename: + tabletype = 'proctable' + if tabletype is None: - log.error("Must specify either tablename or tabletype in load_table()") - return None + log.warning(f"Couldn't identify type based on filename {filename}") else: - tablename = translate_type_to_pathname(tabletype, use_specprod=use_specprod) - else: - if tabletype is None: if not suppress_logging: - log.info("tabletype not given in load_table(), trying to guess based on filename") - filename = os.path.split(tablename)[-1] - if 'exp' in filename or 'etable' in filename: - tabletype = 'exptable' - elif 'unproc' in filename: - tabletype = 'unproctable' - elif 'proc' in filename or 'ptable' in filename: - tabletype = 'proctable' - - if tabletype is None: - log.warning(f"Couldn't identify type based on filename {filename}") - else: - if not suppress_logging: - log.info(f"Based on filename {filename}, identified type as {tabletype}") + log.info(f"Based on filename {filename}, identified type as {tabletype}") if os.path.isfile(tablename): if not suppress_logging: From c13406d001e67a2f935481ab108e5ca597093eec Mon Sep 17 00:00:00 2001 From: akremin Date: Thu, 8 Aug 2024 23:00:39 -0700 Subject: [PATCH 04/27] modernize desi_run_prod and add get_jobs_in_queue --- bin/desi_run_prod | 11 +- py/desispec/scripts/submit_prod.py | 265 ++++++++++++++++++++++------- py/desispec/workflow/queue.py | 108 ++++++++++++ 3 files changed, 314 insertions(+), 70 deletions(-) diff --git a/bin/desi_run_prod b/bin/desi_run_prod index 94bd46a3b..460d3dd56 100755 --- a/bin/desi_run_prod +++ b/bin/desi_run_prod @@ -16,11 +16,12 @@ def parse_args(): # options=None): help="Relative or absolute pathname to the yaml file summarizing the production.") # Code Flags - parser.add_argument("--dry-run", action="store_true", - help="Perform a dry run where no jobs are actually created or submitted.") - parser.add_argument("--error-if-not-available", action="store_true", - help="Raise an error instead of reporting and moving on if an exposure "+\ - "table doesn't exist.") + parser.add_argument("--dry-run-level", type=int, default=0, + help="Perform a dry run where no jobs are actually created or submitted." 
+                             + " Give the --dry-run-level value to pass to desi_proc_night.")
+    # parser.add_argument("--error-if-not-available", action="store_true",
+    #                     help="Raise an error instead of reporting and moving on if an exposure "+\
+    #                          "table doesn't exist.")
 
     args = parser.parse_args()
 
diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py
index 603f298da..6e8512574 100644
--- a/py/desispec/scripts/submit_prod.py
+++ b/py/desispec/scripts/submit_prod.py
@@ -9,109 +9,244 @@ import sys
 import time
 import re
+import glob
+
+from desiutil.log import get_logger
+from desispec.io import findfile
+from desispec.scripts.proc_night import proc_night
 
 ## Import some helper functions, you can see their definitions by uncomenting the bash shell command
 from desispec.workflow.utils import verify_variable_with_environment, listpath
+from desispec.workflow.redshifts import read_minimal_exptables_columns
 from desispec.scripts.submit_night import submit_night
+from desispec.workflow.queue import check_queue_count
 
 
-def assign_survey(night, conf):
+def get_all_nights(first_night, last_night):
     """
-    Takes a desi production configuration (yaml) dictionary and determines
-    the survey corresponding to a given night based on the contents of the conf
-    dictionary, if psosible. Otherwise returns None.
+    Returns a full list of all nights that have an exposure table
+    on disk
 
-    Args:
-        night (int): The night you want to know the survey it corresponds to.
-        conf (dict): Dictionary that returned when the configuration yaml file was read in.
+    Inputs:
+        first_night, int. First night to include (inclusive).
+        last_night, int. Last night to include (inclusive).
 
     Returns:
-        survey, str. The survey the night was taken under, according to the conf file.
+        nights, list. A list of nights on or after Jan 1 2020 in which data exists at NERSC.
     """
-    for survey in conf['DateRanges']:
-        first, last = conf['DateRanges'][survey]
-        if night >= first and night <= last:
-            return survey
-    else:
-        return None
+    etab_path = findfile('exptable', night='99999999', readonly=True)
+    glob_path = etab_path.replace('99999999', '202?????').replace('999999', '202???')
+    etab_files = sorted(glob.glob(glob_path))
+    nights = []
+    for fn in etab_files:
+        # - the night is the 202YMMDD embedded in the exposure table filename
+        match = re.search(r'(202\d{5})', os.path.basename(fn))
+        if match is not None:
+            nights.append(int(match.group(1)))
+    nights = np.array(nights)
+    nights = nights[((nights >= first_night) & (nights <= last_night))]
+    return nights
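Note on the filename handling in get_all_nights: glob.glob() returns full pathnames, so the 202YMMDD night has to be pulled out of the exposure-table filename rather than matched against the whole path. A minimal sketch of the extraction, with an illustrative pathname:

    import os, re

    fn = '/specprod/exposure_tables/202012/exposure_table_20201214.csv'  # illustrative
    match = re.search(r'(202\d{5})', os.path.basename(fn))
    if match is not None:
        night = int(match.group(1))  # -> 20201214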
""" - nights = list() - for n in listpath(os.getenv('DESI_SPECTRO_DATA')): - # - nights are 202YMMDD - if re.match('^202\d{5}$', n): - nights.append(int(n)) + # TODO when merged into branch with crossnight dependencies this function changes name + fulletab = read_minimal_exptables_columns() + nights = np.unique(fulletab['NIGHT']) + nights = nights[((nights>=first_night)&(nights<=last_night))] return nights -def submit_production(production_yaml, dry_run=False, error_if_not_available=False): +def submit_production(production_yaml, dry_run_level=False): """ Interprets a production_yaml file and submits the respective nights for processing within the defined production. Args: production_yaml (str): Pathname of the yaml file that defines the production. - dry_run (bool, optional): Default is False. Should the jobs written to the processing table actually be submitted - for processing. - error_if_not_available (bool, optional): Default is True. Raise as error if the required exposure table doesn't exist, - otherwise prints an error and returns. + dry_run_level (int, optional): Default is 0. Should the jobs written to the processing table actually be submitted + for processing. This is passed directly to desi_proc_night. Returns: None. """ + log = get_logger() + ## Load the yaml file if not os.path.exists(production_yaml): - raise IOError(f"Prod Yaml file doesn't exist: {production_yaml} not found. Exiting.") - conf = yaml.safe_load(open(production_yaml, 'rb')) - specprod = str(conf['name']).lower() - specprod = verify_variable_with_environment(var=specprod, var_name='specprod', env_name='SPECPROD') - if 'reservation' in conf: - reservation = str(conf['reservation']) - if reservation.lower() == 'none': - reservation = None + raise IOError(f"Prod yaml file doesn't exist: {production_yaml} not found.") + with open(production_yaml, 'rb') as yamlfile: + conf = yaml.safe_load(yamlfile) + + ## Make sure the specprod matches, if not set it to that in the file + if 'SPECPROD' not in conf: + raise ValueError(f"SPECPROD required in yaml file {production_yaml}") + specprod = str(conf['SPECPROD']).lower() + specprod = verify_variable_with_environment(var=specprod, var_name='specprod', + env_name='SPECPROD') + user = os.environ['USER'] + + ## Look for sentinal + sentinel_file = os.path.join(os.environ['DESI_SPECTRO_REDUX'], + os.environ['SPECPROD'], 'run', 'sentinel.txt') + if os.path.exists(sentinel_file): + log.info(f"Sentinel file {sentinel_file} exists, therefore all " + + f"nights already submitted.") + return 0 + + ## Load the nights to process + all_nights, first_night = None, None + if 'NIGHTS' in conf and 'LAST_NIGHT' in conf: + log.error(f"Both NIGHTS and LAST_NIGHT specified. 
 
-def submit_production(production_yaml, dry_run=False, error_if_not_available=False):
+def submit_production(production_yaml, dry_run_level=False):
     """
     Interprets a production_yaml file and submits the respective nights for processing
     within the defined production.
 
     Args:
         production_yaml (str): Pathname of the yaml file that defines the production.
-        dry_run (bool, optional): Default is False. Should the jobs written to the processing table actually be submitted
-            for processing.
-        error_if_not_available (bool, optional): Default is True. Raise as error if the required exposure table doesn't exist,
-            otherwise prints an error and returns.
+        dry_run_level (int, optional): Default is 0. Controls whether the jobs written to the processing table
+            are actually submitted for processing. This is passed directly to desi_proc_night.
 
     Returns:
         None.
     """
+    log = get_logger()
+
+    ## Load the yaml file
     if not os.path.exists(production_yaml):
-        raise IOError(f"Prod Yaml file doesn't exist: {production_yaml} not found. Exiting.")
-    conf = yaml.safe_load(open(production_yaml, 'rb'))
-    specprod = str(conf['name']).lower()
-    specprod = verify_variable_with_environment(var=specprod, var_name='specprod', env_name='SPECPROD')
-    if 'reservation' in conf:
-        reservation = str(conf['reservation'])
-        if reservation.lower() == 'none':
-            reservation = None
+        raise IOError(f"Prod yaml file doesn't exist: {production_yaml} not found.")
+    with open(production_yaml, 'rb') as yamlfile:
+        conf = yaml.safe_load(yamlfile)
+
+    ## Make sure the specprod matches; if not, set it to the value in the file
+    if 'SPECPROD' not in conf:
+        raise ValueError(f"SPECPROD required in yaml file {production_yaml}")
+    specprod = str(conf['SPECPROD']).lower()
+    specprod = verify_variable_with_environment(var=specprod, var_name='specprod',
+                                                env_name='SPECPROD')
+    user = os.environ['USER']
+
+    ## Look for the sentinel file
+    sentinel_file = os.path.join(os.environ['DESI_SPECTRO_REDUX'],
+                                 os.environ['SPECPROD'], 'run', 'sentinel.txt')
+    if os.path.exists(sentinel_file):
+        log.info(f"Sentinel file {sentinel_file} exists, therefore all "
+                 + f"nights already submitted.")
+        return 0
+
+    ## Load the nights to process
+    all_nights, first_night = None, None
+    if 'NIGHTS' in conf and 'LAST_NIGHT' in conf:
+        log.error(f"Both NIGHTS and LAST_NIGHT specified. Using NIGHTS "
+                  + f"and ignoring LAST_NIGHT.")
+    if 'NIGHTS' in conf:
+        all_nights = np.array(list(conf['NIGHTS'])).astype(int)
+        log.info(f"Setting all_nights to NIGHTS: {all_nights}")
+        log.info("Setting first_night to earliest night in NIGHTS:"
+                 + f" {np.min(all_nights)}")
+        first_night = np.min(all_nights)
+        log.info("Setting last_night to latest night in NIGHTS: "
+                 + f"{np.max(all_nights)}")
+        last_night = np.max(all_nights)
+    elif 'LAST_NIGHT' in conf:
+        last_night = int(conf['LAST_NIGHT'])
+        log.info(f"Setting last_night to LATEST_NIGHT: {last_night}")
     else:
-        reservation = None
-    if 'queue' in conf:
-        queue = conf['queue']
+        raise ValueError("Either NIGHT or LAST_NIGHT required in yaml "
+                         + f"file {production_yaml}")
+
+    if first_night is None:
+        if 'FIRST_NIGHT' in conf:
+            first_night = int(conf['FIRST_NIGHT'])
+            log.info(f"Setting first_night to FIRST_NIGHT: {first_night}")
+        else:
+            log.info("Setting first_night to earliest in a normal prod: 20201214")
+            first_night = 20201214
+
+    if all_nights is None:
+        # all_nights = get_all_nights(first_night, last_night)
+        log.info("Populating all_nights with all of the nights with valid science "
+                 + f"exposures between {first_night} and {last_night} inclusive")
+        all_nights = get_all_valid_nights(first_night, last_night)
+
+    ## Load the other parameters for running desi_proc_night
+    if 'THRU_NIGHT' in conf:
+        thru_night = int(conf['THRU_NIGHT'])
+        log.info(f"Setting thru_night to THRU_NIGHT: {thru_night}")
+    else:
+        thru_night = last_night
+        log.warning(f"Setting thru_night to last_night: {thru_night}")
+
+    no_redshifts = False
+    if 'Z_SUBMIT_TYPES' in conf:
+        z_submit_types_str = str(conf['Z_SUBMIT_TYPES'])
+        if z_submit_types_str.lower() in ['false', 'none']:
+            z_submit_types = None
+            no_redshifts = True
+        else:
+            z_submit_types = [ztype.strip().lower() for ztype in
+                              z_submit_types_str.split(',')]
     else:
-        queue = 'realtime'
+        z_submit_types = None
 
-    if 'OVERWRITEEXISTING' in conf:
-        overwrite_existing = conf['OVERWRITEEXISTING']
+    if 'SURVEYS' in conf:
+        surveys_str = str(conf['SURVEYS'])
+        if surveys_str.lower() in ['false', 'none']:
+            surveys = None
+        else:
+            surveys = [survey.strip().lower() for survey in
+                       surveys_str.split(',')]
     else:
-        overwrite_existing = False
+        surveys = None
+
+    ## Bring in the queue and reservation information, if any
+    if 'QUEUE' in conf:
+        queue = conf['QUEUE']
+    else:
+        queue = 'regular'
 
-    print(f'Using queue: {queue}')
+    if 'RESERVATION' in conf:
+        reservation = str(conf['RESERVATION'])
+        if reservation.lower() == 'none':
+            reservation = None
+    else:
+        reservation = None
+
+    ## Let user know what was defined
+    if z_submit_types is not None:
+        log.info(f'Using z_submit_types: {z_submit_types}')
+    if surveys is not None:
+        log.info(f'Using surveys: {surveys}')
+    log.info(f'Using queue: {queue}')
     if reservation is not None:
-        print(f'Using reservation: {reservation}')
-    if overwrite_existing:
-        print("Ignoring the fact that files exists and submitting those nights anyway")
+        log.info(f'Using reservation: {reservation}')
 
-    all_nights = get_all_nights()
-    non_survey_nights = []
+    ## Do the main processing
+    finished = False
+    processed_nights, skipped_nights = [], []
     for night in all_nights:
-        survey = assign_survey(night, conf)
-        if survey is None:
-            non_survey_nights.append(night)
-            continue
-        elif survey in conf['ProcessData'] and conf['ProcessData'][survey] is False:
-            print(f'Asked not to process survey: {survey}, Not processing night={night}.', '\n\n\n')
-            continue
-        elif survey in 
conf['SkipNights'] and night in conf['SkipNights'][survey]: - print(f'Asked to skip night={night} (in survey: {survey}). Skipping.', '\n\n\n') + num_in_queue = check_queue_count(user=user, include_scron=False, + dry_run_level=dry_run_level) + ## In Jura the largest night had 115 jobs, to be conservative say 200 + if num_in_queue > 4800: + break + if os.path.exists(findfile('proctable', night=night, readonly=True)): + skipped_nights.append(night) continue + ## We don't expect exposure tables to change during code execution here + ## but we do expect processing tables to evolve, so clear that cache + # TODO uncomment when merged into branch with crossnight dependencies + #desispec.workflow.proctable.reset_tilenight_ptab_cache() + if dry_run_level < 4: + proc_night(night=night, z_submit_types=z_submit_types, + no_redshifts=no_redshifts, + complete_tiles_thrunight=thru_night, + surveys=surveys, dry_run_level=dry_run_level, + queue=queue, reservation=reservation) else: - print(f'Processing {survey} night: {night}') - submit_night(night, proc_obstypes=None, dry_run=dry_run, queue=queue, reservation=reservation, - overwrite_existing=overwrite_existing, error_if_not_available=error_if_not_available) - print(f"Completed {night}. Sleeping for 30s") - time.sleep(30) - - print("Skipped the following nights that were not assigned to a survey:") - print(non_survey_nights, '\n\n\n') - print("All nights submitted") + log.info(f"{dry_run_level=} so not running desi_proc_night. " + + f"Would have run for {night=}") + + processed_nights.append(night) + # proc_night(night=None, proc_obstypes=None, z_submit_types=None, + # queue=None, reservation=None, system_name=None, + # exp_table_pathname=None, proc_table_pathname=None, + # override_pathname=None, update_exptable=False, + # dry_run_level=0, dry_run=False, no_redshifts=False, + # ignore_proc_table_failures=False, + # dont_check_job_outputs=False, + # dont_resubmit_partial_jobs=False, + # tiles=None, surveys=None, science_laststeps=None, + # all_tiles=False, specstatus_path=None, use_specter=False, + # no_cte_flats=False, complete_tiles_thrunight=None, + # all_cumulatives=False, daily=False, specprod=None, + # path_to_data=None, exp_obstypes=None, camword=None, + # badcamword=None, badamps=None, exps_to_ignore=None, + # sub_wait_time=0.1, verbose=False, + # dont_require_cals=False, + # psf_linking_without_fflat=False, + # still_acquiring=False) + log.info(f"Completed {night}.") + else: + ## I.e. 
if the above loop didn't "break" because of exceeding the queue
+        ## and all nights finished
+        finished = True
+        # write the sentinel
+        with open(sentinel_file, 'w') as sentinel:
+            sentinel.write(f"All done with processing for {production_yaml}")
+            sentinel.write(f"Nights processed: {all_nights}")
+
+    log.info("Skipped the following nights that already had a processing table:")
+    log.info(skipped_nights)
+    log.info("Processed the following nights:")
+    log.info(processed_nights)
+    if finished:
+        log.info('\n\n\n')
+        log.info("All nights submitted")
diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py
index ba9c0b7d3..b3ef09cf0 100644
--- a/py/desispec/workflow/queue.py
+++ b/py/desispec/workflow/queue.py
@@ -452,3 +452,111 @@ def any_jobs_failed(statuses, failed_states=None):
     if failed_states is None:
         failed_states = get_failed_states()
     return np.any([status in failed_states for status in statuses])
+
+def get_jobs_in_queue(user=None, include_scron=False, dry_run_level=0):
+    """
+    Queries the Slurm scheduler with squeue to get information about the
+    jobs currently in the queue for a given user.
+
+    Parameters
+    ----------
+    user : str
+        NERSC user to query the jobs for
+    include_scron : bool
+        True if you want to include scron entries in the returned table.
+        Default is False.
+    dry_run_level : int
+        Whether this is a simulated run or real run. If nonzero, it is a
+        simulation and it returns a default table that doesn't query the
+        Slurm scheduler.
+
+    Returns
+    -------
+    Table
+        Table with the columns JOBID, PARTITION, NAME, USER, ST, TIME, NODES,
+        NODELIST(REASON) for the specified user.
+    """
+    log = get_logger()
+
+    cmd = f'squeue -u {user} -o "%i,%P,%j,%u,%t,%M,%D,%R"'
+    cmd_as_list = cmd.split()
+
+    if dry_run_level > 0:
+        log.info("Dry run, would have otherwise queried Slurm with the"
+                 +f" following: {' '.join(cmd_as_list)}")
+        string = 'JOBID,PARTITION,NAME,USER,ST,TIME,NODES,NODELIST(REASON)'
+        string += f"\n27650097,cron,scron_ar,{user},PD,0:00,1,(BeginTime)"
+        string += f"\n27650100,cron,scron_nh,{user},PD,0:00,1,(BeginTime)"
+        string += f"\n27650098,cron,scron_up,{user},PD,0:00,1,(BeginTime)"
+        string += f"\n29078887,gpu_ss11,tilenight-20230413-24315,{user},PD,0:00,1,(Priority)"
+        string += f"\n29078892,gpu_ss11,tilenight-20230413-21158,{user},PD,0:00,1,(Priority)"
+        string += f"\n29079325,gpu_ss11,tilenight-20240309-24526,{user},PD,0:00,1,(Dependency)"
+        string += f"\n29079322,gpu_ss11,ztile-22959-thru20240309,{user},PD,0:00,1,(Dependency)"
+        string += f"\n29078883,gpu_ss11,tilenight-20230413-21187,{user},R,10:18,1,nid003960"
+        string += f"\n29079242,regular_milan_ss11,arc-20240309-00229483-a0123456789,{user},PD,0:00,3,(Priority)"
+        string += f"\n29079246,regular_milan_ss11,arc-20240309-00229484-a0123456789,{user},PD,0:00,3,(Priority)"
+
+        # create command to run to exercise subprocess -> stdout parsing
+        cmd = 'echo ' + string
+        cmd_as_list = ['echo', string]
+    else:
+        log.info(f"Querying Slurm with the following: {' '.join(cmd_as_list)}")
+
+    #- squeue sometimes fails; try several times before giving up
+    max_attempts = 3
+    for attempt in range(max_attempts):
+        try:
+            table_as_string = subprocess.check_output(cmd_as_list, text=True,
+                                                      stderr=subprocess.STDOUT)
+            break
+        except subprocess.CalledProcessError as err:
+            log.error(f'{cmd} job query failure at {datetime.datetime.now()}')
+            log.error(f'{cmd_as_list}')
+            log.error(f'{err.output=}')
+    else: #- for/else happens if loop doesn't succeed
+        msg = f'{cmd} query failed {max_attempts} times; exiting'
+        log.critical(msg)
+        raise RuntimeError(msg)
+
+    queue_info_table = Table.read(table_as_string, format='ascii.csv')
+    for col in queue_info_table.colnames:
+        queue_info_table.rename_column(col, col.upper())
+
+    if np.any(queue_info_table['USER']!=user):
+        msg = f"Warning {np.sum(queue_info_table['USER']!=user)} " \
+              + f"jobs returned were not {user=}\n" \
+              + f"{queue_info_table['USER'][queue_info_table['USER']!=user]}"
+        log.critical(msg)
+        raise ValueError(msg)
+
+    if not include_scron:
+        queue_info_table = queue_info_table[queue_info_table['PARTITION'] != 'cron']
+
+    return queue_info_table
+
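A sketch of how this helper is meant to be exercised (user name illustrative); with a nonzero dry_run_level it parses the canned echo output above instead of touching Slurm:

    from desispec.workflow.queue import get_jobs_in_queue

    jobs = get_jobs_in_queue(user='desi', dry_run_level=1)
    pending = jobs[jobs['ST'] == 'PD']  # squeue state codes: PD pending, R running
    print(len(jobs), len(pending))      # scron rows already filtered out by default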
+def check_queue_count(user=None, include_scron=False, dry_run_level=0):
+    """
+    Counts the number of jobs currently in the Slurm queue for a given
+    user, using get_jobs_in_queue.
+
+    Parameters
+    ----------
+    user : str
+        NERSC user to query the jobs for
+    include_scron : bool
+        True if you want to include scron entries in the returned table.
+        Default is False.
+    dry_run_level : int
+        Whether this is a simulated run or real run. If nonzero, it is a
+        simulation and it returns a default table that doesn't query the
+        Slurm scheduler.
+
+    Returns
+    -------
+    int
+        The number of jobs for that user in the queue (including or excluding
+        scron entries depending on include_scron).
+    """
+    return len(get_jobs_in_queue(user=user, include_scron=include_scron,
+                                 dry_run_level=dry_run_level))
\ No newline at end of file
From 24a8112ae944355360d3e0e9d12cb0a4180d43d0 Mon Sep 17 00:00:00 2001
From: kremin
Date: Fri, 9 Aug 2024 13:20:16 -0700
Subject: [PATCH 05/27] bug fix when appending crossnights
---
 py/desispec/workflow/processing.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py
index 4693c81b2..0a07fe5e8 100644
--- a/py/desispec/workflow/processing.py
+++ b/py/desispec/workflow/processing.py
@@ -863,14 +863,16 @@ def assign_dependency(prow, dependency):
     if type(dependency) in [list, np.array]:
         ids, qids = [], []
         for curdep in dependency:
+            ids.append(curdep['INTID'])
             if still_a_dependency(curdep):
-                ids.append(curdep['INTID'])
+                # ids.append(curdep['INTID'])
                 qids.append(curdep['LATEST_QID'])
         prow['INT_DEP_IDS'] = np.array(ids, dtype=int)
         prow['LATEST_DEP_QID'] = np.array(qids, dtype=int)
-    elif type(dependency) in [dict, OrderedDict, Table.Row] and still_a_dependency(dependency):
+    elif type(dependency) in [dict, OrderedDict, Table.Row]:
         prow['INT_DEP_IDS'] = np.array([dependency['INTID']], dtype=int)
-        prow['LATEST_DEP_QID'] = np.array([dependency['LATEST_QID']], dtype=int)
+        if still_a_dependency(dependency):
+            prow['LATEST_DEP_QID'] = np.array([dependency['LATEST_QID']], dtype=int)
     return prow
 
 def still_a_dependency(dependency):
@@ -1615,7 +1617,7 @@ def submit_redshifts(ptable, prows, tnight, internal_id, queue, reservation,
         if matched_prows is not None:
             matched_prows = matched_prows[matched_prows['NIGHT'] <= night]
             for prow in matched_prows:
-                if matched_prows['INTID'] != tnight['INTID']:
+                if prow['INTID'] != tnight['INTID']:
                     tnights.append(prow)
         log.info(f"Internal Processing IDs: {[prow['INTID'] for prow in tnights]}.\n")
         ## Identify all exposures that should go into the fit
From 455e3cb4c64241b68d7be3cd2c3094adf1829db5 Mon Sep 17 00:00:00 2001
From: akremin
Date: Sun, 11 Aug 2024 00:42:42 -0700
Subject: [PATCH 06/27] Improve logging in submit_prod
---
 py/desispec/scripts/submit_prod.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git 
a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index 6e8512574..a36a8415b 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -144,6 +144,8 @@ def submit_production(production_yaml, dry_run_level=False): thru_night = last_night log.warning(f"Setting thru_night to last_night: {thru_night}") + ## note that None defaults to "cumulative" when handed to desi_proc_night + ## unless no_redshifts is True no_redshifts = False if 'Z_SUBMIT_TYPES' in conf: z_submit_types_str = str(conf['Z_SUBMIT_TYPES']) @@ -191,17 +193,22 @@ def submit_production(production_yaml, dry_run_level=False): ## Do the main processing finished = False processed_nights, skipped_nights = [], [] - for night in all_nights: + all_nights = sorted(all_nights) + log.info(f"Processing {all_nights=}") + for night in sorted(all_nights): num_in_queue = check_queue_count(user=user, include_scron=False, dry_run_level=dry_run_level) ## In Jura the largest night had 115 jobs, to be conservative say 200 if num_in_queue > 4800: + log.info(f"{num_in_queue} jobs in the queue, so stopping the job submissions.") break if os.path.exists(findfile('proctable', night=night, readonly=True)): skipped_nights.append(night) + log.info(f"{night=} already has a proctable, skipping.") continue ## We don't expect exposure tables to change during code execution here ## but we do expect processing tables to evolve, so clear that cache + log.info(f"Processing {night=}") # TODO uncomment when merged into branch with crossnight dependencies #desispec.workflow.proctable.reset_tilenight_ptab_cache() if dry_run_level < 4: @@ -233,7 +240,7 @@ def submit_production(production_yaml, dry_run_level=False): # dont_require_cals=False, # psf_linking_without_fflat=False, # still_acquiring=False) - log.info(f"Completed {night}.") + log.info(f"Completed {night=}.") else: ## I.e. 
if the above loop didn't "break" because of exceeding the queue ## and all nights finished From 335b20ea94e0fda19ba3f38f730961a4cf1ad408 Mon Sep 17 00:00:00 2001 From: akremin Date: Sun, 11 Aug 2024 01:00:13 -0700 Subject: [PATCH 07/27] Redirect proc_night outputs to individual logs in submit_prod --- py/desispec/scripts/submit_prod.py | 32 ++++++++++++++++++++++-------- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index a36a8415b..a6902f5e7 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -11,6 +11,7 @@ import re import glob +from desispec.parallel import stdouterr_redirected from desiutil.log import get_logger from desispec.io import findfile from desispec.scripts.proc_night import proc_night @@ -190,6 +191,14 @@ def submit_production(production_yaml, dry_run_level=False): if reservation is not None: log.info(f'Using reservation: {reservation}') + ## Define log location + logpath = os.path.join(os.environ['DESI_SPECTRO_REDUX'], + os.environ['SPECPROD'], 'run', 'logs') + if dry_run_level < 4: + os.makedirs(logpath, exist_ok=True) + else: + log.info(f"{dry_run_level=} so not creating {logpath}") + ## Do the main processing finished = False processed_nights, skipped_nights = [], [] @@ -212,11 +221,13 @@ def submit_production(production_yaml, dry_run_level=False): # TODO uncomment when merged into branch with crossnight dependencies #desispec.workflow.proctable.reset_tilenight_ptab_cache() if dry_run_level < 4: - proc_night(night=night, z_submit_types=z_submit_types, - no_redshifts=no_redshifts, - complete_tiles_thrunight=thru_night, - surveys=surveys, dry_run_level=dry_run_level, - queue=queue, reservation=reservation) + logfile = os.path.join(logpath, f'night-{night}.log') + with stdouterr_redirected(logfile): + proc_night(night=night, z_submit_types=z_submit_types, + no_redshifts=no_redshifts, + complete_tiles_thrunight=thru_night, + surveys=surveys, dry_run_level=dry_run_level, + queue=queue, reservation=reservation) else: log.info(f"{dry_run_level=} so not running desi_proc_night. 
" + f"Would have run for {night=}") @@ -246,9 +257,14 @@ def submit_production(production_yaml, dry_run_level=False): ## and all nights finished finished = True # write the sentinel - with open(sentinel_file, 'w') as sentinel: - sentinel.write(f"All done with processing for {production_yaml}") - sentinel.write(f"Nights processed: {all_nights}") + if dry_run_level < 4: + with open(sentinel_file, 'w') as sentinel: + sentinel.write( + f"All done with processing for {production_yaml}") + sentinel.write(f"Nights processed: {all_nights}") + else: + log.info(f"{dry_run_level=} so not creating {sentinel_file}") + log.info("Skipped the following nights that already had a processing table:") log.info(skipped_nights) From 59f95e6e0c844becf07409c97a4de34646021e2a Mon Sep 17 00:00:00 2001 From: kremin Date: Mon, 12 Aug 2024 14:42:39 -0700 Subject: [PATCH 08/27] improve squeue handling and add zs by default to submit_prod --- bin/{desi_run_prod => desi_submit_prod} | 0 py/desispec/scripts/submit_prod.py | 6 +++--- py/desispec/workflow/queue.py | 21 +++++++++++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) rename bin/{desi_run_prod => desi_submit_prod} (100%) diff --git a/bin/desi_run_prod b/bin/desi_submit_prod similarity index 100% rename from bin/desi_run_prod rename to bin/desi_submit_prod diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index a6902f5e7..253aefcdd 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -145,8 +145,8 @@ def submit_production(production_yaml, dry_run_level=False): thru_night = last_night log.warning(f"Setting thru_night to last_night: {thru_night}") - ## note that None defaults to "cumulative" when handed to desi_proc_night - ## unless no_redshifts is True + ## If not specified, run "cumulative" redshifts, otherwise do + ## as directed no_redshifts = False if 'Z_SUBMIT_TYPES' in conf: z_submit_types_str = str(conf['Z_SUBMIT_TYPES']) @@ -157,7 +157,7 @@ def submit_production(production_yaml, dry_run_level=False): z_submit_types = [ztype.strip().lower() for ztype in z_submit_types_str.split(',')] else: - z_submit_types = None + z_submit_types = ['cumulative'] if 'SURVEYS' in conf: surveys_str = str(conf['SURVEYS']) diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py index b3ef09cf0..21864b5d5 100644 --- a/py/desispec/workflow/queue.py +++ b/py/desispec/workflow/queue.py @@ -4,6 +4,7 @@ """ import os +import re import numpy as np from astropy.table import Table, vstack import subprocess @@ -518,7 +519,23 @@ def get_jobs_in_queue(user=None, include_scron=False, dry_run_level=0): log.critical(msg) raise RuntimeError(msg) - queue_info_table = Table.read(table_as_string, format='ascii.csv') + ## remove extra quotes that astropy table does't like + table_as_string = table_as_string.replace('"','') + + ## remove parenthesis are also not very desirable + table_as_string = table_as_string.replace('(', '').replace(')', '') + + + ## remove node list with hyphen or comma otherwise it will break table reader + table_as_string = re.sub(r"nid\[[0-9,-]*\]", "multiple nodes", table_as_string) + + try: + queue_info_table = Table.read(table_as_string, format='ascii.csv') + except: + log.info("Table retured by squeue couldn't be parsed. 
The string was:") + print(table_as_string) + raise + for col in queue_info_table.colnames: queue_info_table.rename_column(col, col.upper()) @@ -559,4 +576,4 @@ def check_queue_count(user=None, include_scron=False, dry_run_level=0): scron entries depending on include_scron). """ return len(get_jobs_in_queue(user=user, include_scron=include_scron, - dry_run_level=dry_run_level)) \ No newline at end of file + dry_run_level=dry_run_level)) From c314650cd3d1ceddc785b7856816245cc89d8521 Mon Sep 17 00:00:00 2001 From: kremin Date: Wed, 14 Aug 2024 16:04:15 -0700 Subject: [PATCH 09/27] move prod night parsing to a func --- bin/desi_submit_prod | 5 +- py/desispec/scripts/submit_prod.py | 119 +++++++++++++++++++---------- py/desispec/workflow/queue.py | 5 ++ 3 files changed, 86 insertions(+), 43 deletions(-) diff --git a/bin/desi_submit_prod b/bin/desi_submit_prod index 460d3dd56..9c3102a92 100755 --- a/bin/desi_submit_prod +++ b/bin/desi_submit_prod @@ -12,8 +12,11 @@ def parse_args(): # options=None): """ parser = argparse.ArgumentParser(description="Submit a full production run of the DESI data pipeline for processing.") - parser.add_argument("--production-yaml", type=str, required=True, + parser.add_argument("-p", "--production-yaml", type=str, required=True, help="Relative or absolute pathname to the yaml file summarizing the production.") + parser.add_argument("-p", "--queue-threshold", type=int, default=4800, + help="The number of jobs for the current user in the queue at which the" + + " at which the script stops submitting new jobs.") # Code Flags parser.add_argument("--dry-run-level", type=int, default=0, diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index 253aefcdd..eef83700e 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -22,12 +22,12 @@ from desispec.workflow.queue import check_queue_count -def get_all_nights(first_night, last_night): +def get_nights_in_date_range(first_night, last_night): """ Returns a full list of all nights that have an exposure table exposure - Inputs: + Args: first_night, int. First night to include (inclusive). last_night, int. Last night to include (inclusive). @@ -52,7 +52,7 @@ def get_all_valid_nights(first_night, last_night): Returns a full list of all nights that have at least one valid science exposure - Inputs: + Args: first_night, int. First night to include (inclusive). last_night, int. Last night to include (inclusive). @@ -65,14 +65,80 @@ def get_all_valid_nights(first_night, last_night): nights = nights[((nights>=first_night)&(nights<=last_night))] return nights +def get_nights_to_process(production_yaml, verbose=False): + """ + Derives the nights to be processed based on a production yaml file and + returns a list of int nights. + + Args: + production_yaml (str or dict): Production yaml or pathname of the + yaml file that defines the production. + verbose (bool): Whether to be verbose in log outputs. + + Returns: + nights, list. A list of nights on or after Jan 1 2020 in which data exists at NERSC. 
+def get_nights_to_process(production_yaml, verbose=False):
+    """
+    Derives the nights to be processed based on a production yaml file and
+    returns a list of int nights.
+
+    Args:
+        production_yaml (str or dict): Production yaml or pathname of the
+            yaml file that defines the production.
+        verbose (bool): Whether to be verbose in log outputs.
+
+    Returns:
+        nights, list. A list of nights on or after Jan 1 2020 in which data exists at NERSC.
+    """
+    log = get_logger()
+    ## If production_yaml not loaded, load the file
+    if isinstance(production_yaml, str):
+        if not os.path.exists(production_yaml):
+            raise IOError(f"Prod yaml file doesn't exist: {production_yaml} not found.")
+        with open(production_yaml, 'rb') as yamlfile:
+            config = yaml.safe_load(yamlfile)
+    else:
+        config = production_yaml
+
+    all_nights, first_night = None, None
+    if 'NIGHTS' in config and 'LAST_NIGHT' in config:
+        log.error(f"Both NIGHTS and LAST_NIGHT specified. Using NIGHTS "
+                  + f"and ignoring LAST_NIGHT.")
+    if 'NIGHTS' in config:
+        all_nights = np.array(list(config['NIGHTS'])).astype(int)
+        if verbose:
+            log.info(f"Setting all_nights to NIGHTS: {all_nights}")
+            log.info("Setting first_night to earliest night in NIGHTS:"
+                     + f" {np.min(all_nights)}")
+        first_night = np.min(all_nights)
+        if verbose:
+            log.info("Setting last_night to latest night in NIGHTS: "
+                     + f"{np.max(all_nights)}")
+        last_night = np.max(all_nights)
+    elif 'LAST_NIGHT' in config:
+        last_night = int(config['LAST_NIGHT'])
+        if verbose:
+            log.info(f"Setting last_night to LAST_NIGHT: {last_night}")
+    else:
+        raise ValueError("Either NIGHTS or LAST_NIGHT required in yaml "
+                         + f"file {production_yaml}")
+
+    if first_night is None:
+        if 'FIRST_NIGHT' in config:
+            first_night = int(config['FIRST_NIGHT'])
+            if verbose:
+                log.info(f"Setting first_night to FIRST_NIGHT: {first_night}")
+        else:
+            if verbose:
+                log.info("Setting first_night to earliest in a normal prod: 20201214")
+            first_night = 20201214
+
+    if all_nights is None:
+        # all_nights = get_nights_in_date_range(first_night, last_night)
+        if verbose:
+            log.info("Populating all_nights with all of the nights with valid science "
+                     + f"exposures between {first_night} and {last_night} inclusive")
+        all_nights = get_all_valid_nights(first_night, last_night)
+    return sorted(all_nights)
+
 
-def submit_production(production_yaml, dry_run_level=False):
+def submit_production(production_yaml, queue_threshold=4800, dry_run_level=False):
     """
     Interprets a production_yaml file and submits the respective nights for processing
     within the defined production.
 
     Args:
         production_yaml (str): Pathname of the yaml file that defines the production.
+        queue_threshold (int): The number of jobs for the current user in the queue
+            at which the script stops submitting new jobs.
         dry_run_level (int, optional): Default is 0. Controls whether the jobs written to the processing table
             are actually submitted for processing. This is passed directly to desi_proc_night.
@@ -103,47 +169,15 @@ def submit_production(production_yaml, dry_run_level=False):
         return 0
 
     ## Load the nights to process
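The hunk continues below by delegating the removed night-selection block to get_nights_to_process and by honoring the new queue_threshold in the submission loop. The throttling pattern, sketched standalone (helper name ours; threshold and user illustrative):

    from desispec.workflow.queue import check_queue_count
    from desispec.scripts.proc_night import proc_night

    def submit_until_threshold(nights, user, threshold=4800):
        """Submit nights in order until the user's queue grows too large."""
        for night in sorted(nights):
            if check_queue_count(user=user, include_scron=False) > threshold:
                break  # stop here; a rerun resumes, skipping nights that have proctables
            proc_night(night=night)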
Using NIGHTS " - + f"and ignoring LAST_NIGHT.") - if 'NIGHTS' in conf: - all_nights = np.array(list(conf['NIGHTS'])).astype(int) - log.info(f"Setting all_nights to NIGHTS: {all_nights}") - log.info("Setting first_night to earliest night in NIGHTS:" - + f" {np.min(all_nights)}") - first_night = np.min(all_nights) - log.info("Setting last_night to latest night in NIGHTS: " - + f"{np.max(all_nights)}") - last_night = np.max(all_nights) - elif 'LAST_NIGHT' in conf: - last_night = int(conf['LAST_NIGHT']) - log.info(f"Setting last_night to LATEST_NIGHT: {last_night}") - else: - raise ValueError("Either NIGHT or LAST_NIGHT required in yaml " - + f"file {production_yaml}") - - if first_night is None: - if 'FIRST_NIGHT' in conf: - first_night = int(conf['FIRST_NIGHT']) - log.info(f"Setting first_night to FIRST_NIGHT: {first_night}") - else: - log.info("Setting first_night to earliest in a normal prod: 20201214") - first_night = 20201214 - - if all_nights is None: - # all_nights = get_all_nights(first_night, last_night) - log.info("Populating all_nights with all of the nights with valid science " - + f"exposures between {first_night} and {last_night} inclusive") - all_nights = get_all_valid_nights(first_night, last_night) + all_nights = get_nights_to_process(production_yaml=conf, verbose=True) ## Load the other parameters for running desi_proc_night if 'THRU_NIGHT' in conf: thru_night = int(conf['THRU_NIGHT']) log.info(f"Setting thru_night to THRU_NIGHT: {thru_night}") else: - thru_night = last_night - log.warning(f"Setting thru_night to last_night: {thru_night}") + thru_night = np.max(all_nights) + log.warning(f"Setting thru_night to last night: {thru_night}") ## If not specified, run "cumulative" redshifts, otherwise do ## as directed @@ -207,9 +241,10 @@ def submit_production(production_yaml, dry_run_level=False): for night in sorted(all_nights): num_in_queue = check_queue_count(user=user, include_scron=False, dry_run_level=dry_run_level) - ## In Jura the largest night had 115 jobs, to be conservative say 200 - if num_in_queue > 4800: - log.info(f"{num_in_queue} jobs in the queue, so stopping the job submissions.") + ## In Jura the largest night had 115 jobs, to be conservative say 200 by default + if num_in_queue > queue_threshold: + log.info(f"{num_in_queue} jobs in the queue > {queue_threshold}," + + " so stopping the job submissions.") break if os.path.exists(findfile('proctable', night=night, readonly=True)): skipped_nights.append(night) diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py index 21864b5d5..4f4a2d41d 100644 --- a/py/desispec/workflow/queue.py +++ b/py/desispec/workflow/queue.py @@ -539,6 +539,11 @@ def get_jobs_in_queue(user=None, include_scron=False, dry_run_level=0): for col in queue_info_table.colnames: queue_info_table.rename_column(col, col.upper()) + ## If the table is empty, return it immediately, otherwise perform + ## sanity check and cuts + if len(queue_info_table) == 0: + return queue_info_table + if np.any(queue_info_table['USER']!=user): msg = f"Warning {np.sum(queue_info_table['USER']!=user)} " \ + f"jobs returned were not {user=}\n" \ From a2e519038c6a5a22e0ab4d02394e370d829e6164 Mon Sep 17 00:00:00 2001 From: akremin Date: Wed, 14 Aug 2024 17:12:58 -0700 Subject: [PATCH 10/27] standardize columns in proctable caches --- py/desispec/workflow/proctable.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/py/desispec/workflow/proctable.py b/py/desispec/workflow/proctable.py index aef37d1fe..5455ab649 
100644 --- a/py/desispec/workflow/proctable.py +++ b/py/desispec/workflow/proctable.py @@ -402,6 +402,11 @@ def table_row_to_dict(table_row): raise TypeError(f"Received table_row of type {typ}, can't convert to a dictionary. Exiting.") +## This cache is used in initial processing when we need to identify tilenights +## to use +_required_tilenight_ptab_cols = ['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', + 'OBSTYPE', 'JOBDESC', 'INTID', 'LATEST_QID', + 'STATUS'] def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, reset_cache=False, readonly=True): """ @@ -474,8 +479,7 @@ def read_minimal_tilenight_proctab_cols(nights=None, tileids=None, dtype='S36', name=col) else: t[col] = Table.Column(t[col], dtype='S36', name=col) - ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', - 'INTID', 'LATEST_QID']) + ptables.append(t[_required_tilenight_ptab_cols]) if len(ptables) > 0: outtable = vstack(ptables) @@ -529,8 +533,7 @@ def _set_tilenight_ptab_cache(ptab): t = _select_tilenights_from_ptab(ptab) else: t = ptab - _tilenight_ptab_cache = t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', - 'INTID', 'LATEST_QID'] + _tilenight_ptab_cache = t[_required_tilenight_ptab_cols] _tilenight_ptab_cache.sort(['INTID']) @@ -562,6 +565,9 @@ def update_tilenight_ptab_cache(ptab): _tilenight_ptab_cache.sort(['INTID']) +## This cache is used in reprocessing where calibration jobs are also required +## for now need the same columns but different rows +_required_full_ptab_cols = _required_tilenight_ptab_cols def read_minimal_full_proctab_cols(nights=None, tileids=None, reset_cache=False, readonly=True): """ @@ -635,8 +641,7 @@ def read_minimal_full_proctab_cols(nights=None, tileids=None, dtype='S36', name=col) else: t[col] = Table.Column(t[col], dtype='S36', name=col) - ptables.append(t['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', 'OBSTYPE', - 'JOBDESC', 'INTID', 'LATEST_QID', 'STATUS']) + ptables.append(t[_required_full_ptab_cols]) if len(ptables) > 0: outtable = vstack(ptables) @@ -675,8 +680,7 @@ def update_full_ptab_cache(ptab): global _full_ptab_cache log = get_logger() - t = ptab['EXPID', 'TILEID', 'NIGHT', 'PROCCAMWORD', 'OBSTYPE', - 'JOBDESC', 'INTID', 'LATEST_QID', 'STATUS'] + t = ptab[_required_full_ptab_cols] new_nights = np.unique(t['NIGHT']) log.info(f'Replacing all current entries in processing table ' + f'cache for nights {new_nights.data}') From c3444b806358762eddcef6a30ed8527e580483ed Mon Sep 17 00:00:00 2001 From: kremin Date: Wed, 14 Aug 2024 22:36:54 -0700 Subject: [PATCH 11/27] bug fixes to resub and standardize cache cols --- py/desispec/workflow/exptable.py | 4 ++-- py/desispec/workflow/processing.py | 5 +---- py/desispec/workflow/proctable.py | 8 ++++---- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/py/desispec/workflow/exptable.py b/py/desispec/workflow/exptable.py index ad6bf43b8..0edb399ae 100644 --- a/py/desispec/workflow/exptable.py +++ b/py/desispec/workflow/exptable.py @@ -1079,10 +1079,10 @@ def update_science_etab_cache(etab): cleaned_etab = _select_sciences_from_etab(etab) new_nights = np.unique(cleaned_etab['NIGHT']) log.info(f'Removing all current entries in science exposure ' - + f'table row cache for nights {new_nights}') + + f'table row cache for nights {list(new_nights)}') conflicting_entries = np.isin(_science_etab_cache['NIGHT'], new_nights) log.info( - f"Removing {len(conflicting_entries)} rows and adding {len(cleaned_etab)} rows " + f"Removing {np.sum(conflicting_entries)} rows and adding {len(cleaned_etab)} rows " + f"to science exposure table 
row cache.") keep = np.bitwise_not(conflicting_entries) _science_etab_cache = _science_etab_cache[keep] diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py index 0a07fe5e8..5868dd4d1 100644 --- a/py/desispec/workflow/processing.py +++ b/py/desispec/workflow/processing.py @@ -1220,7 +1220,7 @@ def recursive_submit_failed(rown, proc_table, submits, id_to_row_map, ptab_name= if resubmission_states is None: resubmission_states = get_resubmission_states() ideps = proc_table['INT_DEP_IDS'][rown] - if ideps is None: + if ideps is None or len(ideps)==0: proc_table['LATEST_DEP_QID'][rown] = np.ndarray(shape=0).astype(int) else: all_valid_states = list(resubmission_states.copy()) @@ -1229,13 +1229,10 @@ def recursive_submit_failed(rown, proc_table, submits, id_to_row_map, ptab_name= othernight_idep_qid_lookup = {} for idep in np.sort(np.atleast_1d(ideps)): if idep not in id_to_row_map: - log.info(idep // 1000) - log.info(row['INTID'] // 1000) if idep // 1000 != row['INTID'] // 1000: log.info(f"Internal ID: {idep} not in id_to_row_map. " + "This is expected since it's from another day. ") reference_night = 20000000 + (idep // 1000) - log.info(reference_night) reftab = read_minimal_full_proctab_cols(nights=[reference_night]) if reftab is None: msg = f"The dependency is from night={reference_night}" \ diff --git a/py/desispec/workflow/proctable.py b/py/desispec/workflow/proctable.py index 5455ab649..3d995c66f 100644 --- a/py/desispec/workflow/proctable.py +++ b/py/desispec/workflow/proctable.py @@ -549,14 +549,14 @@ def update_tilenight_ptab_cache(ptab): log = get_logger() ## If the cache doesn't exist, don't update it. if _tilenight_ptab_cache is None: - log.debug(f'Science exptab cache does not exist, so not updating') + log.debug(f'Tilenight proctab cache does not exist, so not updating') return cleaned_ptab = _select_tilenights_from_ptab(ptab) new_nights = np.unique(cleaned_ptab['NIGHT']) log.info(f'Removing all current entries in processing table tilenight ' - + f'selection cache for nights {new_nights}') + + f'selection cache for nights {list(new_nights)}') conflicting_entries = np.isin(_tilenight_ptab_cache['NIGHT'], new_nights) - log.info(f"Removing {len(conflicting_entries)} rows and adding " + log.info(f"Removing {np.sum(conflicting_entries)} rows and adding " + f"{len(cleaned_ptab)} rows " + f"to processing table tilenight cache.") keep = np.bitwise_not(conflicting_entries) @@ -683,6 +683,6 @@ def update_full_ptab_cache(ptab): t = ptab[_required_full_ptab_cols] new_nights = np.unique(t['NIGHT']) log.info(f'Replacing all current entries in processing table ' - + f'cache for nights {new_nights.data}') + + f'cache for nights {list(new_nights)}') for night in new_nights: _full_ptab_cache[night] = t[t['NIGHT'] == night] From 2adb960bb8da6912b8d70ca2826484919c210e39 Mon Sep 17 00:00:00 2001 From: kremin Date: Wed, 14 Aug 2024 23:22:00 -0700 Subject: [PATCH 12/27] fix typo in submit_prod and better future support --- bin/desi_submit_prod | 2 +- py/desispec/scripts/submit_prod.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/desi_submit_prod b/bin/desi_submit_prod index 9c3102a92..069a80bcf 100755 --- a/bin/desi_submit_prod +++ b/bin/desi_submit_prod @@ -14,7 +14,7 @@ def parse_args(): # options=None): parser.add_argument("-p", "--production-yaml", type=str, required=True, help="Relative or absolute pathname to the yaml file summarizing the production.") - parser.add_argument("-p", "--queue-threshold", type=int, default=4800, + 
parser.add_argument("-q", "--queue-threshold", type=int, default=4800, help="The number of jobs for the current user in the queue at which the" + " at which the script stops submitting new jobs.") diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index eef83700e..1849a4e84 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -17,7 +17,8 @@ from desispec.scripts.proc_night import proc_night ## Import some helper functions, you can see their definitions by uncomenting the bash shell command from desispec.workflow.utils import verify_variable_with_environment, listpath -from desispec.workflow.redshifts import read_minimal_exptables_columns +# TODO when merged into branch with crossnight dependencies this is in workflow.exptable as the new name +from desispec.workflow.redshifts import read_minimal_exptables_columns as read_minimal_science_exptab_cols from desispec.scripts.submit_night import submit_night from desispec.workflow.queue import check_queue_count @@ -59,8 +60,7 @@ def get_all_valid_nights(first_night, last_night): Returns: nights, list. A list of nights on or after Jan 1 2020 in which data exists at NERSC. """ - # TODO when merged into branch with crossnight dependencies this function changes name - fulletab = read_minimal_exptables_columns() + fulletab = read_minimal_science_exptab_cols() nights = np.unique(fulletab['NIGHT']) nights = nights[((nights>=first_night)&(nights<=last_night))] return nights From d026aa60891bbcfd3a7cf64c60fb29273beea8e3 Mon Sep 17 00:00:00 2001 From: Julien Guy Date: Thu, 15 Aug 2024 08:57:57 -0700 Subject: [PATCH 13/27] adapt to changes in the function desispec.scripts.specex.merge_psf --- bin/desi_merge_psf | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bin/desi_merge_psf b/bin/desi_merge_psf index 643ca849c..c5f6132f4 100755 --- a/bin/desi_merge_psf +++ b/bin/desi_merge_psf @@ -12,6 +12,4 @@ parser.add_argument('-o','--outfile', type = str, default = None, required=True, args = parser.parse_args() -merge_psf(args.infiles,args.outfile) - - +merge_psf(args.infiles[0],args.infiles[1:],args.outfile) From 050caf40a8cf0079dc779de88b3fc5d12cc8cd64 Mon Sep 17 00:00:00 2001 From: Julien Guy Date: Thu, 15 Aug 2024 09:56:17 -0700 Subject: [PATCH 14/27] add option --dont-merge-with-psf-input to desi_proc and --dont-merge-with-input to desi_compute_psf --- py/desispec/scripts/proc.py | 3 +++ py/desispec/scripts/specex.py | 9 ++++++++- py/desispec/workflow/desi_proc_funcs.py | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/py/desispec/scripts/proc.py b/py/desispec/scripts/proc.py index b3b7fad30..795d6ac11 100644 --- a/py/desispec/scripts/proc.py +++ b/py/desispec/scripts/proc.py @@ -645,6 +645,9 @@ def main(args=None, comm=None): cmd += ' --input-psf {}'.format(inpsf) cmd += ' --output-psf {}'.format(outpsf) + if args.dont_merge_with_psf_input : + cmd += ' --dont-merge-with-input' + # fibers to ignore for the PSF fit # specex uses the fiber index in a camera fibers_to_ignore = badfibers([hdr, camhdr[camera]],["BROKENFIBERS","BADCOLUMNFIBERS"])%500 diff --git a/py/desispec/scripts/specex.py b/py/desispec/scripts/specex.py index 282cddc73..e8c45ff07 100644 --- a/py/desispec/scripts/specex.py +++ b/py/desispec/scripts/specex.py @@ -50,6 +50,9 @@ def parse(options=None): help="comma separated list of broken fibers") parser.add_argument("--disable-merge", action = 'store_true', help="disable merging fiber bundles") + 
parser.add_argument("--dont-merge-with-input", action = 'store_true', + help="dont use the input PSF as default when merging bundles") + args = parser.parse_args(options) @@ -271,7 +274,11 @@ def main(args=None, comm=None): time.sleep(5.) try: - merge_psf(inpsffile, bundlefiles, outfits) + if args.dont_merge_with_input : + log.info("Do not include input PSF when merging bundles") + merge_psf(bundlefiles[0], bundlefiles[1:], outfits) + else : + merge_psf(inpsffile, bundlefiles, outfits) except Exception as e: log.error(e) log.error("merging failed for {}".format(outfits)) diff --git a/py/desispec/workflow/desi_proc_funcs.py b/py/desispec/workflow/desi_proc_funcs.py index 41989633a..029d45949 100755 --- a/py/desispec/workflow/desi_proc_funcs.py +++ b/py/desispec/workflow/desi_proc_funcs.py @@ -89,6 +89,7 @@ def get_shared_desi_proc_parser(): parser.add_argument("--extract-subcomm-size", type=int, default=None, help="Size to use for GPU extract subcomm") parser.add_argument("--no-gpu", action="store_true", help="Do not use GPU for extractions even if available") parser.add_argument("--use-specter", action="store_true", help="Use classic specter instead of gpu_specter") + parser.add_argument("--dont-merge-with-psf-input", action="store_true", help="Do not merge with PSF input") parser.add_argument("--mpistdstars", action="store_true", help="Use MPI parallelism in stdstar fitting instead of multiprocessing") parser.add_argument("--no-skygradpca", action="store_true", help="Do not fit sky gradient") parser.add_argument("--no-tpcorrparam", action="store_true", help="Do not apply tpcorrparam spatial model or fit tpcorrparam pca terms") From 301d390e92367386c9c7f4661dd60fe5ab5ec2e9 Mon Sep 17 00:00:00 2001 From: akremin Date: Thu, 15 Aug 2024 18:04:04 -0700 Subject: [PATCH 15/27] dont exit on submission failure, just mark unsubmitted --- py/desispec/workflow/processing.py | 38 +++++++++++++++++++----------- py/desispec/workflow/proctable.py | 15 ++++++++---- py/desispec/workflow/queue.py | 17 ++++++++++--- 3 files changed, 48 insertions(+), 22 deletions(-) diff --git a/py/desispec/workflow/processing.py b/py/desispec/workflow/processing.py index 5868dd4d1..c566054db 100644 --- a/py/desispec/workflow/processing.py +++ b/py/desispec/workflow/processing.py @@ -33,7 +33,7 @@ from desispec.workflow.tableio import write_table, load_table from desispec.workflow.proctable import table_row_to_dict, erow_to_prow, \ read_minimal_tilenight_proctab_cols, read_minimal_full_proctab_cols, \ - update_full_ptab_cache + update_full_ptab_cache, default_prow, get_default_qid from desiutil.log import get_logger from desispec.io import findfile, specprod_root @@ -690,7 +690,7 @@ def submit_batch_script(prow, dry_run=0, reservation=None, strictly_successful=F batch_params.append(f'--reservation={reservation}') batch_params.append(f'{script_path}') - + submitted = True if dry_run: current_qid = _get_fake_qid() else: @@ -709,21 +709,31 @@ def submit_batch_script(prow, dry_run=0, reservation=None, strictly_successful=F log.info('Sleeping 60 seconds then retrying') time.sleep(60) else: #- for/else happens if loop doesn't succeed - msg = f'{jobname} submission failed {max_attempts} times; exiting' - log.critical(msg) - raise RuntimeError(msg) + msg = f'{jobname} submission failed {max_attempts} times.' 
\ + + ' setting as unsubmitted and moving on' + log.error(msg) + current_qid = get_default_qid() + submitted = False log.info(batch_params) - log.info(f'Submitted {jobname} with dependencies {dep_str} and reservation={reservation}. Returned qid: {current_qid}') ## Update prow with new information prow['LATEST_QID'] = current_qid - prow['ALL_QIDS'] = np.append(prow['ALL_QIDS'],current_qid) - prow['STATUS'] = 'SUBMITTED' - prow['SUBMIT_DATE'] = int(time.time()) - ## Update the Slurm jobid cache of job states - update_queue_state_cache(qid=prow['LATEST_QID'], state=prow['STATUS']) + ## If we didn't submit, don't say we did and don't add to ALL_QIDS + if submitted: + log.info(f'Submitted {jobname} with dependencies {dep_str} and ' + + f'reservation={reservation}. Returned qid: {current_qid}') + + ## Update prow with new information + prow['ALL_QIDS'] = np.append(prow['ALL_QIDS'],current_qid) + prow['STATUS'] = 'SUBMITTED' + prow['SUBMIT_DATE'] = int(time.time()) + else: + prow['STATUS'] = 'UNSUBMITTED' + + ## Update the Slurm jobid cache of job states + update_queue_state_cache(qid=prow['LATEST_QID'], state=prow['STATUS']) return prow @@ -1784,7 +1794,7 @@ def make_joint_prow(prows, descriptor, internal_id): joint_prow['LATEST_QID'] = -99 joint_prow['ALL_QIDS'] = np.ndarray(shape=0).astype(int) joint_prow['SUBMIT_DATE'] = -99 - joint_prow['STATUS'] = 'U' + joint_prow['STATUS'] = 'UNSUBMITTED' joint_prow['SCRIPTNAME'] = '' joint_prow['EXPID'] = np.unique(np.concatenate([currow['EXPID'] for currow in prows])).astype(int) @@ -1861,7 +1871,7 @@ def make_tnight_prow(prows, calibjobs, internal_id): joint_prow['LATEST_QID'] = -99 joint_prow['ALL_QIDS'] = np.ndarray(shape=0).astype(int) joint_prow['SUBMIT_DATE'] = -99 - joint_prow['STATUS'] = 'U' + joint_prow['STATUS'] = 'UNSUBMITTED' joint_prow['SCRIPTNAME'] = '' joint_prow['EXPID'] = np.array([currow['EXPID'][0] for currow in prows], dtype=int) @@ -1892,7 +1902,7 @@ def make_redshift_prow(prows, tnights, descriptor, internal_id): redshift_prow['LATEST_QID'] = -99 redshift_prow['ALL_QIDS'] = np.ndarray(shape=0).astype(int) redshift_prow['SUBMIT_DATE'] = -99 - redshift_prow['STATUS'] = 'U' + redshift_prow['STATUS'] = 'UNSUBMITTED' redshift_prow['SCRIPTNAME'] = '' redshift_prow['EXPID'] = np.array([currow['EXPID'][0] for currow in prows], dtype=int) diff --git a/py/desispec/workflow/proctable.py b/py/desispec/workflow/proctable.py index 3d995c66f..4b37253d4 100644 --- a/py/desispec/workflow/proctable.py +++ b/py/desispec/workflow/proctable.py @@ -77,7 +77,7 @@ def get_processing_table_column_defs(return_default_values=False, """ ## Define the column names for the internal production table and their respective datatypes, split in two ## only for readability's sake - + defqid = get_default_qid() colnames1 = ['EXPID' , 'OBSTYPE', 'TILEID', 'NIGHT' ] coltypes1 = [np.ndarray , 'S10' , int , int ] coldeflt1 = [np.ndarray(shape=0).astype(int), 'unknown', -99 , 20000101] @@ -88,11 +88,11 @@ def get_processing_table_column_defs(return_default_values=False, colnames2 = [ 'PROCCAMWORD' ,'CALIBRATOR', 'INTID', 'OBSDESC', 'JOBDESC', 'LATEST_QID'] coltypes2 = [ 'S40' , np.int8 , int , 'S16' , 'S12' , int ] - coldeflt2 = [ 'a0123456789' , 0 , -99 , '' , 'unknown', -99 ] + coldeflt2 = [ 'a0123456789' , 0 , -99 , '' , 'unknown', defqid ] - colnames2 += [ 'SUBMIT_DATE', 'STATUS', 'SCRIPTNAME'] - coltypes2 += [ int , 'S14' , 'S40' ] - coldeflt2 += [ -99 , 'U' , '' ] + colnames2 += [ 'SUBMIT_DATE', 'STATUS' , 'SCRIPTNAME'] + coltypes2 += [ int , 'S14' , 'S40' ] + 
coldeflt2 += [ -99 , 'UNSUBMITTED', '' ] colnames2 += ['INT_DEP_IDS' , 'LATEST_DEP_QID' , 'ALL_QIDS' ] coltypes2 += [np.ndarray , np.ndarray , np.ndarray ] @@ -111,6 +111,11 @@ def get_processing_table_column_defs(return_default_values=False, return colnames, coldtypes, coldeflts else: return colnames, coldtypes +def get_default_qid(): + """ + Returns the default slurm job id (QID) for the pipeline + """ + return 1 #99999999 def default_obstypes_for_proctable(): """ diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py index ccff9c704..0d6c04320 100644 --- a/py/desispec/workflow/queue.py +++ b/py/desispec/workflow/queue.py @@ -7,6 +7,8 @@ import numpy as np from astropy.table import Table, vstack import subprocess + +from desispec.workflow.proctable import get_default_qid from desiutil.log import get_logger import time, datetime @@ -39,7 +41,11 @@ def get_resubmission_states(): Returns: list. A list of strings outlining the job states that should be resubmitted. """ - return ['UNSUBMITTED', 'BOOT_FAIL', 'DEADLINE', 'NODE_FAIL', 'OUT_OF_MEMORY', 'PREEMPTED', 'TIMEOUT', 'CANCELLED'] + ## 'UNSUBMITTED' is default pipeline state for things not yet submitted + ## 'DEP_NOT_SUBD' is set when resubmission can't proceed because a + ## dependency has failed + return ['UNSUBMITTED', 'DEP_NOT_SUBD', 'BOOT_FAIL', 'DEADLINE', 'NODE_FAIL', + 'OUT_OF_MEMORY', 'PREEMPTED', 'TIMEOUT', 'CANCELLED'] def get_termination_states(): @@ -301,6 +307,7 @@ def get_queue_states_from_qids(qids, dry_run=0, use_cache=False): Dict Dictionary with the keys as jobids and values as the slurm state of the job. """ + def_qid = get_default_qid() global _cached_slurm_states qids = np.atleast_1d(qids).astype(int) log = get_logger() @@ -317,7 +324,8 @@ def get_queue_states_from_qids(qids, dry_run=0, use_cache=False): if dry_run > 2 or dry_run < 1: outtable = queue_info_from_qids(qids, columns='jobid,state', dry_run=dry_run) for row in outtable: - outdict[int(row['JOBID'])] = row['STATE'] + if int(row['JOBID']) != def_qid: + outdict[int(row['JOBID'])] = row['STATE'] return outdict def update_queue_state_cache_from_table(queue_info_table): @@ -357,7 +365,8 @@ def update_queue_state_cache(qid, state): """ global _cached_slurm_states - _cached_slurm_states[int(qid)] = state + if int(qid) != get_default_qid(): + _cached_slurm_states[int(qid)] = state def clear_queue_state_cache(): """ @@ -407,6 +416,8 @@ def update_from_queue(ptable, qtable=None, dry_run=0, ignore_scriptnames=False): log.info("Will be verifying that the file names are consistent") for row in qtable: + if int(row['JOBID']) == get_default_qid(): + continue match = (int(row['JOBID']) == ptable['LATEST_QID']) if np.any(match): ind = np.where(match)[0][0] From f3f6ba010ce5c8a597beaa01aa7e89d7091afcce Mon Sep 17 00:00:00 2001 From: akremin Date: Thu, 15 Aug 2024 18:19:20 -0700 Subject: [PATCH 16/27] fix unit test to use a non-default QID --- py/desispec/test/test_workflow_queue.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/desispec/test/test_workflow_queue.py b/py/desispec/test/test_workflow_queue.py index cea9447b8..ce6a0733e 100644 --- a/py/desispec/test/test_workflow_queue.py +++ b/py/desispec/test/test_workflow_queue.py @@ -19,14 +19,14 @@ def setUp(self): def test_queue_info_from_qids(self): """Test queue_info_from_qids""" - qids = [1,10,2,5] + qids = [11,10,2,5] qinfo = queue.queue_info_from_qids(qids, dry_run=3) self.assertEqual(list(qinfo['JOBID']), qids) def test_queue_state_cache(self): """Test queue state cache""" 
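+        # (illustrative note: qid 1 is assumed to be the pipeline's reserved
+        #  default placeholder QID -- see get_default_qid() -- so these tests
+        #  use 11 instead of 1 to avoid colliding with it)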
# Query qids to get state into cache - qids = [1,10,2,5] + qids = [11,10,2,5] qinfo = queue.queue_info_from_qids(qids, dry_run=3) # check cache matches state From 491837c9aaac820300341ceca65d8944aa5ae078 Mon Sep 17 00:00:00 2001 From: akremin Date: Thu, 15 Aug 2024 18:20:19 -0700 Subject: [PATCH 17/27] fix unit test to use a non-default QID --- py/desispec/test/test_workflow_queue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/py/desispec/test/test_workflow_queue.py b/py/desispec/test/test_workflow_queue.py index ce6a0733e..49924533d 100644 --- a/py/desispec/test/test_workflow_queue.py +++ b/py/desispec/test/test_workflow_queue.py @@ -47,7 +47,7 @@ def test_queue_state_cache(self): queue.update_queue_state_cache(10, 'COMPLETED') qstates = queue.get_queue_states_from_qids(qids, use_cache=True, dry_run=3) # should be ['PENDING', 'COMPLETED', 'FAILED', 'FAILED'] - self.assertEqual(qstates[1], 'PENDING') + self.assertEqual(qstates[11], 'PENDING') self.assertEqual(qstates[10], 'COMPLETED') self.assertEqual(qstates[2], 'FAILED') self.assertEqual(qstates[5], 'FAILED') From 6bae2aea9fda3befc6b585a854656784e3e8c586 Mon Sep 17 00:00:00 2001 From: akremin Date: Fri, 16 Aug 2024 01:13:42 -0500 Subject: [PATCH 18/27] submit_prod threshold to 4500 and remove Slurm env vars --- bin/desi_submit_prod | 4 ++-- py/desispec/scripts/submit_prod.py | 14 +++++++++++--- py/desispec/workflow/queue.py | 5 +++++ py/desispec/workflow/utils.py | 11 +++++++++++ 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/bin/desi_submit_prod b/bin/desi_submit_prod index 069a80bcf..455f1c9f4 100755 --- a/bin/desi_submit_prod +++ b/bin/desi_submit_prod @@ -1,5 +1,5 @@ #!/usr/bin/env python -# coding: utf-8 +# -*- coding: utf-8 -*- import argparse @@ -14,7 +14,7 @@ def parse_args(): # options=None): parser.add_argument("-p", "--production-yaml", type=str, required=True, help="Relative or absolute pathname to the yaml file summarizing the production.") - parser.add_argument("-q", "--queue-threshold", type=int, default=4800, + parser.add_argument("-q", "--queue-threshold", type=int, default=4500, help="The number of jobs for the current user in the queue at which the" + " at which the script stops submitting new jobs.") diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index 1849a4e84..5aabbabaa 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -16,8 +16,10 @@ from desispec.io import findfile from desispec.scripts.proc_night import proc_night ## Import some helper functions, you can see their definitions by uncomenting the bash shell command -from desispec.workflow.utils import verify_variable_with_environment, listpath +from desispec.workflow.utils import verify_variable_with_environment, listpath, \ + remove_slurm_environment_variables # TODO when merged into branch with crossnight dependencies this is in workflow.exptable as the new name +# from desispec.workflow.exptable import read_minimal_science_exptab_cols from desispec.workflow.redshifts import read_minimal_exptables_columns as read_minimal_science_exptab_cols from desispec.scripts.submit_night import submit_night from desispec.workflow.queue import check_queue_count @@ -130,7 +132,7 @@ def get_nights_to_process(production_yaml, verbose=False): return sorted(all_nights) -def submit_production(production_yaml, queue_threshold=4800, dry_run_level=False): +def submit_production(production_yaml, queue_threshold=4500, dry_run_level=False): """ Interprets a 
production_yaml file and submits the respective nights for processing within the defined production. @@ -152,17 +154,23 @@ def submit_production(production_yaml, queue_threshold=4800, dry_run_level=False with open(production_yaml, 'rb') as yamlfile: conf = yaml.safe_load(yamlfile) + ## Unset Slurm environment variables set when running in scrontab + remove_slurm_environment_variables() + ## Make sure the specprod matches, if not set it to that in the file if 'SPECPROD' not in conf: raise ValueError(f"SPECPROD required in yaml file {production_yaml}") specprod = str(conf['SPECPROD']).lower() specprod = verify_variable_with_environment(var=specprod, var_name='specprod', env_name='SPECPROD') + + ## Define the user user = os.environ['USER'] ## Look for sentinal sentinel_file = os.path.join(os.environ['DESI_SPECTRO_REDUX'], - os.environ['SPECPROD'], 'run', 'sentinel.txt') + os.environ['SPECPROD'], 'run', + 'prod_submission_complete.txt') if os.path.exists(sentinel_file): log.info(f"Sentinel file {sentinel_file} exists, therefore all " + f"nights already submitted.") diff --git a/py/desispec/workflow/queue.py b/py/desispec/workflow/queue.py index 4f4a2d41d..5d3fb15a8 100644 --- a/py/desispec/workflow/queue.py +++ b/py/desispec/workflow/queue.py @@ -478,6 +478,11 @@ def get_jobs_in_queue(user=None, include_scron=False, dry_run_level=0): NODELIST(REASON) for the specified user. """ log = get_logger() + if user is None: + if 'USER' in os.environ: + user = os.environ['USER'] + else: + user = 'desi' cmd = f'squeue -u {user} -o "%i,%P,%j,%u,%t,%M,%D,%R"' cmd_as_list = cmd.split() diff --git a/py/desispec/workflow/utils.py b/py/desispec/workflow/utils.py index 2c2460336..a693c72e5 100644 --- a/py/desispec/workflow/utils.py +++ b/py/desispec/workflow/utils.py @@ -282,3 +282,14 @@ def sleep_and_report(sleep_duration=0.1, message_suffix="", logfunc=None, dry_ru time.sleep(sleep_duration) logfunc(f"Resuming...") logfunc("\n\n") + +def remove_slurm_environment_variables(): + """ + Removes SLURM_MEM_PER_CPU and SLURM_OPEN_MODE from os.environ if present + """ + log = get_logger() + for var in ['SLURM_MEM_PER_CPU', 'SLURM_OPEN_MODE']: + if var in os.environ: + log.info(f"Removing Slurm variable {var} from the environment" + + " before running.") + del os.environ[var] \ No newline at end of file From 94871bb69cd898cfca8a79c91c0670e445ad356f Mon Sep 17 00:00:00 2001 From: Stephen Bailey Date: Fri, 16 Aug 2024 10:19:31 -0700 Subject: [PATCH 19/27] FIBERSTATUS VARIABLETHRU bad for sky and stdstars --- py/desispec/fiberbitmasking.py | 36 ++++++++++++++++++++-- py/desispec/test/test_fiberbitmask.py | 43 +++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 3 deletions(-) diff --git a/py/desispec/fiberbitmasking.py b/py/desispec/fiberbitmasking.py index e6c99cd34..44c6987c7 100644 --- a/py/desispec/fiberbitmasking.py +++ b/py/desispec/fiberbitmasking.py @@ -110,7 +110,7 @@ def get_fiberbitmasked_frame_arrays(frame,bitmask=None,ivar_framemask=True,retur def get_fiberbitmask_comparison_value(kind,band): """Takes a string argument and returns a 32-bit integer representing the logical OR of all - relevant fibermask bits for that given reduction step + fatally bad fibermask bits for that given reduction step Args: kind: str : string designating which combination of bits to use based on the operation. @@ -122,6 +122,8 @@ def get_fiberbitmask_comparison_value(kind,band): Returns: bitmask : 32 bit bitmask corresponding to the fiberbitmask of the desired kind in the desired cameras (bands). 
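+
+        A usage sketch (illustrative; assumes a fibermap table with a
+        FIBERSTATUS column)::
+
+            bad = (fibermap['FIBERSTATUS'] & get_fiberbitmask_comparison_value('sky', 'b')) != 0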
+ + if FIBERSTATUS & bitmask != 0, then that fiber should not be used """ if kind.lower() == 'all': return get_all_fiberbitmask_with_amp(band) @@ -141,17 +143,33 @@ def get_fiberbitmask_comparison_value(kind,band): def get_skysub_fiberbitmask_val(band): - return get_all_fiberbitmask_with_amp(band) + """ + Return mask of bad FIBERSTATUS bits for selecting sky fibers, + i.e. fibers with these bits set should not be used for the sky model + """ + return get_all_fiberbitmask_with_amp(band) | fmsk.VARIABLETHRU def get_flat_fiberbitmask_val(band): + """ + Return mask of bad FIBERSTATUS bits for fiberflats + i.e. fibers with these bits set have a bad fiberflat and cannot be used + """ return (fmsk.BROKENFIBER | fmsk.BADFIBER | fmsk.BADTRACE | fmsk.BADARC | \ fmsk.MANYBADCOL | fmsk.MANYREJECTED ) def get_fluxcalib_fiberbitmask_val(band): + """ + Return mask of bad FIBERSTATUS bits that should trigger flux=ivar=0 + instead of flux calibrating the spectra. + """ return get_all_fiberbitmask_with_amp(band) def get_stdstars_fiberbitmask_val(band): - return get_all_fiberbitmask_with_amp(band) | fmsk.POORPOSITION + """ + Return mask of bad FIBERSTATUS bits for selecting standard stars, + i.e. fibers with these bits set should not be used as standard stars + """ + return get_all_fiberbitmask_with_amp(band) | fmsk.POORPOSITION | fmsk.VARIABLETHRU def get_all_nonamp_fiberbitmask_val(): """Return a mask for all fatally bad FIBERSTATUS bits except BADAMPB/R/Z @@ -161,6 +179,8 @@ def get_all_nonamp_fiberbitmask_val(): be on a valid sky location, or even a target for RESTRICTED. Also does not include POORPOSITION which is bad for stdstars but not necessarily fatal for otherwise processing a normal fiber. + NEARCHARGETRAP and VARIABLETHRU are also not included since + they are ok for some types of processing but not others. """ return (fmsk.BROKENFIBER | fmsk.MISSINGPOSITION | \ fmsk.BADPOSITION | \ @@ -168,9 +188,16 @@ def get_all_nonamp_fiberbitmask_val(): fmsk.MANYBADCOL | fmsk.MANYREJECTED ) def get_justamps_fiberbitmask(): + """ + Return a mask of the amp-specific FIBERSTATUS bits + """ return ( fmsk.BADAMPB | fmsk.BADAMPR | fmsk.BADAMPZ ) def get_all_fiberbitmask_with_amp(band): + """ + Return all fatally bad FIBERSTATUS bits including the amp-specific + bit for this band + """ amp_mask = get_all_nonamp_fiberbitmask_val() if band.lower().find('b')>=0: amp_mask |= fmsk.BADAMPB @@ -181,4 +208,7 @@ def get_all_fiberbitmask_with_amp(band): return amp_mask def get_all_fiberbitmask_val(): + """ + Return a mask of all fatally bad FIBERSTATUS bits + """ return ( get_all_nonamp_fiberbitmask_val() | get_justamps_fiberbitmask() ) diff --git a/py/desispec/test/test_fiberbitmask.py b/py/desispec/test/test_fiberbitmask.py index a3b7a91d4..c0cede27d 100644 --- a/py/desispec/test/test_fiberbitmask.py +++ b/py/desispec/test/test_fiberbitmask.py @@ -11,6 +11,8 @@ from desispec.test.util import get_frame_data from desispec.fiberbitmasking import get_fiberbitmasked_frame_arrays +from desispec.fiberbitmasking import get_fiberbitmask_comparison_value +from desispec.maskbits import fibermask class TestFrameBitMask(unittest.TestCase): @@ -38,3 +40,44 @@ def test_framebitmask(self): ivar2 = get_fiberbitmasked_frame_arrays(self.frame, bitmask=1) self.assertTrue( np.all(ivar1 == ivar2) ) + + def check_mask(self, bitname, ok_steps, bad_steps): + """ + Check get_fiberbitmask_comparison_value(step, 'b') for every step. 
+        FIBERSTATUS fibermask.mask(bitname) should be set for every step
+        in bad_steps and not set for every step in ok_steps.
+        """
+        for step in ok_steps:
+            mask = get_fiberbitmask_comparison_value(step, 'b')
+            self.assertTrue(mask & fibermask.mask(bitname) == 0, f"{step=} unnecessarily excludes {bitname}")
+
+        for step in bad_steps:
+            mask = get_fiberbitmask_comparison_value(step, 'b')
+            self.assertTrue(mask & fibermask.mask(bitname) != 0, f"{step=} should exclude {bitname} but doesn't")
+
+    def test_ambiguous_maskbits(self):
+        """Test cases that are bad for some steps but not for others
+        """
+
+        # NOTE: fiberbitmask doesn't currently support arc
+
+        #- BROKENFIBER is bad for everything
+        self.check_mask('BROKENFIBER', ok_steps=[], bad_steps=['flat', 'sky', 'stdstar', 'fluxcalib'])
+
+        #- RESTRICTED is ok for everything
+        self.check_mask('RESTRICTED', ok_steps=['flat', 'sky', 'stdstar', 'fluxcalib'], bad_steps=[])
+
+        #- BADPOSITION is ok for flats, but bad for others
+        self.check_mask('BADPOSITION', ok_steps=['flat',], bad_steps=['sky', 'stdstar', 'fluxcalib'])
+
+        #- POORPOSITION is ok for flats, sky, and fluxcalib; but bad for stdstars
+        self.check_mask('POORPOSITION', ok_steps=['flat', 'sky', 'fluxcalib'], bad_steps=['stdstar'])
+
+        #- NEARCHARGETRAP is informative; treated as ok for every step including sky
+        #- TODO: it's actually bad for faint targets and sky for a single amp, but we structurally
+        #- don't have a way to encode that in FIBERSTATUS (fiber not CCD or amp)
+        self.check_mask('NEARCHARGETRAP', ok_steps=['flat', 'sky', 'stdstar', 'fluxcalib'], bad_steps=[])
+
+        #- VARIABLETHRU is ok for flats because otherwise we'd block the entire fiber,
+        #- and ok to at least attempt to flux calibrate it, but it shouldn't be used for sky or stdstars
+        self.check_mask('VARIABLETHRU', ok_steps=['flat', 'fluxcalib'], bad_steps=['sky', 'stdstar'])

From 3870655ca4fd97085c8aad7fb1b1171eccaf2788 Mon Sep 17 00:00:00 2001
From: Stephen Bailey
Date: Fri, 16 Aug 2024 11:46:32 -0700
Subject: [PATCH 20/27] update comment about caches

---
 py/desispec/scripts/proc_night.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/py/desispec/scripts/proc_night.py b/py/desispec/scripts/proc_night.py
index 9a54ca3f3..f5191b8a5 100644
--- a/py/desispec/scripts/proc_night.py
+++ b/py/desispec/scripts/proc_night.py
@@ -337,8 +337,9 @@ def proc_night(night=None, proc_obstypes=None, z_submit_types=None,
     etable, ptable = load_tables(tablenames=table_pathnames, tabletypes=table_types)
     full_etable = etable.copy()
 
-    ## Pre-populate exposure table and processing table caches of all nights
-    ## if doing cross-night redshifts
+    ## For I/O efficiency, pre-populate exposure table and processing table caches
+    ## of all nights if doing cross-night redshifts so that future per-night "reads"
+    ## will use the cache.
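+    ## (e.g. a later read_minimal_tilenight_proctab_cols(nights=[night]) call
+    ## should then be served from the in-memory cache instead of re-reading
+    ## the per-night tables from disk, unless something resets the cache)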
if z_submit_types is not None and 'cumulative' in z_submit_types: ## this shouldn't need to change since we've already updated the exptab read_minimal_science_exptab_cols() From 37d8249e22515c6436cc781893bfb47e0290018d Mon Sep 17 00:00:00 2001 From: Stephen Bailey Date: Fri, 16 Aug 2024 13:10:25 -0700 Subject: [PATCH 21/27] special case jobgraph UNSUBMITTED qid=1 --- bin/desi_job_graph | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bin/desi_job_graph b/bin/desi_job_graph index 9145416b2..d35f57b23 100755 --- a/bin/desi_job_graph +++ b/bin/desi_job_graph @@ -83,6 +83,7 @@ fx.write(f""" classDef OUT_OF_MEMORY fill:#d95f0e; classDef TIMEOUT fill:#d95f0e; classDef CANCELLED fill:#fed98e; + classDef NOTSUBMITTED fill:#fcae1e; classDef UNKNOWN fill:#ffffcc; """) @@ -106,6 +107,8 @@ for row in proctable: if qid in jobinfo: state = jobinfo[qid]['STATE'].split()[0] + elif qid == 1: + state = 'NOTSUBMITTED' else: state = 'UNKNOWN' From 81aba3255442eab0a03b8b904479959f6a3f2e64 Mon Sep 17 00:00:00 2001 From: Stephen Bailey Date: Fri, 16 Aug 2024 13:41:01 -0700 Subject: [PATCH 22/27] exptable/proctable updates after PR #2321 --- py/desispec/scripts/submit_prod.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index 5aabbabaa..ccca7b422 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -18,12 +18,10 @@ ## Import some helper functions, you can see their definitions by uncomenting the bash shell command from desispec.workflow.utils import verify_variable_with_environment, listpath, \ remove_slurm_environment_variables -# TODO when merged into branch with crossnight dependencies this is in workflow.exptable as the new name -# from desispec.workflow.exptable import read_minimal_science_exptab_cols -from desispec.workflow.redshifts import read_minimal_exptables_columns as read_minimal_science_exptab_cols +from desispec.workflow.exptable import read_minimal_science_exptab_cols from desispec.scripts.submit_night import submit_night from desispec.workflow.queue import check_queue_count - +import desispec.workflow.proctable def get_nights_in_date_range(first_night, last_night): """ @@ -261,8 +259,11 @@ def submit_production(production_yaml, queue_threshold=4500, dry_run_level=False ## We don't expect exposure tables to change during code execution here ## but we do expect processing tables to evolve, so clear that cache log.info(f"Processing {night=}") - # TODO uncomment when merged into branch with crossnight dependencies - #desispec.workflow.proctable.reset_tilenight_ptab_cache() + + ## Belt-and-suspenders: reset the processing table cache to force a re-read. + ## This shouldn't be necessary, but resetting the cache is conservative. 
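+        ## (illustrative note: after this reset, the next
+        ## read_minimal_tilenight_proctab_cols() call re-reads the proctables
+        ## from disk and rebuilds the cache)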
+ desispec.workflow.proctable.reset_tilenight_ptab_cache() + if dry_run_level < 4: logfile = os.path.join(logpath, f'night-{night}.log') with stdouterr_redirected(logfile): From 19aa9ab345e33735e32a7ee25fcb617900ec1d3f Mon Sep 17 00:00:00 2001 From: Stephen Bailey Date: Fri, 16 Aug 2024 13:52:26 -0700 Subject: [PATCH 23/27] carriage returns in production sentinel file --- py/desispec/scripts/submit_prod.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/py/desispec/scripts/submit_prod.py b/py/desispec/scripts/submit_prod.py index ccca7b422..d5be16673 100644 --- a/py/desispec/scripts/submit_prod.py +++ b/py/desispec/scripts/submit_prod.py @@ -304,8 +304,8 @@ def submit_production(production_yaml, queue_threshold=4500, dry_run_level=False if dry_run_level < 4: with open(sentinel_file, 'w') as sentinel: sentinel.write( - f"All done with processing for {production_yaml}") - sentinel.write(f"Nights processed: {all_nights}") + f"All done with processing for {production_yaml}\n") + sentinel.write(f"Nights processed: {all_nights}\n") else: log.info(f"{dry_run_level=} so not creating {sentinel_file}") From 8368c74fbbdb455edc0d2f4cba2618eb8c17fd37 Mon Sep 17 00:00:00 2001 From: anand_raichoor Date: Fri, 16 Aug 2024 14:11:41 -0700 Subject: [PATCH 24/27] faflavor2program(): update special faflavors list --- py/desispec/io/meta.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/py/desispec/io/meta.py b/py/desispec/io/meta.py index 6b22d6a5c..035b85080 100755 --- a/py/desispec/io/meta.py +++ b/py/desispec/io/meta.py @@ -807,11 +807,36 @@ def faflavor2program(faflavor): dark |= faflavor == 'sv1elgqso' dark |= faflavor == 'sv1lrgqso' dark |= faflavor == 'sv1lrgqso2' + dark |= np.in1d( + faflavor, + np.char.add( + "special", + [ + 'm31', 'odin', 'tertiary1', 'tertiary2', 'tertiary4', 'tertiary5', + 'tertiary7', 'tertiary9', 'tertiary11', 'tertiary14', 'tertiary15', + 'tertiary16', 'tertiary17', 'tertiary18', 'tertiary21', 'tertiary23', + 'tertiary25', 'tertiary26', 'tertiary27', 'tertiary31', 'tertiary35', + 'tertiary37', 'tertiary38', 'tertiary40', 'tertiary41', + ] + ) + ) dark |= np.char.endswith(faflavor, 'dark') #- SV1 FAFLAVOR options that map to FAPRGRM='bright' bright = faflavor == 'sv1bgsmws' bright |= (faflavor != 'sv1unwisebluebright') & np.char.endswith(faflavor, 'bright') + bright |= np.in1d( + faflavor, + np.char.add( + "special", + [ + 'tertiary3', 'tertiary6', 'tertiary8', 'tertiary10', 'tertiary12', + 'tertiary13', 'tertiary19', 'tertiary20', 'tertiary22', 'tertiary24', + 'tertiary28', 'tertiary29', 'tertiary30', 'tertiary32', 'tertiary33', + 'tertiary34', 'tertiary36', 'tertiary39', + ] + ) + ) #- SV1 FAFLAVOR options that map to FAPRGRM='backup' backup = faflavor == 'sv1backup1' From 72283eb09a3261ea142419d723b2f1d538531624 Mon Sep 17 00:00:00 2001 From: anand_raichoor Date: Fri, 16 Aug 2024 14:12:51 -0700 Subject: [PATCH 25/27] faflavor2program(): add note in docstr --- py/desispec/io/meta.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/py/desispec/io/meta.py b/py/desispec/io/meta.py index 035b85080..317cf26bc 100755 --- a/py/desispec/io/meta.py +++ b/py/desispec/io/meta.py @@ -790,7 +790,10 @@ def faflavor2program(faflavor): faprgm (str or array of str): what FAPRGM would be if we had set it (dark, bright, backup, other) - Note: this was standardized by sv3 and main, but evolved during sv1 and sv2 + Note: this was standardized by sv3 and main, but evolved during sv1 and sv2. 
+    for the survey=special tiles (m31, odin, and tertiary), the info
+    can be retrieved from the GOALTYPE keyword in the zero-th extension
+    of the fiberassign file.
     """
     #- Handle scalar or array input, upcasting bytes to str as needed
     scalar_input = np.isscalar(faflavor)

From 9527351806bb3e952b7b6f5155f11e7c7dad3958 Mon Sep 17 00:00:00 2001
From: Stephen Bailey
Date: Fri, 16 Aug 2024 15:57:28 -0700
Subject: [PATCH 26/27] update release notes and version for desispec/0.65.0

---
 doc/changes.rst         | 20 +++++++++++++++++++-
 py/desispec/_version.py |  2 +-
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/doc/changes.rst b/doc/changes.rst
index 2305d665f..25321709b 100644
--- a/doc/changes.rst
+++ b/doc/changes.rst
@@ -2,7 +2,12 @@ desispec Change Log
 ===================
 
-0.64.1 (unreleased)
+0.65.1 (unreleased)
+-------------------
+
+* No changes yet.
+
+0.65.0 (2024-08-16)
 -------------------
 
 * healpix grouping memory and I/O efficiency improvements (PR `#2290`_).
@@ -21,6 +26,13 @@ desispec Change Log
   of DESI_SPECTRO_CALIB yaml files (PR `#2313`_).
 * Use read noise estimated per CCD row for some amplifiers (PR `#2314`_).
 * Allow certain old redrock files in daily reductions to be read (PR `#2317`_).
+* Fix assemble_fibermap and unit tests for old data without dark models
+  (PR `#2319`_).
+* merge_psf option to not include input psf for sims (PR `#2320`_).
+* Add cross-night dependency tracking for cumulative ztile jobs (PR `#2321`_).
+* Modernize submit_prod (PR `#2322`_).
+* FIBERSTATUS VARIABLETHRU bad for sky and stdstars (PR `#2323`_).
+* faflavor2program for special tiles: fix other to bright or dark (PR `#2325`_).
 
 .. _`#2290`: https://github.com/desihub/desispec/pull/2290
 .. _`#2294`: https://github.com/desihub/desispec/pull/2294
@@ -31,6 +43,12 @@ desispec Change Log
 .. _`#2313`: https://github.com/desihub/desispec/pull/2313
 .. _`#2314`: https://github.com/desihub/desispec/pull/2314
 .. _`#2317`: https://github.com/desihub/desispec/pull/2317
+.. _`#2319`: https://github.com/desihub/desispec/pull/2319
+.. _`#2320`: https://github.com/desihub/desispec/pull/2320
+.. _`#2321`: https://github.com/desihub/desispec/pull/2321
+.. _`#2322`: https://github.com/desihub/desispec/pull/2322
+.. _`#2323`: https://github.com/desihub/desispec/pull/2323
+.. _`#2325`: https://github.com/desihub/desispec/pull/2325
 
 0.64.0 (2024-07-01)
 -------------------
diff --git a/py/desispec/_version.py b/py/desispec/_version.py
index 6267bafbf..0b9d1ca44 100644
--- a/py/desispec/_version.py
+++ b/py/desispec/_version.py
@@ -1 +1 @@
-__version__ = '0.64.0.dev8624'
+__version__ = '0.65.0'

From 2f63761ff67085e7453f6d7a033fbd4547f64ae1 Mon Sep 17 00:00:00 2001
From: Stephen Bailey
Date: Fri, 16 Aug 2024 15:58:10 -0700
Subject: [PATCH 27/27] update dev version after 0.65.0 tag

---
 py/desispec/_version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/py/desispec/_version.py b/py/desispec/_version.py
index 0b9d1ca44..998562ab9 100644
--- a/py/desispec/_version.py
+++ b/py/desispec/_version.py
@@ -1 +1 @@
-__version__ = '0.65.0'
+__version__ = '0.65.0.dev8717'