Skip to content

Commit

Permalink
Merge branch 'master' into add_FRF_noise_cov
Browse files Browse the repository at this point in the history
  • Loading branch information
jsdillon committed Oct 22, 2024
2 parents a3de339 + 3dc44e9 commit 3eecfad
Show file tree
Hide file tree
Showing 3 changed files with 300 additions and 332 deletions.
19 changes: 10 additions & 9 deletions notebooks/delay_filtered_average_zscore.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"source": [
"# Single File Delay Filtered Average Z-Score\n",
"\n",
"**by Josh Dillon and Tyler Cox**, last updated October 10, 2024\n",
"**by Josh Dillon and Tyler Cox**, last updated October 18, 2024\n",
"\n",
"This notebook is designed to calculate a metric used for finding low-level RFI in redundantly-averaged cross-correlations, which are then incoherently averaged across well-sampled baselines.\n",
"\n",
Expand Down Expand Up @@ -289,15 +289,16 @@
"metadata": {},
"outputs": [],
"source": [
"zscore = {}\n",
"for pol in ['ee', 'nn']:\n",
" abs_SNRs_this_pol = [np.abs(dly_filt_SNRs[bl]) for bl in bls_to_filter if (bl[2] == pol) and (bl[0] != bl[1])]\n",
" \n",
" predicted_mean = 1.0\n",
" sigma = predicted_mean * np.sqrt(2 / np.pi)\n",
" variance_expected = (4 - np.pi) / 2 * sigma**2 / len(abs_SNRs_this_pol)\n",
"if not ALL_FLAGGED:\n",
" zscore = {}\n",
" for pol in ['ee', 'nn']:\n",
" abs_SNRs_this_pol = [np.abs(dly_filt_SNRs[bl]) for bl in bls_to_filter if (bl[2] == pol) and (bl[0] != bl[1])]\n",
"\n",
" predicted_mean = 1.0\n",
" sigma = predicted_mean * np.sqrt(2 / np.pi)\n",
" variance_expected = (4 - np.pi) / 2 * sigma**2 / len(abs_SNRs_this_pol)\n",
"\n",
" zscore[pol] = (np.nanmean(abs_SNRs_this_pol, axis=0) - predicted_mean) / variance_expected**.5"
" zscore[pol] = (np.nanmean(abs_SNRs_this_pol, axis=0) - predicted_mean) / variance_expected**.5"
]
},
{
Expand Down
126 changes: 100 additions & 26 deletions notebooks/full_day_rfi_round_2.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
"source": [
"# Second Round of Full Day RFI Flagging\n",
"\n",
"**by Josh Dillon**, last updated July 31, 2023\n",
"**by Josh Dillon**, last updated October 13, 2024\n",
"\n",
"This notebook is synthesizes information from individual [delay_filtered_average_zscore](https://github.com/HERA-Team/hera_notebook_templates/blob/master/notebooks/delay_filtered_average_zscore.ipynb) notebooks to find low-level RFI and flag it. That notebook takes `smooth_cal`ibrated data, redundantly averages it, performs a high-pass delay filter, and then incoherently averages across baselines, creating a per-polarization z-score. This notebook then takes that whole night of z-scores and finds a new set of flags to both add to the `smooth_cal` files, which are updated in place, and to write down as new `UVFlag` waterfall-type `.h5` files.\n",
"\n",
Expand Down Expand Up @@ -53,6 +53,7 @@
"from hera_cal import utils\n",
"from hera_qm import xrfi\n",
"from hera_qm.time_series_metrics import true_stretches\n",
"from hera_filters import dspec\n",
"\n",
"from IPython.display import display, HTML\n",
"%matplotlib inline\n",
Expand Down Expand Up @@ -91,13 +92,18 @@
"out_yaml_file = os.path.join(OUT_YAML_DIR, SUM_FILE.split('.')[-4] + OUT_YAML_SUFFIX) \n",
"\n",
"# get flagging parameters\n",
"Z_THRESH = float(os.environ.get(\"Z_THRESH\", 5))\n",
"WS_Z_THRESH = float(os.environ.get(\"WS_Z_THRESH\", 4))\n",
"Z_THRESH = float(os.environ.get(\"Z_THRESH\", 4))\n",
"WS_Z_THRESH = float(os.environ.get(\"WS_Z_THRESH\", 2))\n",
"AVG_Z_THRESH = float(os.environ.get(\"AVG_Z_THRESH\", 1))\n",
"MAX_FREQ_FLAG_FRAC = float(os.environ.get(\"MAX_FREQ_FLAG_FRAC\", .25))\n",
"MAX_TIME_FLAG_FRAC = float(os.environ.get(\"MAX_TIME_FLAG_FRAC\", .1))\n",
"AVG_SPECTRUM_FILTER_DELAY = float(os.environ.get(\"AVG_SPECTRUM_FILTER_DELAY\", 250)) # in ns\n",
"EIGENVAL_CUTOFF = float(os.environ.get(\"EIGENVAL_CUTOFF\", 1e-12))\n",
"TIME_AVG_DELAY_FILT_SNR_THRESH = float(os.environ.get(\"TIME_AVG_DELAY_FILT_SNR_THRESH\", 4.0))\n",
"TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE = float(os.environ.get(\"TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE\", 1.5))\n",
"\n",
"for setting in ['Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'MAX_FREQ_FLAG_FRAC', 'MAX_TIME_FLAG_FRAC']:\n",
"for setting in ['Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'MAX_FREQ_FLAG_FRAC', 'MAX_TIME_FLAG_FRAC', 'AVG_SPECTRUM_FILTER_DELAY',\n",
" 'EIGENVAL_CUTOFF', 'TIME_AVG_DELAY_FILT_SNR_THRESH', 'TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE']:\n",
" print(f'{setting} = {eval(setting)}')"
]
},
Expand Down Expand Up @@ -184,12 +190,12 @@
"metadata": {},
"outputs": [],
"source": [
"def plot_max_z_score(zscore, flags=None):\n",
"def plot_max_z_score(zscore, flags=None, vmin=-5, vmax=5):\n",
" if flags is None:\n",
" flags = np.any(~np.isfinite(list(zscore.values())), axis=0)\n",
" plt.figure(figsize=(14,10), dpi=100)\n",
" plt.imshow(np.where(flags, np.nan, np.nanmax([zscore['ee'], zscore['nn']], axis=0)), aspect='auto', \n",
" cmap='coolwarm', interpolation='none', vmin=-10, vmax=10, extent=extent)\n",
" cmap='coolwarm', interpolation='none', vmin=vmin, vmax=vmax, extent=extent)\n",
" plt.colorbar(location='top', label='Max z-score of either polarization', extend='both', aspect=40, pad=.02)\n",
" plt.xlabel('Frequency (MHz)')\n",
" plt.ylabel(f'JD - {int(times[0])}')\n",
Expand All @@ -210,9 +216,7 @@
"cell_type": "code",
"execution_count": null,
"id": "bf8ae866",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [],
"source": [
"plot_max_z_score(zscore)"
Expand Down Expand Up @@ -278,14 +282,14 @@
"metadata": {},
"outputs": [],
"source": [
"def iteratively_flag_on_averaged_zscore(flags, zscore, avg_z_thresh=1.5, verbose=True):\n",
"def iteratively_flag_on_averaged_zscore(flags, zscore, avg_func=np.nanmean, avg_z_thresh=AVG_Z_THRESH, verbose=True):\n",
" '''Flag whole integrations or channels based on average z-score. This is done\n",
" iteratively to prevent bad times affecting channel averages or vice versa.'''\n",
" flagged_chan_count = 0\n",
" flagged_int_count = 0\n",
" while True:\n",
" zspec = np.nanmean(np.where(flags, np.nan, zscore), axis=0)\n",
" ztseries = np.nanmean(np.where(flags, np.nan, zscore), axis=1)\n",
" zspec = avg_func(np.where(flags, np.nan, zscore), axis=0)\n",
" ztseries = avg_func(np.where(flags, np.nan, zscore), axis=1)\n",
"\n",
" if (np.nanmax(zspec) < avg_z_thresh) and (np.nanmax(ztseries) < avg_z_thresh):\n",
" break\n",
Expand All @@ -300,21 +304,68 @@
" if verbose:\n",
" print(f'\\tFlagging an additional {flagged_int_count} integrations and {flagged_chan_count} channels.')\n",
"\n",
"def impose_max_chan_flag_frac(flags, max_flag_frac=.25, verbose=True):\n",
"def impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True):\n",
" '''Flag channels already flagged more than max_flag_frac (excluding completely flagged times).'''\n",
" unflagged_times = ~np.all(flags, axis=1)\n",
" frequently_flagged_chans = np.mean(flags[unflagged_times, :], axis=0) >= max_flag_frac\n",
" if verbose:\n",
" print(f'\\tFlagging {np.sum(frequently_flagged_chans) - np.sum(np.all(flags, axis=0))} channels previously flagged {max_flag_frac:.2%} or more.') \n",
" flags[:, frequently_flagged_chans] = True \n",
" \n",
"def impose_max_time_flag_frac(flags, max_flag_frac=.25, verbose=True):\n",
"def impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True):\n",
" '''Flag times already flagged more than max_flag_frac (excluding completely flagged channels).'''\n",
" unflagged_chans = ~np.all(flags, axis=0)\n",
" frequently_flagged_times = np.mean(flags[:, unflagged_chans], axis=1) >= max_flag_frac\n",
" if verbose:\n",
" print(f'\\tFlagging {np.sum(frequently_flagged_times) - np.sum(np.all(flags, axis=1))} times previously flagged {max_flag_frac:.2%} or more.')\n",
" flags[frequently_flagged_times, :] = True "
" flags[frequently_flagged_times, :] = True\n",
"\n",
"def time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF):\n",
" \"\"\"Produces SNRs after time-averaging z-scores and delay filtering, accounting for flagging's effect on the filter.\"\"\"\n",
" # figure out high and low band based on FM gap at 100 MHz\n",
" flagged_stretches = true_stretches(np.all(flags, axis=0))\n",
" FM_gap = [fs for fs in flagged_stretches if fs.start <= np.argmin(np.abs(freqs - 100e6)) < fs.stop][0]\n",
" low_band = slice((0 if flagged_stretches[0].start != 0 else flagged_stretches[0].stop), FM_gap.start)\n",
" high_band = slice(FM_gap.stop, (len(freqs) if flagged_stretches[-1].stop != len(freqs) else flagged_stretches[-1].start))\n",
" \n",
" filt_SNR = {}\n",
" for pol in zscore:\n",
" # calculate timeavg_SNR and filter\n",
" noise_prediction = 1.0 / np.sum(~flags, axis=0)**.5\n",
" timeavg_SNR = np.nanmean(np.where(flags, np.nan, zscore[pol] / noise_prediction), axis=0) \n",
" wgts = np.where(np.isfinite(timeavg_SNR), 1, 0)\n",
" model = np.zeros_like(timeavg_SNR)\n",
" for band in [low_band, high_band]:\n",
" model[band], _, _ = dspec.fourier_filter(freqs[band], np.where(np.isfinite(timeavg_SNR[band]), timeavg_SNR[band], 0),\n",
" wgts[band], [0], [AVG_SPECTRUM_FILTER_DELAY / 1e9], mode=\"dpss_solve\", \n",
" eigenval_cutoff=[EIGENVAL_CUTOFF], suppression_factors=[EIGENVAL_CUTOFF])\n",
" filt_SNR[pol] = timeavg_SNR - model\n",
"\n",
" # correct for impact of filter\n",
" correction_factors = np.ones_like(wgts) * np.nan\n",
" for band in [low_band, high_band]:\n",
" X = dspec.dpss_operator(freqs[band], [0], filter_half_widths=[AVG_SPECTRUM_FILTER_DELAY / 1e9], eigenval_cutoff=[EIGENVAL_CUTOFF])[0]\n",
" W = wgts[band]\n",
" leverage = np.diag(X @ np.linalg.pinv(np.dot(X.T * W, X)) @ (X.T * W))\n",
" correction_factors[band] = np.where(leverage > 0, (1 - leverage)**.5, np.nan) # because the underlying data should be gaussian\n",
" filt_SNR[pol] /= correction_factors\n",
" \n",
" return filt_SNR\n",
"\n",
"def iteratively_flag_on_delay_filtered_time_avg_zscore(flags, thresh=TIME_AVG_DELAY_FILT_SNR_THRESH, dynamic_range=TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE,\n",
" filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF):\n",
" \"\"\"Flag whole channels based on their outlierness after delay-filterd time-averaged zscores.\n",
" This is done iteratively since the delay filter can be unduly influenced by large outliers.\"\"\"\n",
" filt_SNR = time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)\n",
" while True:\n",
" largest_SNR = np.nanmax(list(filt_SNR.values()))\n",
" if largest_SNR < thresh:\n",
" break\n",
" # \n",
" cut = np.max([thresh, largest_SNR / dynamic_range])\n",
" for pol in filt_SNR:\n",
" flags[:, filt_SNR[pol] > cut] = True\n",
" filt_SNR = time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)"
]
},
{
Expand All @@ -327,6 +378,17 @@
"flags = np.any(~np.isfinite(list(zscore.values())), axis=0)\n",
"print(f'{np.mean(flags):.3%} of waterfall flagged to start.')\n",
"\n",
"# flag whole integrations or channels using outliers in median\n",
"while True:\n",
" nflags = np.sum(flags)\n",
" for pol in ['ee', 'nn']: \n",
" iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_func=np.nanmedian, avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
" impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)\n",
" impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)\n",
" if np.sum(flags) == nflags:\n",
" break \n",
"print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with median z > {AVG_Z_THRESH}.')\n",
"\n",
"# flag largest outliers\n",
"for pol in ['ee', 'nn']:\n",
" flags |= (zscore[pol] > Z_THRESH) \n",
Expand All @@ -339,18 +401,32 @@
" flags |= xrfi._ws_flag_waterfall(zscore[pol], flags, WS_Z_THRESH)\n",
" if np.sum(flags) == nflags:\n",
" break\n",
"print(f'{np.mean(flags):.3%} of waterfall flagged after watershed flagging on z > {WS_Z_THRESH} neightbors of prior flags.')\n",
"print(f'{np.mean(flags):.3%} of waterfall flagged after watershed flagging on z > {WS_Z_THRESH} neighbors of prior flags.')\n",
" \n",
"# flag whole integrations or channels\n",
"# flag whole integrations or channels using outliers in mean\n",
"while True:\n",
" nflags = np.sum(flags)\n",
" for pol in ['ee', 'nn']: \n",
" iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
" iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_func=np.nanmean, avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
" impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)\n",
" impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)\n",
" if np.sum(flags) == nflags:\n",
" break \n",
"print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with average z > {AVG_Z_THRESH}.')"
"print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with average z > {AVG_Z_THRESH}.')\n",
"\n",
"# flag channels based on delay filter\n",
"iteratively_flag_on_delay_filtered_time_avg_zscore(flags, thresh=TIME_AVG_DELAY_FILT_SNR_THRESH, dynamic_range=TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE,\n",
" filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)\n",
"print(f'{np.mean(flags):.3%} of flagging channels that are {TIME_AVG_DELAY_FILT_SNR_THRESH}σ outliers after delay filtering the time average.')\n",
"\n",
"# watershed flagging again\n",
"while True:\n",
" nflags = np.sum(flags)\n",
" for pol in ['ee', 'nn']:\n",
" flags |= xrfi._ws_flag_waterfall(zscore[pol], flags, WS_Z_THRESH)\n",
" if np.sum(flags) == nflags:\n",
" break\n",
"print(f'{np.mean(flags):.3%} of waterfall flagged after another round of watershed flagging on z > {WS_Z_THRESH} neighbors of prior flags.')"
]
},
{
Expand Down Expand Up @@ -388,15 +464,15 @@
"metadata": {},
"outputs": [],
"source": [
"def zscore_spectra():\n",
"def zscore_spectra(ylim=[-3, 3], flags=flags):\n",
" fig, axes = plt.subplots(2, 1, figsize=(14,6), dpi=100, sharex=True, sharey=True, gridspec_kw={'hspace': 0})\n",
" for ax, pol in zip(axes, ['ee', 'nn']):\n",
"\n",
" ax.plot(freqs / 1e6, np.nanmean(zscore[pol], axis=0),'r', label=f'{pol}-Polarization Before Round 2 Flagging', lw=.5)\n",
" ax.plot(freqs / 1e6, np.nanmean(np.where(flags, np.nan, zscore[pol]), axis=0), label=f'{pol}-Polarization After Round 2 Flagging')\n",
" ax.legend(loc='lower right')\n",
" ax.set_ylabel('Time-Averged Z-Score\\n(Excluding Flags)')\n",
" ax.set_ylim(-11, 11)\n",
" ax.set_ylim(ylim)\n",
" axes[1].set_xlabel('Frequency (MHz)')\n",
" plt.tight_layout()"
]
Expand Down Expand Up @@ -428,7 +504,7 @@
"metadata": {},
"outputs": [],
"source": [
"def summarize_flagging():\n",
"def summarize_flagging(flags=flags):\n",
" plt.figure(figsize=(14,10), dpi=100)\n",
" cmap = matplotlib.colors.ListedColormap(((0, 0, 0),) + matplotlib.cm.get_cmap(\"Set2\").colors[0:2])\n",
" plt.imshow(np.where(np.any(~np.isfinite(list(zscore.values())), axis=0), 1, np.where(flags, 2, 0)), \n",
Expand Down Expand Up @@ -562,9 +638,7 @@
"cell_type": "code",
"execution_count": null,
"id": "2b67d967",
"metadata": {
"scrolled": true
},
"metadata": {},
"outputs": [],
"source": [
"for repo in ['hera_cal', 'hera_qm', 'hera_filters', 'hera_notebook_templates', 'pyuvdata']:\n",
Expand Down Expand Up @@ -599,7 +673,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.12.0"
},
"toc": {
"base_numbering": 1,
Expand Down
Loading

0 comments on commit 3eecfad

Please sign in to comment.