Merge branch 'master' into add_FRF_noise_cov

HERA-Team · Oct 22, 2024 · 3eecfad · 3eecfad
2 parents a3de339 + 3dc44e9
commit 3eecfad
Show file tree

Hide file tree

Showing 3 changed files with 300 additions and 332 deletions.
diff --git a/notebooks/delay_filtered_average_zscore.ipynb b/notebooks/delay_filtered_average_zscore.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Single File Delay Filtered Average Z-Score\n",
     "\n",
-    "**by Josh Dillon and Tyler Cox**, last updated October 10, 2024\n",
+    "**by Josh Dillon and Tyler Cox**, last updated October 18, 2024\n",
     "\n",
     "This notebook is designed to calculate a metric used for finding low-level RFI in redundantly-averaged cross-correlations, which are then incoherently averaged across well-sampled baselines.\n",
     "\n",
@@ -289,15 +289,16 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "zscore = {}\n",
-    "for pol in ['ee', 'nn']:\n",
-    "    abs_SNRs_this_pol = [np.abs(dly_filt_SNRs[bl]) for bl in bls_to_filter if (bl[2] == pol) and (bl[0] != bl[1])]\n",
-    "    \n",
-    "    predicted_mean = 1.0\n",
-    "    sigma = predicted_mean * np.sqrt(2 / np.pi)\n",
-    "    variance_expected = (4 - np.pi) / 2 * sigma**2 / len(abs_SNRs_this_pol)\n",
+    "if not ALL_FLAGGED:\n",
+    "    zscore = {}\n",
+    "    for pol in ['ee', 'nn']:\n",
+    "        abs_SNRs_this_pol = [np.abs(dly_filt_SNRs[bl]) for bl in bls_to_filter if (bl[2] == pol) and (bl[0] != bl[1])]\n",
+    "\n",
+    "        predicted_mean = 1.0\n",
+    "        sigma = predicted_mean * np.sqrt(2 / np.pi)\n",
+    "        variance_expected = (4 - np.pi) / 2 * sigma**2 / len(abs_SNRs_this_pol)\n",
     "\n",
-    "    zscore[pol] = (np.nanmean(abs_SNRs_this_pol, axis=0) - predicted_mean) / variance_expected**.5"
+    "        zscore[pol] = (np.nanmean(abs_SNRs_this_pol, axis=0) - predicted_mean) / variance_expected**.5"
    ]
   },
   {

diff --git a/notebooks/full_day_rfi_round_2.ipynb b/notebooks/full_day_rfi_round_2.ipynb
@@ -7,7 +7,7 @@
    "source": [
     "# Second Round of Full Day RFI Flagging\n",
     "\n",
-    "**by Josh Dillon**, last updated July 31, 2023\n",
+    "**by Josh Dillon**, last updated October 13, 2024\n",
     "\n",
     "This notebook is synthesizes information from individual [delay_filtered_average_zscore](https://github.com/HERA-Team/hera_notebook_templates/blob/master/notebooks/delay_filtered_average_zscore.ipynb) notebooks to find low-level RFI and flag it. That notebook takes `smooth_cal`ibrated data, redundantly averages it, performs a high-pass delay filter, and then incoherently averages across baselines, creating a per-polarization z-score. This notebook then takes that whole night of z-scores and finds a new set of flags to both add to the `smooth_cal` files, which are updated in place, and to write down as new `UVFlag` waterfall-type `.h5` files.\n",
     "\n",
@@ -53,6 +53,7 @@
     "from hera_cal import utils\n",
     "from hera_qm import xrfi\n",
     "from hera_qm.time_series_metrics import true_stretches\n",
+    "from hera_filters import dspec\n",
     "\n",
     "from IPython.display import display, HTML\n",
     "%matplotlib inline\n",
@@ -91,13 +92,18 @@
     "out_yaml_file = os.path.join(OUT_YAML_DIR, SUM_FILE.split('.')[-4] + OUT_YAML_SUFFIX)    \n",
     "\n",
     "# get flagging parameters\n",
-    "Z_THRESH = float(os.environ.get(\"Z_THRESH\", 5))\n",
-    "WS_Z_THRESH = float(os.environ.get(\"WS_Z_THRESH\", 4))\n",
+    "Z_THRESH = float(os.environ.get(\"Z_THRESH\", 4))\n",
+    "WS_Z_THRESH = float(os.environ.get(\"WS_Z_THRESH\", 2))\n",
     "AVG_Z_THRESH = float(os.environ.get(\"AVG_Z_THRESH\", 1))\n",
     "MAX_FREQ_FLAG_FRAC = float(os.environ.get(\"MAX_FREQ_FLAG_FRAC\", .25))\n",
     "MAX_TIME_FLAG_FRAC = float(os.environ.get(\"MAX_TIME_FLAG_FRAC\", .1))\n",
+    "AVG_SPECTRUM_FILTER_DELAY = float(os.environ.get(\"AVG_SPECTRUM_FILTER_DELAY\", 250)) # in ns\n",
+    "EIGENVAL_CUTOFF = float(os.environ.get(\"EIGENVAL_CUTOFF\", 1e-12))\n",
+    "TIME_AVG_DELAY_FILT_SNR_THRESH = float(os.environ.get(\"TIME_AVG_DELAY_FILT_SNR_THRESH\", 4.0))\n",
+    "TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE = float(os.environ.get(\"TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE\", 1.5))\n",
     "\n",
-    "for setting in ['Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'MAX_FREQ_FLAG_FRAC', 'MAX_TIME_FLAG_FRAC']:\n",
+    "for setting in ['Z_THRESH', 'WS_Z_THRESH', 'AVG_Z_THRESH', 'MAX_FREQ_FLAG_FRAC', 'MAX_TIME_FLAG_FRAC', 'AVG_SPECTRUM_FILTER_DELAY',\n",
+    "               'EIGENVAL_CUTOFF', 'TIME_AVG_DELAY_FILT_SNR_THRESH', 'TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE']:\n",
     "    print(f'{setting} = {eval(setting)}')"
    ]
   },
@@ -184,12 +190,12 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def plot_max_z_score(zscore, flags=None):\n",
+    "def plot_max_z_score(zscore, flags=None, vmin=-5, vmax=5):\n",
     "    if flags is None:\n",
     "        flags = np.any(~np.isfinite(list(zscore.values())), axis=0)\n",
     "    plt.figure(figsize=(14,10), dpi=100)\n",
     "    plt.imshow(np.where(flags, np.nan, np.nanmax([zscore['ee'], zscore['nn']], axis=0)), aspect='auto', \n",
-    "               cmap='coolwarm', interpolation='none', vmin=-10, vmax=10, extent=extent)\n",
+    "               cmap='coolwarm', interpolation='none', vmin=vmin, vmax=vmax, extent=extent)\n",
     "    plt.colorbar(location='top', label='Max z-score of either polarization', extend='both', aspect=40, pad=.02)\n",
     "    plt.xlabel('Frequency (MHz)')\n",
     "    plt.ylabel(f'JD - {int(times[0])}')\n",
@@ -210,9 +216,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "bf8ae866",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "plot_max_z_score(zscore)"
@@ -278,14 +282,14 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def iteratively_flag_on_averaged_zscore(flags, zscore, avg_z_thresh=1.5, verbose=True):\n",
+    "def iteratively_flag_on_averaged_zscore(flags, zscore, avg_func=np.nanmean, avg_z_thresh=AVG_Z_THRESH, verbose=True):\n",
     "    '''Flag whole integrations or channels based on average z-score. This is done\n",
     "    iteratively to prevent bad times affecting channel averages or vice versa.'''\n",
     "    flagged_chan_count = 0\n",
     "    flagged_int_count = 0\n",
     "    while True:\n",
-    "        zspec = np.nanmean(np.where(flags, np.nan, zscore), axis=0)\n",
-    "        ztseries = np.nanmean(np.where(flags, np.nan, zscore), axis=1)\n",
+    "        zspec = avg_func(np.where(flags, np.nan, zscore), axis=0)\n",
+    "        ztseries = avg_func(np.where(flags, np.nan, zscore), axis=1)\n",
     "\n",
     "        if (np.nanmax(zspec) < avg_z_thresh) and (np.nanmax(ztseries) < avg_z_thresh):\n",
     "            break\n",
@@ -300,21 +304,68 @@
     "    if verbose:\n",
     "        print(f'\\tFlagging an additional {flagged_int_count} integrations and {flagged_chan_count} channels.')\n",
     "\n",
-    "def impose_max_chan_flag_frac(flags, max_flag_frac=.25, verbose=True):\n",
+    "def impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True):\n",
     "    '''Flag channels already flagged more than max_flag_frac (excluding completely flagged times).'''\n",
     "    unflagged_times = ~np.all(flags, axis=1)\n",
     "    frequently_flagged_chans =  np.mean(flags[unflagged_times, :], axis=0) >= max_flag_frac\n",
     "    if verbose:\n",
     "        print(f'\\tFlagging {np.sum(frequently_flagged_chans) - np.sum(np.all(flags, axis=0))} channels previously flagged {max_flag_frac:.2%} or more.')        \n",
     "    flags[:, frequently_flagged_chans] = True \n",
     "        \n",
-    "def impose_max_time_flag_frac(flags, max_flag_frac=.25, verbose=True):\n",
+    "def impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True):\n",
     "    '''Flag times already flagged more than max_flag_frac (excluding completely flagged channels).'''\n",
     "    unflagged_chans = ~np.all(flags, axis=0)\n",
     "    frequently_flagged_times =  np.mean(flags[:, unflagged_chans], axis=1) >= max_flag_frac\n",
     "    if verbose:\n",
     "        print(f'\\tFlagging {np.sum(frequently_flagged_times) - np.sum(np.all(flags, axis=1))} times previously flagged {max_flag_frac:.2%} or more.')\n",
-    "    flags[frequently_flagged_times, :] = True             "
+    "    flags[frequently_flagged_times, :] = True\n",
+    "\n",
+    "def time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF):\n",
+    "    \"\"\"Produces SNRs after time-averaging z-scores and delay filtering, accounting for flagging's effect on the filter.\"\"\"\n",
+    "    # figure out high and low band based on FM gap at 100 MHz\n",
+    "    flagged_stretches = true_stretches(np.all(flags, axis=0))\n",
+    "    FM_gap = [fs for fs in flagged_stretches if fs.start <= np.argmin(np.abs(freqs - 100e6)) < fs.stop][0]\n",
+    "    low_band = slice((0 if flagged_stretches[0].start != 0 else flagged_stretches[0].stop), FM_gap.start)\n",
+    "    high_band = slice(FM_gap.stop, (len(freqs) if flagged_stretches[-1].stop != len(freqs) else flagged_stretches[-1].start))\n",
+    "    \n",
+    "    filt_SNR = {}\n",
+    "    for pol in zscore:\n",
+    "        # calculate timeavg_SNR and filter\n",
+    "        noise_prediction = 1.0 / np.sum(~flags, axis=0)**.5\n",
+    "        timeavg_SNR = np.nanmean(np.where(flags, np.nan, zscore[pol] / noise_prediction), axis=0) \n",
+    "        wgts = np.where(np.isfinite(timeavg_SNR), 1, 0)\n",
+    "        model = np.zeros_like(timeavg_SNR)\n",
+    "        for band in [low_band, high_band]:\n",
+    "            model[band], _, _ = dspec.fourier_filter(freqs[band], np.where(np.isfinite(timeavg_SNR[band]), timeavg_SNR[band], 0),\n",
+    "                                                     wgts[band], [0], [AVG_SPECTRUM_FILTER_DELAY / 1e9], mode=\"dpss_solve\", \n",
+    "                                                     eigenval_cutoff=[EIGENVAL_CUTOFF], suppression_factors=[EIGENVAL_CUTOFF])\n",
+    "        filt_SNR[pol] = timeavg_SNR - model\n",
+    "\n",
+    "        # correct for impact of filter\n",
+    "        correction_factors = np.ones_like(wgts) * np.nan\n",
+    "        for band in [low_band, high_band]:\n",
+    "            X = dspec.dpss_operator(freqs[band], [0], filter_half_widths=[AVG_SPECTRUM_FILTER_DELAY / 1e9], eigenval_cutoff=[EIGENVAL_CUTOFF])[0]\n",
+    "            W = wgts[band]\n",
+    "            leverage = np.diag(X @ np.linalg.pinv(np.dot(X.T * W, X)) @ (X.T * W))\n",
+    "            correction_factors[band] = np.where(leverage > 0, (1 - leverage)**.5, np.nan) # because the underlying data should be gaussian\n",
+    "        filt_SNR[pol] /= correction_factors\n",
+    "    \n",
+    "    return filt_SNR\n",
+    "\n",
+    "def iteratively_flag_on_delay_filtered_time_avg_zscore(flags, thresh=TIME_AVG_DELAY_FILT_SNR_THRESH, dynamic_range=TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE,\n",
+    "                                                       filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF):\n",
+    "    \"\"\"Flag whole channels based on their outlierness after delay-filterd time-averaged zscores.\n",
+    "    This is done iteratively since the delay filter can be unduly influenced by large outliers.\"\"\"\n",
+    "    filt_SNR = time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)\n",
+    "    while True:\n",
+    "        largest_SNR = np.nanmax(list(filt_SNR.values()))\n",
+    "        if largest_SNR < thresh:\n",
+    "            break\n",
+    "        # \n",
+    "        cut = np.max([thresh, largest_SNR / dynamic_range])\n",
+    "        for pol in filt_SNR:\n",
+    "            flags[:, filt_SNR[pol] > cut] = True\n",
+    "        filt_SNR = time_avg_zscore_dly_filt_SNRs(flags, filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)"
    ]
   },
   {
@@ -327,6 +378,17 @@
     "flags = np.any(~np.isfinite(list(zscore.values())), axis=0)\n",
     "print(f'{np.mean(flags):.3%} of waterfall flagged to start.')\n",
     "\n",
+    "# flag whole integrations or channels using outliers in median\n",
+    "while True:\n",
+    "    nflags = np.sum(flags)\n",
+    "    for pol in ['ee', 'nn']:    \n",
+    "        iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_func=np.nanmedian, avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
+    "        impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)\n",
+    "        impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)\n",
+    "    if np.sum(flags) == nflags:\n",
+    "        break  \n",
+    "print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with median z > {AVG_Z_THRESH}.')\n",
+    "\n",
     "# flag largest outliers\n",
     "for pol in ['ee', 'nn']:\n",
     "    flags |= (zscore[pol] > Z_THRESH) \n",
@@ -339,18 +401,32 @@
     "        flags |= xrfi._ws_flag_waterfall(zscore[pol], flags, WS_Z_THRESH)\n",
     "    if np.sum(flags) == nflags:\n",
     "        break\n",
-    "print(f'{np.mean(flags):.3%} of waterfall flagged after watershed flagging on z > {WS_Z_THRESH} neightbors of prior flags.')\n",
+    "print(f'{np.mean(flags):.3%} of waterfall flagged after watershed flagging on z > {WS_Z_THRESH} neighbors of prior flags.')\n",
     "        \n",
-    "# flag whole integrations or channels\n",
+    "# flag whole integrations or channels using outliers in mean\n",
     "while True:\n",
     "    nflags = np.sum(flags)\n",
     "    for pol in ['ee', 'nn']:    \n",
-    "        iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
+    "        iteratively_flag_on_averaged_zscore(flags, zscore[pol], avg_func=np.nanmean, avg_z_thresh=AVG_Z_THRESH, verbose=True)\n",
     "        impose_max_chan_flag_frac(flags, max_flag_frac=MAX_FREQ_FLAG_FRAC, verbose=True)\n",
     "        impose_max_time_flag_frac(flags, max_flag_frac=MAX_TIME_FLAG_FRAC, verbose=True)\n",
     "    if np.sum(flags) == nflags:\n",
     "        break  \n",
-    "print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with average z > {AVG_Z_THRESH}.')"
+    "print(f'{np.mean(flags):.3%} of waterfall flagged after flagging whole times and channels with average z > {AVG_Z_THRESH}.')\n",
+    "\n",
+    "# flag channels based on delay filter\n",
+    "iteratively_flag_on_delay_filtered_time_avg_zscore(flags, thresh=TIME_AVG_DELAY_FILT_SNR_THRESH, dynamic_range=TIME_AVG_DELAY_FILT_SNR_DYNAMIC_RANGE,\n",
+    "                                                   filter_delay=AVG_SPECTRUM_FILTER_DELAY, eigenval_cutoff=EIGENVAL_CUTOFF)\n",
+    "print(f'{np.mean(flags):.3%} of flagging channels that are {TIME_AVG_DELAY_FILT_SNR_THRESH}σ outliers after delay filtering the time average.')\n",
+    "\n",
+    "# watershed flagging again\n",
+    "while True:\n",
+    "    nflags = np.sum(flags)\n",
+    "    for pol in ['ee', 'nn']:\n",
+    "        flags |= xrfi._ws_flag_waterfall(zscore[pol], flags, WS_Z_THRESH)\n",
+    "    if np.sum(flags) == nflags:\n",
+    "        break\n",
+    "print(f'{np.mean(flags):.3%} of waterfall flagged after another round of watershed flagging on z > {WS_Z_THRESH} neighbors of prior flags.')"
    ]
   },
   {
@@ -388,15 +464,15 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def zscore_spectra():\n",
+    "def zscore_spectra(ylim=[-3, 3], flags=flags):\n",
     "    fig, axes = plt.subplots(2, 1, figsize=(14,6), dpi=100, sharex=True, sharey=True, gridspec_kw={'hspace': 0})\n",
     "    for ax, pol in zip(axes, ['ee', 'nn']):\n",
     "\n",
     "        ax.plot(freqs / 1e6, np.nanmean(zscore[pol], axis=0),'r', label=f'{pol}-Polarization Before Round 2 Flagging', lw=.5)\n",
     "        ax.plot(freqs / 1e6, np.nanmean(np.where(flags, np.nan, zscore[pol]), axis=0), label=f'{pol}-Polarization After Round 2 Flagging')\n",
     "        ax.legend(loc='lower right')\n",
     "        ax.set_ylabel('Time-Averged Z-Score\\n(Excluding Flags)')\n",
-    "        ax.set_ylim(-11, 11)\n",
+    "        ax.set_ylim(ylim)\n",
     "    axes[1].set_xlabel('Frequency (MHz)')\n",
     "    plt.tight_layout()"
    ]
@@ -428,7 +504,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "def summarize_flagging():\n",
+    "def summarize_flagging(flags=flags):\n",
     "    plt.figure(figsize=(14,10), dpi=100)\n",
     "    cmap = matplotlib.colors.ListedColormap(((0, 0, 0),) + matplotlib.cm.get_cmap(\"Set2\").colors[0:2])\n",
     "    plt.imshow(np.where(np.any(~np.isfinite(list(zscore.values())), axis=0), 1, np.where(flags, 2, 0)), \n",
@@ -562,9 +638,7 @@
    "cell_type": "code",
    "execution_count": null,
    "id": "2b67d967",
-   "metadata": {
-    "scrolled": true
-   },
+   "metadata": {},
    "outputs": [],
    "source": [
     "for repo in ['hera_cal', 'hera_qm', 'hera_filters', 'hera_notebook_templates', 'pyuvdata']:\n",
@@ -599,7 +673,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.12.0"
   },
   "toc": {
    "base_numbering": 1,