Merge pull request #284 from nasa-fornax/add_herschel

Add herschel_get_spec functionality to spectroscopy notebook
nasa-fornax · Jul 24, 2024 · d617cea · d617cea
2 parents 2f95285 + 410a8c3
commit d617cea
Show file tree

Hide file tree

Showing 2 changed files with 171 additions and 4 deletions.
diff --git a/spectroscopy/code_src/herschel_functions.py b/spectroscopy/code_src/herschel_functions.py
@@ -0,0 +1,152 @@
+## Herschel PACS & SPIRE (from ESA TAP)
+from astroquery.esa.hsa import HSA
+from astroquery.exceptions import LoginError
+from requests.exceptions import ChunkedEncodingError
+import tarfile 
+from astropy.io import fits
+import glob
+import pandas as pd
+import astropy.constants as const
+from astropy import units as u
+import os
+
+from data_structures_spec import MultiIndexDFObject
+
+def find_max_flux_column(df):
+  """
+  Analyzes a DataFrame with flux columns and returns the column with the largest sum.
+
+  Args:
+      df (pandas.DataFrame): The DataFrame containing columns with "flux" in the name.
+
+  Returns:
+      str: The name of the column with the largest sum of values containing "flux".
+  """
+
+  # Filter column names containing "flux"
+  flux_cols = [col for col in df.columns if "flux" in col.lower()]
+
+  # Check if any flux columns are found
+  if not flux_cols:
+    raise ValueError("No columns containing 'flux' found in the DataFrame")
+
+  # Calculate the sum of each flux column
+  flux_sums = {col: df[col].sum() for col in flux_cols}
+
+  # Find the column with the largest sum
+  max_flux_col = max(flux_sums, key=flux_sums.get)
+
+  return max_flux_col
+
+
+def Herschel_get_spec(sample_table, search_radius_arcsec, datadir, delete_tarfiles = False):
+    '''
+    Retrieves Herschel spectra from a subset of modes for a list of sources.
+
+    Parameters
+    ----------
+    sample_table : `~astropy.table.Table`
+        Table with the coordinates and journal reference labels of the sources
+    search_radius_arcsec : `float`
+        Search radius in arcseconds.
+    datadir : `str`
+        Data directory where to store the data. Each function will create a
+        separate data directory (for example "[datadir]/HST/" for HST data).
+    delete_tarfiles: True/False
+        Should the tarfiles be deteled after spectra are extracted?
+        
+    Returns
+    -------
+    df_spec : MultiIndexDFObject
+        The main data structure to store all spectra
+        
+    '''
+
+    ## Initialize multi-index object:
+    df_spec = MultiIndexDFObject()
+
+    for stab in sample_table:
+        search_coords = stab["coord"]
+        print("working on object", stab["label"])
+
+        #first find the object ids from herschel then download the data for each observation id
+        #query_hsa_tap doesn't accept an upload_table, so do this so do this as a for loop over each instrument and object..
+
+        for instrument_name in ['PACS', 'SPIRE']:
+            querystring = "select observation_id from hsa.v_active_observation join hsa.instrument using (instrument_oid) where contains(point('ICRS', hsa.v_active_observation.ra, hsa.v_active_observation.dec), circle('ICRS', "+str(search_coords.ra.deg)+", " + str(search_coords.dec.deg) +", " + str(search_radius_arcsec) +"))=1 and hsa.instrument.instrument_name='"+str(instrument_name)+"'"
+            objectid_table = HSA.query_hsa_tap(querystring)
+
+            #download_data only accepts one observation_id so we need to loop over each observation_id
+            for tab_id in range(len(objectid_table)):
+                observation_id = str(objectid_table[tab_id]['observation_id'])
+                try: 
+                    HSA.download_data(observation_id=observation_id, retrieval_type='OBSERVATION', 
+                            instrument_name=instrument_name, product_level = "LEVEL2, LEVEL_2_5, LEVEL_3", download_dir = datadir)
+
+                    #ok, now we have the tar files, need to read those into the right data structure
+                    #first untar
+                    path_to_file = f"{datadir}/{observation_id}.tar"
+
+                    object = tarfile.open(path_to_file, 'r')
+                    #there are a million files!!! how do I know which one I need?
+                    #only grab the files which have the final spectra in them = "HPSSPEC" in directory name
+                    #not all modes have a final spectrum (cubes?) 
+                    for member in object.getmembers():
+                         if "HPSSPEC"  in member.name: 
+                            path_to_final_dir = f'data/herschel/final_spectrum{observation_id}'
+                            object.extract(member, path = path_to_final_dir)
+
+                            for directory_name in os.listdir(path_to_final_dir):
+
+                                for fits_file_path in glob.glob(f"{path_to_final_dir}/{directory_name}/{observation_id}/level*/HPSSPEC*/herschel*/*"):
+                                    #open the fits file
+                                    hdulist = fits.open(fits_file_path) 
+
+                                    #convert final spectrum to pandas dataframe
+                                    df = pd.DataFrame(hdulist[1].data)
+                                    #There are multiple flux columns; figure out which flux column to use
+                                    #advice from https://www.cosmos.esa.int/documents/12133/996891/Product+decision+trees 
+                                    #is to use the flux coluimn with the most flux
+                                    max_flux = find_max_flux_column(df)
+                                    #use the corresponding uncertainty column
+                                    max_error =  max_flux.replace("Flux", "Error")
+
+                                    #convert to cgs units for saving and plotting
+                                    flux_Jy = df[max_flux].to_numpy() * u.Jy
+                                    wavelength = df.wave[0] * u.micrometer #single wavelength for conversion to cgs
+                                    flux_cgs = flux_Jy.to(u.erg / u.second / (u.centimeter**2) / u.hertz) * (const.c.to(u.angstrom/u.second)) / (wavelength.to(u.angstrom)**2)
+                                    flux_cgs = flux_cgs.to(u.erg / u.second / (u.centimeter**2) / u.angstrom)
+
+                                    flux_err_Jy = df[max_error].to_numpy() * u.Jy
+                                    flux_err_cgs = flux_err_Jy.to(u.erg / u.second / (u.centimeter**2) / u.hertz) * (const.c.to(u.angstrom/u.second)) / (wavelength.to(u.angstrom)**2)
+                                    flux_err_cgs = flux_err_cgs.to(u.erg / u.second / (u.centimeter**2) / u.angstrom)
+
+                                    wave = df.wave.to_numpy() * u.micrometer
+                                    wave = wave.to(u.angstrom)
+                                    #build the df with this object's spectrum from this instrument
+                                    dfsingle = pd.DataFrame(dict(wave=[wave] , flux=[flux_cgs], err=[flux_err_cgs],
+                                                label=[stab["label"]],
+                                                objectid=[stab["objectid"]],
+                                                mission=["Herschel"],
+                                                instrument=[instrument_name],
+                                                filter=[df["band"][0]],
+                                                )).set_index(["objectid", "label", "filter", "mission"])
+
+
+                                    df_spec.append(dfsingle)
+
+                except LoginError:
+                    print("This observation is proprietary, which might mean that it is calibration data")
+                except:
+                    print("Connection to the ESA archive broken")
+
+                #delete tar files
+                if delete_tarfiles:
+                    filename_tar = f"data/herschel/{objectid_table[tab_id]['observation_id']}.tar"
+                    print('filename_tar', filename_tar)
+                    if os.path.exists(filename_tar):
+                        print('removing tar file')
+                        os.remove(filename_tar)
+
+    return df_spec
+
diff --git a/spectroscopy/spectra_generator.md b/spectroscopy/spectra_generator.md
@@ -42,7 +42,8 @@ The notebook may focus on the COSMOS field for now, which has a large overlap of
 | ------- | ------- | ----------- | ------------ | ------ |
 | IRSA    | Keck    | About 10,000 spectra on the COSMOS field from [Hasinger et al. (2018)](https://ui.adsabs.harvard.edu/abs/2018ApJ...858...77H/abstract) | [IRSA Archive](https://irsa.ipac.caltech.edu/cgi-bin/Gator/nph-scan?projshort=COSMOS) | Implemented with `astroquery.ipac.irsa`. (Table gives URLs to spectrum FITS files.) Note: only implemented for absolute calibrated spectra. |
 | IRSA    | Spitzer IRS | ~17,000 merged low-resolution IRS spectra | [IRS Enhanced Product](https://irsa.ipac.caltech.edu/cgi-bin/Gator/nph-dd?catalog=irs_enhv211) | Implemented with `astroquery.ipac.irsa`. (Table gives URLs to spectrum IPAC tables.) |
-| IRSA    | Herschel*    | Some spectra, need to check reduction stage | | |
+| IRSA    | IRTF*        | Large library of stellar spectra | | does `astroquery.ipac.irsa` work?? |
+| ESA    | Herschel*    | Some spectra | | implemented with [astroquery](https://astroquery.readthedocs.io/en/latest/esa/hsa/hsa.html) |
 | IRSA    | Euclid      | Spectra hosted at IRSA in FY25 -> preparation for ingestion | | Will use mock spectra with correct format for testing |
 | IRSA    | SPHEREx     | Spectra/cubes will be hosted at IRSA, first release in FY25 -> preparation for ingestion | | Will use mock spectra with correct format for testing |
 | MAST    | HST*         | Slitless spectra would need reduction and extraction. There are some reduced slit spectra from COS in the Hubble Archive | `astroquery.mast` | Implemented using `astroquery.mast` |
@@ -103,6 +104,7 @@ Andreas Faisst, Jessica Krick, Shoubaneh Hemmati, Troy Raen, Brigitta Sipőcz, D
 ## IMPORTS
 import sys, os
 import numpy as np
+import os
 
 import matplotlib.pyplot as plt
 import matplotlib as mpl
@@ -121,6 +123,7 @@ from sdss_functions import SDSS_get_spec
 from mast_functions import HST_get_spec, JWST_get_spec
 from keck_functions import KeckDEIMOS_get_spec
 from plot_functions import create_figures
+from herschel_functions import Herschel_get_spec
 ```
 
 ## 1. Define the sample
@@ -202,7 +205,7 @@ This archive includes spectra taken by
 
  &bull; Spitzer/IRS
 
- &bull; Herschel (not implemented, yet)
+
 
 
 ```python
@@ -235,14 +238,26 @@ df_spec_HST = HST_get_spec(sample_table , search_radius_arcsec = 0.5, datadir =
 df_spec.append(df_spec_HST)
 ```
 
+### 2.3 ESA Archive
+```python
+# Herschel PACS & SPIRE from ESA TAP using astroquery
+
+herschel_radius = 1.1  
+herschel_download_directory = 'data/herschel'
+if not os.path.exists(herschel_download_directory):
+    os.makedirs(herschel_download_directory, exist_ok=True)
+df_spec_herschel =  Herschel_get_spec(sample_table, herschel_radius, herschel_download_directory, delete_tarfiles = True)
+df_spec.append(df_spec_herschel)
+```
+
+### 2.4 SDSS Archive
 ```python
 %%time
 ## Get Spectra for JWST
 df_jwst = JWST_get_spec(sample_table , search_radius_arcsec = 0.5, datadir = "./data/", verbose = False)
 df_spec.append(df_jwst)
 ```
 
-### 2.3 SDSS Archive
 
 This includes SDSS spectra.
 
@@ -253,7 +268,7 @@ df_spec_SDSS = SDSS_get_spec(sample_table , search_radius_arcsec=5, data_release
 df_spec.append(df_spec_SDSS)
 ```
 
-### 2.4 DESI Archive
+### 2.5 DESI Archive
 
 This includes DESI spectra. Here, we use the `SPARCL` query. Note that this can also be used
 for SDSS searches, however, according to the SPARCL webpage, only up to DR16 is included. Therefore, we will not include SDSS DR16 here (this is treated in the SDSS search above).