# compute_sdss_pca.py

"""
Example of downloading and processing SDSS spectra
--------------------------------------------------

This is the code used to create the files fetched by the routine
:func:`fetch_sdss_corrected_spectra`.  Be aware that this routine
downloads a large amount of data (~700MB for 4000 spectra) and takes
a long time to run (~30 minutes for 4000 spectra).
"""
# Author: Jake VanderPlas <vanderplas@astro.washington.edu>
# License: BSD
#   This code is an example from astroML: see http://astroML.github.com
# Modified March 2019 by Stephen Portillo <sportill@uw.edu>
from __future__ import print_function, division

import sys
from astroML.py3k_compat import HTTPError
import numpy as np
from astroML.datasets import fetch_sdss_spectrum
from astroML.datasets.tools import query_plate_mjd_fiber, TARGET_GALAXY, TARGET_QSO_CAP, TARGET_QSO_SKIRT
from astroML.dimensionality import iterative_pca


def fetch_and_shift_spectra(n_spectra,
                            outfile,
                            primtarget=TARGET_GALAXY,
                            zlim=(0, 0.7),
                            loglam_start=3.5,
                            loglam_end=3.9,
                            Nlam=1000,
                            data_home='/epyc/users/sportill/specAE/cache'):
    """
    Query CAS for matching spectra, download them, and resample each one
    in its rest frame onto a common log-wavelength grid.

    Parameters
    ----------
    n_spectra : int
        Number of spectra requested from ``query_plate_mjd_fiber``.
    outfile : str
        Path handed to ``np.savez`` for the resulting archive.
    primtarget : int
        Target selection flag(s) forwarded to ``query_plate_mjd_fiber``.
    zlim : tuple of float
        ``(zmin, zmax)``; forwarded to the query and re-checked against
        each downloaded spectrum's ``z``.
    loglam_start, loglam_end : float
        Bounds used to derive the rebinning coefficients ``coeff0`` and
        ``coeff1`` (presumably log10 of wavelength -- confirm against
        ``rebin``).
    Nlam : int
        Number of bins in the common grid.
    data_home : str
        Cache directory forwarded to ``fetch_sdss_spectrum``.  The default
        is the path that was previously hard-coded in this function.

    Notes
    -----
    Spectra that cannot be downloaded (``HTTPError``), fall outside
    ``zlim``, or rebin to all zeros are skipped and reported on stdout.
    The saved archive contains the keys ``spectra``, ``mask``,
    ``spec_err``, ``coeff0``, ``coeff1``, ``spec_cln``, ``lineindex_cln``,
    ``log_NII_Ha``, ``log_OIII_Hb``, ``z``, ``zerr``, ``plate``, ``mjd``
    and ``fiber``, truncated to the spectra actually kept.
    """
    # First query for the list of spectra to download
    plate, mjd, fiber = query_plate_mjd_fiber(n_spectra, primtarget,
                                              zlim[0], zlim[1])

    # Size everything by the number of rows actually returned, so the
    # buffers and the loop bound below can never disagree.
    n_spectra = len(plate)

    # Set up arrays to hold information gathered from the spectra
    spec_cln = np.zeros(n_spectra, dtype=np.int32)
    lineindex_cln = np.zeros(n_spectra, dtype=np.int32)

    log_NII_Ha = np.zeros(n_spectra, dtype=np.float32)
    log_OIII_Hb = np.zeros(n_spectra, dtype=np.float32)

    z = np.zeros(n_spectra, dtype=np.float32)
    zerr = np.zeros(n_spectra, dtype=np.float32)
    spectra = np.zeros((n_spectra, Nlam), dtype=np.float32)
    # builtin bool: the ``np.bool`` alias is absent from NumPy 1.24-1.26
    mask = np.zeros((n_spectra, Nlam), dtype=bool)
    specerr = np.zeros((n_spectra, Nlam), dtype=np.float32)

    # also save plate, mjd, fiber to allow reference to SDSS data
    plates = np.zeros(n_spectra, dtype=np.int32)
    mjds = np.zeros(n_spectra, dtype=np.int32)
    fibers = np.zeros(n_spectra, dtype=np.int32)

    # Calculate new wavelength coefficients
    new_coeff0 = loglam_start
    new_coeff1 = (loglam_end - loglam_start) / Nlam

    # Now download all the needed spectra, and resample to a common
    # wavelength bin.  ``j`` counts the spectra kept so far and is the
    # write position, so skipped spectra leave no gaps in the arrays.
    num_skipped = 0
    j = 0

    for i in range(n_spectra):
        sys.stdout.write(' %i / %i spectra\r' % (i + 1, n_spectra))
        sys.stdout.flush()

        ident = (plate[i], mjd[i], fiber[i])

        try:
            spec = fetch_sdss_spectrum(ident[0], ident[1], ident[2],
                                       data_home=data_home)
        except HTTPError:
            num_skipped += 1
            print("%i, %i, %i not found" % ident)
            continue

        # Check the redshift before rebinning so that no work is spent
        # on a spectrum that will be discarded anyway.
        if spec.z < zlim[0] or spec.z > zlim[1]:
            num_skipped += 1
            print("%i, %i, %i outside redshift range" % ident)
            continue

        spec_rebin = spec.restframe().rebin(new_coeff0, new_coeff1, Nlam)

        if np.all(spec_rebin.spectrum == 0):
            num_skipped += 1
            print("%i, %i, %i is all zero" % ident)
            continue

        spec_cln[j] = spec.spec_cln

        lineindex_cln[j], (log_NII_Ha[j], log_OIII_Hb[j])\
            = spec.lineratio_index()

        z[j] = spec.z
        zerr[j] = spec.zerr

        spectra[j] = spec_rebin.spectrum
        mask[j] = spec_rebin.compute_mask(0.5, 5)
        # Sanity check: at least one mask entry must be False
        # (presumably False marks a usable pixel -- confirm against
        # compute_mask).
        assert (mask[j] == 0).any()
        specerr[j] = spec_rebin.error

        plates[j], mjds[j], fibers[j] = ident

        j += 1
    sys.stdout.write('\n')

    N = j
    print("   %i spectra skipped" % num_skipped)
    print("   %i spectra processed" % N)
    print("saving to %s" % outfile)

    # Only the first N rows were filled; drop the unused tail.
    np.savez(outfile,
             spectra=spectra[:N],
             mask=mask[:N],
             spec_err=specerr[:N],
             coeff0=new_coeff0,
             coeff1=new_coeff1,
             spec_cln=spec_cln[:N],
             lineindex_cln=lineindex_cln[:N],
             log_NII_Ha=log_NII_Ha[:N],
             log_OIII_Hb=log_OIII_Hb[:N],
             z=z[:N],
             zerr=zerr[:N],
             plate=plates[:N],
             mjd=mjds[:N],
             fiber=fibers[:N])


def spec_iterative_pca(outfile, n_ev=10, n_iter=20, norm='L2'):
    """
    Run an iterative PCA on the spectra stored in ``outfile`` and write
    the results back into the same file.

    Parameters
    ----------
    outfile : str
        Path of the ``.npz`` archive to read and then overwrite.  It must
        contain the ``spectra`` and ``mask`` arrays.
    n_ev : int
        Forwarded to ``iterative_pca`` as ``n_ev``.
    n_iter : int
        Forwarded to ``iterative_pca`` as ``n_iter``.
    norm : str
        Forwarded to ``iterative_pca`` as ``norm``.

    Notes
    -----
    Every array already present in the archive is preserved; the keys
    ``mu``, ``evecs``, ``evals``, ``norms`` and ``coeffs`` are added (or
    replaced) from elements 1-5 of the ``iterative_pca`` result.
    """
    # Read every array into memory and close the archive before it is
    # overwritten below; NpzFile loads lazily and otherwise keeps the
    # file handle open.
    with np.load(outfile) as data_in:
        output = {key: data_in[key] for key in data_in.files}

    res = iterative_pca(output['spectra'], output['mask'],
                        n_ev=n_ev, n_iter=n_iter, norm=norm,
                        full_output=True)

    # don't save the reconstructed spectrum (res[0]): this can easily
    # be recomputed from the other parameters.
    output['mu'] = res[1]
    output['evecs'] = res[2]
    output['evals'] = res[3]
    output['norms'] = res[4]
    output['coeffs'] = res[5]

    np.savez(outfile, **output)


if __name__ == '__main__':
    # Sample selection: the main galaxy sample plus low-redshift quasars,
    # with the redshift capped at 0.36.
    targets = TARGET_GALAXY + TARGET_QSO_CAP + TARGET_QSO_SKIRT
    archive = 'spec64k.npz'

    fetch_and_shift_spectra(64000, archive,
                            primtarget=targets,
                            zlim=(0, 0.36),
                            loglam_start=3.53,
                            loglam_end=3.92)
    spec_iterative_pca(archive)