dmu12/likelihood_ratio.py

'''
Adopting the liklihood ratio code from https://github.com/nudomarinero/mltier1 to use for crossmatching radio catalogues to the HELP masterlist

The original code was used in Williams et al 2018 using the method described by Fleuren et al 2012
'''
import numpy as np
import astropy
from astropy.table import Table, hstack
from astropy import units as u
from astropy.coordinates import SkyCoord, search_around_sky
from IPython.display import clear_output
from scipy.optimize import curve_fit
import pickle
import os
from pymoc import MOC
from matplotlib import pyplot as plt

#import mltier which is a non standard HELP module 

from mltier import (get_center, get_n_m, estimate_q_m, Field, SingleMLEstimator, MultiMLEstimator,
                     parallel_process, get_sigma, get_q_m, get_threshold, q0_min_level, q0_min_numbers)


'''
Runs an adapted version of the liklihood ratio code from https://github.com/nudomarinero/mltier1 to crossmatch radio catalogues to the HELP masterlist

The original code was used in Williams et al 2018 using the method described by Fleuren et al 2012

INPUTS
------------
cat1 - the radio catalogue
cat2 - the masterlist
cat1ra - column name of the ra column for cat1
cat1dec - column name of the dec column for cat1
cat2ra - column name of the ra column for cat2
cat2dec - column name of the dec column for cat2
cat2mag - column name of the magnitude of the sources in cat2 in the band that is being used for the crossmatching (must be a magnitude or your crossmatch will be incorrect)
Q0_i - the value of Q0 used for the crossmatching
radius - the radius used for the crossmatching in arcseconds
ra_down - the ra that defines the left side of a rectangle encompasing the area the crossmatching will be done in (this parameter is redundant if a moc is provided)
ra_up - the ra that defines the right side of a rectangle encompasing the area the crossmatching will be done in (this parameter is redundant if a moc is provided)
dec_down - the dec that defines the bottom side of a rectangle encompasing the area the crossmatching will be done in (this parameter is redundant if a moc is provided)
dec_up - the dec that defines the upper side of a rectangle encompassing the area the crossmatching will be done in (this parameter is redundant if a moc is provided)
moc - a moc defining the area the crossmatching will be done in 

OUTPUTS
------------
new_table - A masterlist table of the crossmatched sources containing all the columns from cat1 and cat2
'''
def likelihood_ratio(cat1,cat2,cat1ra,cat1dec,cat2ra,cat2dec,cat2mag,Q0_i,radius,
                     ra_down,ra_up,dec_down,dec_up,moc=None):


    field = Field(ra_down, ra_up, dec_down, dec_up, moc)
    
    radio_150mhz = cat1
    masterlist = cat2

    # Filter radio catalogue with the field 
    radio = field.filter_catalogue(radio_150mhz, colnames=(cat1ra,cat1dec))
    print('number of radio sources being crossmatched is {}'.format(len(radio)))

    # Filter the masterlist with the field
    masterlist = field.filter_catalogue(masterlist, colnames=(cat2ra,cat2dec))
    print('number of masterlist sources being crossmatched too is {}'.format(len(masterlist)))

    coords_masterlist = SkyCoord(masterlist[cat2ra], 
                           masterlist[cat2dec], 
                           unit=(u.deg, u.deg), 
                           frame='icrs')

    coords_radio = SkyCoord(radio[cat1ra], 
                       radio[cat1dec], 
                       unit=(u.deg, u.deg), 
                       frame='icrs')

    masterlist[cat2mag][np.isnan(masterlist[cat2mag])] = 0

    catalogue_i = masterlist

    mask = catalogue_i[cat2mag] != 0 
    bin_list_i = np.arange(np.min(catalogue_i[mask][cat2mag]), np.max(catalogue_i[mask][cat2mag]), 0.5) # Bins of 0.5

    center_i = get_center(bin_list_i)
    # get the mid point of each bin i.e if there is a bin from 1 to 4 then the centre will be at 2.5

    n_m_i = get_n_m(catalogue_i[cat2mag], bin_list_i, field.area)
    # get a cumulative distribution of the magnitudes divided by the field area
    q_m_i = estimate_q_m(catalogue_i[cat2mag], bin_list_i, n_m_i, coords_radio, coords_masterlist, radius=radius)
    
    # ### $Q_0$ and likelihood estimators
    q0 = Q0_i
    likelihood_ratio_i = SingleMLEstimator(Q0_i, n_m_i, q_m_i, center_i)
     
    idx_radio, idx_i, d2d, d3d = search_around_sky(
        coords_radio, coords_masterlist, radius*u.arcsec)
    
    idx_radio_unique = np.unique(idx_radio)

    radio["lr_i"] = np.nan                   # Likelihood ratio
    radio["lr_dist_i"] = np.nan              # Distance to the selected source
    radio["lr_index_i"] = np.nan             # Index of the source in masterlist
    
    total_sources = len(idx_radio_unique)
    masterlist_aux_index = np.arange(len(masterlist))
    
    def ml(i):
        
        idx_0 = idx_i[idx_radio == i]
    
        d2d_0 = d2d[idx_radio == i]
    
        i_mag = catalogue_i[cat2mag][idx_0]
        
        
        radio_ra = radio[i]["RA"]
        radio_dec = radio[i]["DEC"]
        radio_pa = radio[i]["PA"]
        radio_maj_err = radio[i]["E_Maj"]
        radio_min_err = radio[i]["E_Min"]
        c_ra = catalogue_i["ra"][idx_0]
        c_dec = catalogue_i["dec"][idx_0]
        # Error for masterlist given as the true error is difficult to compute due to the composite nature of the masterlist
        c_ra_err = 1/3600
        c_dec_err = 1/3600
        
        sigma = get_sigma(radio_maj_err, radio_min_err, radio_pa, 
                          radio_ra, radio_dec, 
                          c_ra, c_dec, c_ra_err, c_dec_err)
        
        lr_0 = likelihood_ratio_i(i_mag, d2d_0.arcsec, sigma)
        
        if len(lr_0) == 0:
            result = [np.nan,np.nan,np.nan]
            return result,np.nan
        chosen_index = np.argmax(lr_0)
        lr_sum = np.sum(lr_0)
        rel = lr_0/(lr_sum + (1-q0))
        result = [masterlist_aux_index[idx_0[chosen_index]], # Index
                  (d2d_0.arcsec)[chosen_index],              # distance
                  lr_0[chosen_index]]                        # LR

    
        return (result,rel[chosen_index])
    
    print('starting the crossmatch')
    res = []
    rel = []
    nomatches = 0
    for i in range(len(radio)):
        result,reliab = ml(i)
        if result == [np.nan,np.nan,np.nan]:
            nomatches = nomatches + 1
            continue
        res.append(result)
        rel.append(reliab)
    
    print('number of sources with no possible match is {}'.format(nomatches))
    
    (radio["lr_index_i"][idx_radio_unique], 
     radio["lr_dist_i"][idx_radio_unique], 
     radio["lr_i"][idx_radio_unique]) = list(map(list, zip(*res)))
    
    
    # #### Threshold and selection for i-band
    
    radio["lr_i"][np.isnan(radio["lr_i"])] = 0
    
    threshold_i = np.percentile(radio[radio['lr_i'] !=0]["lr_i"], 100*(1 - Q0_i))
    print('liklihood ratio threshold using Q0 only is {}'.format(threshold_i))
    
    radio["lrt"] = radio["lr_i"]
    radio["lrt"][np.isnan(radio["lr_i"])] = 0
    
    print('updating threshold of liklihood ratio that can be trusted')      
    def completeness(lr, threshold, q0):
        n = len(lr)
        lrt = lr[lr < threshold]
        return 1. - np.sum((q0 * lrt)/(q0 * lrt + (1 - q0)))/float(n)
    
    def reliability(lr, threshold, q0):
        n = len(lr)
        lrt = lr[lr > threshold]
        return 1. - np.sum((1. - q0)/(q0 * lrt + (1 - q0)))/float(n)
    
    completeness_v = np.vectorize(completeness, excluded=[0])
    reliability_v = np.vectorize(reliability, excluded=[0])
    
    n_test = 100
    threshold_mean = np.percentile(radio["lrt"], 100*(1 - q0))
    
    thresholds = np.arange(0., 10., 0.01)
    thresholds_fine = np.arange(0.1, 1., 0.001)
    
    completeness_t = completeness_v(radio["lrt"], thresholds, q0)
    reliability_t = reliability_v(radio["lrt"], thresholds, q0)
    average_t = (completeness_t + reliability_t)/2
    
    completeness_t_fine = completeness_v(radio["lrt"], thresholds_fine, q0)
    reliability_t_fine = reliability_v(radio["lrt"], thresholds_fine, q0)
    average_t_fine = (completeness_t_fine + reliability_t_fine)/2
    
    threshold_sel = thresholds_fine[np.argmax(average_t_fine)]
    print('liklihood ratio updated threshold considering completeness and reliability is {}'.format(threshold_sel))
    

        # ##### create a table containing the radio sources whose likelihood matches were above the new threshold and create a table of the data for the counterparts and then merge then using hstack
    print('creating masterlist table of cross matched sources')
    radio_match = radio[~np.isnan(radio['lr_dist_i'])]
    radio_match = radio_match[radio_match['lr_i']>threshold_sel]

    lockman_match = masterlist[radio_match['lr_index_i'].quantity.value.astype(int)]
    
    new_table = astropy.table.hstack([radio_match,lockman_match],join_type='outer')
    print('number of crossmatched sources is {}'.format(len(new_table)))

    return(new_table)