-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* checkpoint * checkpoint * testing for MacauffArguments * create boilerplate for macauff runner + tests * remove commented out test * add __future__.annotations * linter problems * add more tests for missing coverage * refactor MacauffArguments required parameter tests * address more comments from pr #152 * add dask pytest mark + black formatting
- Loading branch information
1 parent
f8ede50
commit 46ee668
Showing
7 changed files
with
421 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
from __future__ import annotations | ||
|
||
from dataclasses import dataclass, field | ||
from os import path | ||
from typing import List | ||
|
||
from hipscat.io import FilePointer, file_io | ||
from hipscat.io.validation import is_valid_catalog | ||
|
||
from hipscat_import.runtime_arguments import RuntimeArguments | ||
|
||
# pylint: disable=too-many-instance-attributes | ||
# pylint: disable=unsupported-binary-operation | ||
|
||
|
||
@dataclass | ||
class MacauffArguments(RuntimeArguments): | ||
"""Data class for holding cross-match association arguments""" | ||
|
||
## Input - Cross-match data | ||
input_path: FilePointer | None = None | ||
"""path to search for the input data""" | ||
input_format: str = "" | ||
"""specifier of the input data format. this will be used to find an appropriate | ||
InputReader type, and may be used to find input files, via a match like | ||
``<input_path>/*<input_format>`` """ | ||
input_file_list: List[FilePointer] = field(default_factory=list) | ||
"""can be used instead of `input_format` to import only specified files""" | ||
input_paths: List[FilePointer] = field(default_factory=list) | ||
"""resolved list of all files that will be used in the importer""" | ||
add_hipscat_index: bool = True | ||
"""add the hipscat spatial index field alongside the data""" | ||
|
||
## Input - Left catalog | ||
left_catalog_dir: str = "" | ||
left_id_column: str = "" | ||
left_ra_column: str = "" | ||
left_dec_column: str = "" | ||
|
||
## Input - Right catalog | ||
right_catalog_dir: str = "" | ||
right_id_column: str = "" | ||
right_ra_column: str = "" | ||
right_dec_column: str = "" | ||
|
||
## `macauff` specific attributes | ||
metadata_file_path: str = "" | ||
match_probability_columns: List[str] = field(default_factory=list) | ||
column_names: List[str] = field(default_factory=list) | ||
|
||
def __post_init__(self): | ||
self._check_arguments() | ||
|
||
def _check_arguments(self): | ||
super()._check_arguments() | ||
|
||
if not self.input_path and not self.input_file_list: | ||
raise ValueError("input_path nor input_file_list not provided") | ||
if not self.input_format: | ||
raise ValueError("input_format is required") | ||
|
||
if not self.left_catalog_dir: | ||
raise ValueError("left_catalog_dir is required") | ||
if not self.left_id_column: | ||
raise ValueError("left_id_column is required") | ||
if not self.left_ra_column: | ||
raise ValueError("left_ra_column is required") | ||
if not self.left_dec_column: | ||
raise ValueError("left_dec_column is required") | ||
if not is_valid_catalog(self.left_catalog_dir): | ||
raise ValueError("left_catalog_dir not a valid catalog") | ||
|
||
if not self.right_catalog_dir: | ||
raise ValueError("right_catalog_dir is required") | ||
if not self.right_id_column: | ||
raise ValueError("right_id_column is required") | ||
if not self.right_ra_column: | ||
raise ValueError("right_ra_column is required") | ||
if not self.right_dec_column: | ||
raise ValueError("right_dec_column is required") | ||
if not is_valid_catalog(self.right_catalog_dir): | ||
raise ValueError("right_catalog_dir not a valid catalog") | ||
|
||
if not self.metadata_file_path: | ||
raise ValueError("metadata_file_path required for macauff crossmatch") | ||
if not path.isfile(self.metadata_file_path): | ||
raise ValueError("Macauff column metadata file must point to valid file path.") | ||
|
||
# Basic checks complete - make more checks and create directories where necessary | ||
if self.input_path: | ||
if not file_io.does_file_or_directory_exist(self.input_path): | ||
raise FileNotFoundError("input_path not found on local storage") | ||
self.input_paths = file_io.find_files_matching_path(self.input_path, f"*{self.input_format}") | ||
elif self.input_file_list: | ||
self.input_paths = self.input_file_list | ||
if len(self.input_paths) == 0: | ||
raise FileNotFoundError("No input files found") | ||
|
||
self.column_names = self.get_column_names() | ||
|
||
def get_column_names(self): | ||
"""Grab the macauff column names.""" | ||
# TODO: Actually read in the metadata file once we get the example file from Tom. | ||
|
||
return [ | ||
"Gaia_designation", | ||
"Gaia_RA", | ||
"Gaia_Dec", | ||
"BP", | ||
"G", | ||
"RP", | ||
"CatWISE_Name", | ||
"CatWISE_RA", | ||
"CatWISE_Dec", | ||
"W1", | ||
"W2", | ||
"match_p", | ||
"Separation", | ||
"eta", | ||
"xi", | ||
"Gaia_avg_cont", | ||
"CatWISE_avg_cont", | ||
"Gaia_cont_f1", | ||
"Gaia_cont_f10", | ||
"CatWISE_cont_f1", | ||
"CatWISE_cont_f10", | ||
"CatWISE_fit_sig", | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
from hipscat_import.cross_match.macauff_arguments import MacauffArguments | ||
|
||
# pylint: disable=unused-argument | ||
|
||
|
||
def run(args, client): | ||
"""run macauff cross-match import pipeline""" | ||
if not args: | ||
raise TypeError("args is required and should be type MacauffArguments") | ||
if not isinstance(args, MacauffArguments): | ||
raise TypeError("args must be type MacauffArguments") | ||
|
||
raise NotImplementedError("macauff pipeline not implemented yet.") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
219 changes: 219 additions & 0 deletions
219
tests/hipscat_import/cross_match/test_macauff_arguments.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,219 @@ | ||
"""Tests of macauff arguments""" | ||
|
||
|
||
from os import path | ||
|
||
import pytest | ||
|
||
from hipscat_import.cross_match.macauff_arguments import MacauffArguments | ||
|
||
# pylint: disable=duplicate-code | ||
|
||
|
||
def test_macauff_arguments( | ||
small_sky_object_catalog, small_sky_source_catalog, small_sky_dir, formats_yaml, tmp_path | ||
): | ||
"""Test that we can create a MacauffArguments instance with two valid catalogs.""" | ||
args = MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path=small_sky_dir, | ||
input_format="csv", | ||
metadata_file_path=formats_yaml, | ||
) | ||
|
||
assert len(args.input_paths) > 0 | ||
|
||
|
||
def test_empty_required( | ||
small_sky_object_catalog, small_sky_source_catalog, small_sky_dir, formats_yaml, tmp_path | ||
): | ||
"""All non-runtime arguments are required.""" | ||
|
||
## List of required args: | ||
## - match expression that should be found when missing | ||
## - default value | ||
required_args = [ | ||
["output_path", tmp_path], | ||
["output_catalog_name", "object_to_source"], | ||
["left_catalog_dir", small_sky_object_catalog], | ||
["left_ra_column", "ra"], | ||
["left_dec_column", "dec"], | ||
["left_id_column", "id"], | ||
["right_catalog_dir", small_sky_source_catalog], | ||
["right_ra_column", "source_ra"], | ||
["right_dec_column", "source_dec"], | ||
["right_id_column", "source_id"], | ||
["input_path", small_sky_dir], | ||
["input_format", "csv"], | ||
["metadata_file_path", formats_yaml], | ||
] | ||
|
||
## For each required argument, check that a ValueError is raised that matches the | ||
## expected name of the missing param. | ||
for index, args in enumerate(required_args): | ||
test_args = [ | ||
list_args[1] if list_index != index else None | ||
for list_index, list_args in enumerate(required_args) | ||
] | ||
|
||
print(f"testing required arg #{index}") | ||
|
||
with pytest.raises(ValueError, match=args[0]): | ||
MacauffArguments( | ||
output_path=test_args[0], | ||
output_catalog_name=test_args[1], | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=test_args[2], | ||
left_ra_column=test_args[3], | ||
left_dec_column=test_args[4], | ||
left_id_column=test_args[5], | ||
right_catalog_dir=test_args[6], | ||
right_ra_column=test_args[7], | ||
right_dec_column=test_args[8], | ||
right_id_column=test_args[9], | ||
input_path=test_args[10], | ||
input_format=test_args[11], | ||
metadata_file_path=test_args[12], | ||
overwrite=True, | ||
) | ||
|
||
|
||
def test_macauff_arguments_file_list( | ||
small_sky_object_catalog, small_sky_source_catalog, small_sky_dir, formats_yaml, tmp_path | ||
): | ||
"""Test that we can create a MacauffArguments instance with two valid catalogs.""" | ||
files = [path.join(small_sky_dir, "catalog.csv")] | ||
args = MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_file_list=files, | ||
input_format="csv", | ||
metadata_file_path=formats_yaml, | ||
) | ||
|
||
assert len(args.input_paths) > 0 | ||
|
||
|
||
def test_macauff_args_invalid_catalog(small_sky_source_catalog, small_sky_dir, formats_yaml, tmp_path): | ||
with pytest.raises(ValueError, match="left_catalog_dir"): | ||
MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_dir, # valid path, but not a catalog | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path=small_sky_dir, | ||
input_format="csv", | ||
metadata_file_path=formats_yaml, | ||
) | ||
|
||
|
||
def test_macauff_args_right_invalid_catalog(small_sky_object_catalog, small_sky_dir, formats_yaml, tmp_path): | ||
with pytest.raises(ValueError, match="right_catalog_dir"): | ||
MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_dir, # valid directory with files, not a catalog | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path=small_sky_dir, | ||
input_format="csv", | ||
metadata_file_path=formats_yaml, | ||
) | ||
|
||
|
||
def test_macauff_args_invalid_metadata_file( | ||
small_sky_object_catalog, small_sky_source_catalog, small_sky_dir, tmp_path | ||
): | ||
with pytest.raises(ValueError, match="column metadata file must"): | ||
MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path=small_sky_dir, | ||
input_format="csv", | ||
metadata_file_path="ceci_n_est_pas_un_fichier.xml", | ||
) | ||
|
||
|
||
def test_macauff_args_invalid_input_directory( | ||
small_sky_object_catalog, small_sky_source_catalog, formats_yaml, tmp_path | ||
): | ||
with pytest.raises(FileNotFoundError, match="input_path not found"): | ||
MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path="ceci_n_est_pas_un_directoire/", | ||
input_format="csv", | ||
metadata_file_path=formats_yaml, | ||
) | ||
|
||
|
||
def test_macauff_args_no_files( | ||
small_sky_object_catalog, small_sky_source_catalog, small_sky_dir, formats_yaml, tmp_path | ||
): | ||
with pytest.raises(FileNotFoundError, match="No input files found"): | ||
MacauffArguments( | ||
output_path=tmp_path, | ||
output_catalog_name="object_to_source", | ||
tmp_dir=tmp_path, | ||
left_catalog_dir=small_sky_object_catalog, | ||
left_ra_column="ra", | ||
left_dec_column="dec", | ||
left_id_column="id", | ||
right_catalog_dir=small_sky_source_catalog, | ||
right_ra_column="source_ra", | ||
right_dec_column="source_dec", | ||
right_id_column="source_id", | ||
input_path=small_sky_dir, | ||
input_format="parquet", # no files of this format will be found | ||
metadata_file_path=formats_yaml, | ||
) |
Oops, something went wrong.