-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial commit of metadata file conversion (#156)
* Initial commit of metadata file conversion * Move shared logic for locating input files. * Explicitly depend on pyyaml
- Loading branch information
1 parent
7038f7d
commit 0629cb4
Showing
10 changed files
with
389 additions
and
20 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
"""Utilities for generating parquet metdata from macauff-generated metadata files.""" | ||
|
||
import xml.etree.ElementTree as ET | ||
|
||
import pyarrow as pa | ||
import pyarrow.parquet as pq | ||
import yaml | ||
from hipscat.io import file_io | ||
|
||
|
||
def _get_inner_xml_value(parent_el, node_type, default_value):
    """Return the stripped text of the single ``node_type`` child of ``parent_el``.

    Args:
        parent_el (xml.etree.ElementTree.Element): element whose direct children
            are searched
        node_type (str): tag name of the child element to look for
        default_value (str): value returned when there is no such child

    Returns:
        ``default_value`` when no matching child exists; otherwise the child's
        text with surrounding whitespace removed. A child with no text at all
        (``<name/>`` or ``<name></name>``) yields the empty string, the same
        result as a child containing only whitespace.

    Raises:
        ValueError: if more than one matching child is found
    """
    child_el = parent_el.findall(node_type)
    if not child_el:
        return default_value
    if len(child_el) > 1:
        raise ValueError(f"found too many {node_type} XML elements")
    # ElementTree reports ``text`` as None (not "") for an element with no content.
    return (child_el[0].text or "").strip()
|
||
|
||
def _construct_field(name, units, metadata_dict):
    """Build a pyarrow field for one macauff column.

    The macauff unit descriptor is translated to an arrow type: ``string``
    stays a string, ``float``/``double`` become 64-bit floats and
    ``integer``/``long`` become 64-bit integers. ``metadata_dict`` is attached
    to the field as its metadata.

    Raises:
        ValueError: if ``units`` is not one of the descriptors listed above
    """
    if units == "string":
        return pa.field(name, pa.string(), metadata=metadata_dict)
    if units in ("float", "double"):
        return pa.field(name, pa.float64(), metadata=metadata_dict)
    if units in ("integer", "long"):
        return pa.field(name, pa.int64(), metadata=metadata_dict)
    raise ValueError(f"unhandled units {units}")
|
||
|
||
def from_xml(input_file, output_file):
    """Read XML file with column metadata for a cross-match file from macauff.

    Expects XML with the format::

        <columns>
            <column>
                <name>$COLUMN_NAME</name>
                <description>$COLUMN_DESCRIPTION</description>
                <units>$COLUMN_UNIT_DESCRIPTOR</units>
            </column>
        </columns>

    Every ``<column>`` directly under the root becomes one field of an empty
    parquet table written to ``output_file``. A column with no ``<name>``
    element is named ``column_{index}`` (its position among the columns), a
    missing ``<description>`` becomes the empty string, and a missing
    ``<units>`` is treated as ``string``.

    Args:
        input_file (str): file to read for match metadata
        output_file (str): desired location for output parquet metadata file

    Raises:
        ValueError: if a column repeats one of its child elements, or its
            units descriptor is not a handled type
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML
    """
    root_el = ET.parse(input_file).getroot()

    fields = []
    for col_index, column in enumerate(root_el.findall("column")):
        # Index-based placeholder keeps unnamed columns distinct from each other.
        name = _get_inner_xml_value(column, "name", f"column_{col_index}")
        description = _get_inner_xml_value(column, "description", "")
        units = _get_inner_xml_value(column, "units", "string")

        fields.append(_construct_field(name, units, metadata_dict={"macauff_description": description}))

    schema = pa.schema(fields)
    pq.write_table(schema.empty_table(), where=output_file)
|
||
|
||
def from_yaml(input_file, output_directory):
    """Read YAML file with column metadata for the various cross-match files from macauff.

    Expects YAML with the format::

        name: macauff_GaiaDR3xCatWISE2020
        description: Match and non-match table for macauff cross-matches of Gaia DR3 and CatWISE 2020.
        tables:
        - name: macauff_GaiaDR3xCatWISE2020_matches
          "@id": "#macauff_GaiaDR3xCatWISE2020_matches"
          description: Counterpart associations between Gaia and WISE
          columns:
          - name: gaia_source_id
            datatype: long
            description: The Gaia DR3 object ID.

    The column type is taken from the ``units`` key when present, otherwise
    from ``datatype``; a column with neither is treated as ``string``. All of a
    column's keys and values are stored (as strings) in the field metadata.

    Args:
        input_file (str): file to read for match metadata
        output_directory (str): desired location for output parquet metadata files.
            We will write one file per table in the "tables" element, named
            ``{table name}.parquet``.

    Raises:
        ValueError: if a column's type descriptor is not a handled type
    """
    with open(input_file, "r", encoding="utf-8") as file_handle:
        # safe_load returns None for an empty document; treat that as "no tables".
        metadata = yaml.safe_load(file_handle) or {}

    for index, table in enumerate(metadata.get("tables") or []):
        table_name = table.get("name", f"metadata_table_{index}")
        fields = []
        for col_index, column in enumerate(table.get("columns") or []):
            name = column.get("name", f"column_{col_index}")
            # The documented format spells the type as "datatype"; "units" is
            # still honoured first for files that already use it.
            units = column.get("units", column.get("datatype", "string"))
            # Arrow field metadata only accepts str/bytes, while YAML scalars
            # may be parsed as numbers, booleans or nested structures.
            column_metadata = {str(key): str(value) for key, value in column.items()}
            fields.append(_construct_field(name, units, metadata_dict=column_metadata))

        schema = pa.schema(fields)
        output_file = file_io.append_paths_to_pointer(output_directory, f"{table_name}.parquet")
        pq.write_table(schema.empty_table(), where=str(output_file))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
101 changes: 101 additions & 0 deletions
101
tests/hipscat_import/cross_match/test_macauff_metadata.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
import os | ||
from xml.etree.ElementTree import ParseError | ||
|
||
import pytest | ||
from hipscat.io import file_io | ||
|
||
from hipscat_import.cross_match.macauff_metadata import from_xml, from_yaml | ||
|
||
|
||
def test_from_xml(macauff_data_dir, tmp_path):
    """Convert the sample macauff XML column file and inspect the parquet schema it produces."""
    source_xml = os.path.join(macauff_data_dir, "macauff_gaia_catwise_match.xml")
    parquet_path = os.path.join(tmp_path, "output.parquet")

    from_xml(source_xml, parquet_path)

    # One schema field is expected per <column> element in the input file.
    arrow_schema = file_io.read_parquet_metadata(parquet_path).schema.to_arrow_schema()
    assert len(arrow_schema) == 6
|
||
|
||
def test_from_xml_malformed(tmp_path):
    """Feed from_xml several broken or sparse XML inputs and check how each one is handled."""
    input_file = os.path.join(tmp_path, "input.parquet")
    output_file = os.path.join(tmp_path, "output.parquet")

    def overwrite_input(contents):
        """Replace the contents of the shared input file with ``contents``."""
        with open(input_file, "w", encoding="utf-8") as file_handle:
            file_handle.write(contents)

    ## Completely empty file: there is no root element to parse
    overwrite_input("")
    with pytest.raises(ParseError, match="no element found"):
        from_xml(input_file, output_file)

    ## A column that repeats the <name> element
    overwrite_input(
        """<columns>
    <column>
        <name>Gaia_designation</name>
        <name>The Gaia DR3 object ID.</name>
        <units>long</units>
    </column>
</columns>"""
    )
    with pytest.raises(ValueError, match="too many name XML elements"):
        from_xml(input_file, output_file)

    ## A units descriptor with no matching arrow type
    overwrite_input(
        """<columns>
    <column>
        <name>Gaia_designation</name>
        <description>The Gaia DR3 object ID.</description>
        <units>blob</units>
    </column>
</columns>"""
    )
    with pytest.raises(ValueError, match="unhandled units blob"):
        from_xml(input_file, output_file)

    ## Blank name and missing description are accepted without error
    overwrite_input(
        """<columns>
    <column>
        <name> </name>
        <units>long </units>
    </column>
</columns>"""
    )
    from_xml(input_file, output_file)
|
||
|
||
def test_from_yaml(macauff_data_dir, tmp_path):
    """Convert the sample macauff YAML file and check the schema written for each table."""
    yaml_input_file = os.path.join(macauff_data_dir, "macauff_gaia_catwise_match_and_nonmatches.yaml")

    from_yaml(yaml_input_file, tmp_path)

    # (output file name, expected number of schema fields), checked in this order
    expected_field_counts = (
        ("macauff_GaiaDR3xCatWISE2020_matches.parquet", 7),
        ("macauff_GaiaDR3xCatWISE2020_gaia_nonmatches.parquet", 4),
        ("macauff_GaiaDR3xCatWISE2020_catwise_nonmatches.parquet", 4),
    )
    for file_name, field_count in expected_field_counts:
        output_file = os.path.join(tmp_path, file_name)
        table_metadata = file_io.read_parquet_metadata(output_file)
        schema = table_metadata.schema.to_arrow_schema()

        assert len(schema) == field_count
68 changes: 68 additions & 0 deletions
68
tests/hipscat_import/data/macauff/macauff_gaia_catwise_match.xml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
<columns> | ||
<column> | ||
<name> | ||
Gaia_designation | ||
</name> | ||
<description> | ||
The Gaia DR3 object ID. | ||
</description> | ||
<units> | ||
long | ||
</units> | ||
</column> | ||
<column> | ||
<name> | ||
Gaia_RA | ||
</name> | ||
<description> | ||
Right Ascension of the Gaia DR3 source. | ||
</description> | ||
<units> | ||
float | ||
</units> | ||
</column> | ||
<column> | ||
<name> | ||
Gaia_Dec | ||
</name> | ||
<description> | ||
The Gaia DR3 declination. | ||
</description> | ||
<units> | ||
float | ||
</units> | ||
</column> | ||
<column> | ||
<name> | ||
CatWISE_Name | ||
</name> | ||
<description> | ||
The object identifier from the CatWISE 2020 catalogue. | ||
</description> | ||
<units> | ||
string | ||
</units> | ||
</column> | ||
<column> | ||
<name> | ||
CatWISE_RA | ||
</name> | ||
<description> | ||
Right Ascension of the object as quoted by the CatWISE 2020 catalogue. | ||
</description> | ||
<units> | ||
float | ||
</units> | ||
</column> | ||
<column> | ||
<name> | ||
CatWISE_Dec | ||
</name> | ||
<description> | ||
CatWISE 2020 Declination. | ||
</description> | ||
<units> | ||
float | ||
</units> | ||
</column> | ||
</columns> |
Oops, something went wrong.