Skip to content

Commit

Permalink
Add validate CLI
Browse files Browse the repository at this point in the history
  • Loading branch information
pipliggins committed Aug 1, 2024
1 parent 4457674 commit ca283d3
Show file tree
Hide file tree
Showing 10 changed files with 111 additions and 5 deletions.
6 changes: 5 additions & 1 deletion fhirflat/__main__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import sys

from .ingest import main as ingest_to_flat
from .ingest import validate_cli as validate


def main():
Expand All @@ -10,16 +11,19 @@ def main():
Available subcommands:
transform - Convert raw data into FHIRflat files
validate - Validate FHIRflat files against FHIR schemas
"""
)
sys.exit(1)
subcommand = sys.argv[1]
if subcommand not in ["transform"]:
if subcommand not in ["transform", "validate"]:
print("fhirflat: unrecognised subcommand", subcommand)
sys.exit(1)
sys.argv = sys.argv[1:]
if subcommand == "transform":
ingest_to_flat()
elif subcommand == "validate":
validate()
else:
pass

Expand Down
65 changes: 65 additions & 0 deletions fhirflat/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -580,6 +580,49 @@ def convert_data_to_flat(
shutil.rmtree(folder_name)


def validate(folder_name: str, compress_format: str | None = None):
    """
    Takes a folder containing (optionally compressed) FHIRflat files and validates them
    against the FHIR. File names **must** correspond to the FHIR resource types they
    represent. E.g. a file containing Patient resources must be named "patient.parquet".

    Parameters
    ----------
    folder_name
        Path to the folder of FHIRflat parquet files, or to the archive file if
        `compress_format` is given.
    compress_format
        Archive format the folder is compressed in (a `shutil` format name such as
        "zip" or "gztar"). If given, `folder_name` is unpacked first and the
        validated output is re-archived as "<folder>_validated.<format>".
    """

    if compress_format:
        # shutil.unpack_archive's signature is (filename, extract_dir, format).
        # Unpack "<folder>.<ext>" into a sibling directory named "<folder>".
        # NOTE(review): with_suffix("") strips only the last suffix, so a
        # ".tar.gz" archive unpacks to "<folder>.tar" — confirm archive naming.
        directory = str(Path(folder_name).with_suffix(""))
        shutil.unpack_archive(
            folder_name, extract_dir=directory, format=compress_format
        )
    else:
        directory = folder_name

    for file in Path(directory).glob("*.parquet"):
        df = pd.read_parquet(file)
        resource = file.stem
        # File name (case-insensitively) selects the FHIR resource class.
        resource_type = get_local_resource(resource, case_insensitive=True)

        valid_flat, errors = resource_type.validate_fhirflat(df, return_files=True)

        if errors is not None:
            # Split the output: valid rows stay in parquet, errors go to CSV.
            valid_flat.to_parquet(os.path.join(directory, f"{resource}_valid.parquet"))
            errors.to_csv(
                os.path.join(directory, f"{resource}_errors.csv"), index=False
            )
            error_length = len(errors)
            print(
                f"{error_length} rows in {file.name} have validation errors. "
                f"Errors saved to {resource}_errors.csv. "
                f"Valid rows saved to {resource}_valid.parquet"
            )
        else:
            print(f"{file.name} is valid")
    print("Validation complete")

    if compress_format:
        # Archive the directory that actually holds the validated files
        # (make_archive's root_dir), then remove the unpacked directory.
        new_directory = directory + "_validated"
        shutil.make_archive(new_directory, compress_format, root_dir=directory)
        shutil.rmtree(directory)
        print(f"Validated files saved as {new_directory}.{compress_format}")


def main():
parser = argparse.ArgumentParser(
description="Convert data to FHIRflat parquet files",
Expand Down Expand Up @@ -637,5 +680,27 @@ def main():
)


def validate_cli():
    """CLI entry point for the ``fhirflat validate`` subcommand.

    Parses the target folder (and optional compression format) from
    ``sys.argv`` and hands them to :func:`validate`.
    """
    arg_parser = argparse.ArgumentParser(
        prog="fhirflat validate",
        description="Validate FHIRflat parquet files against the FHIR schema",
    )
    arg_parser.add_argument(
        "folder", help="File path to folder containing FHIRflat files"
    )
    arg_parser.add_argument(
        "-c",
        "--compress_format",
        choices=["zip", "tar", "gztar", "bztar", "xztar"],
        help="Format the folder is compressed in",
    )

    parsed = arg_parser.parse_args()

    validate(parsed.folder, compress_format=parsed.compress_format)


if __name__ == "__main__":
main()
7 changes: 5 additions & 2 deletions fhirflat/resources/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ def create_fhir_resource(

@classmethod
def validate_fhirflat(
cls, df: pd.DataFrame
cls, df: pd.DataFrame, return_files: bool = False
) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]:
"""
Takes a FHIRflat dataframe and validates the data against the FHIR
Expand All @@ -100,6 +100,9 @@ def validate_fhirflat(
----------
df: pd.DataFrame
Pandas dataframe containing the FHIRflat data
return_files: bool
If True, returns the valid FHIR resources & errors as a parquet file,
even if only one row is present in the dataframe.
Returns
-------
Expand All @@ -115,7 +118,7 @@ def validate_fhirflat(
lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1
).apply(lambda x: cls.create_fhir_resource(x))

if len(flat_df) == 1:
if len(flat_df) == 1 and return_files is False:
resource = flat_df["fhir"].iloc[0]
if isinstance(resource, ValidationError):
raise resource
Expand Down
9 changes: 7 additions & 2 deletions fhirflat/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,13 @@ def get_local_extension_type(t: str):
raise AttributeError(f"Could not find {t} in fhirflat extensions") from ae


def get_local_resource(t: str):
return getattr(fhirflat, t)
def get_local_resource(t: str, case_insensitive: bool = False):
    """
    Return the fhirflat resource class named ``t``.

    Parameters
    ----------
    t
        Resource name, e.g. "Patient".
    case_insensitive
        If True, match the name ignoring case (e.g. "patient" -> Patient).

    Raises
    ------
    AttributeError
        If no resource with that name exists in the fhirflat package.
    """
    if case_insensitive is False:
        return getattr(fhirflat, t)
    for a in dir(fhirflat):
        if a.lower() == t.lower():
            return getattr(fhirflat, a)
    # No case-insensitive match: fall back to the exact lookup so the caller
    # gets a clear AttributeError instead of a silent None (which would only
    # fail later with an opaque "'NoneType' has no attribute ..." error).
    return getattr(fhirflat, t)


def find_data_class(data_class, k):
Expand Down
Binary file not shown.
Binary file added tests/data/invalid_flat_bundle/encounter.parquet
Binary file not shown.
Binary file added tests/data/valid_flat_bundle/condition.parquet
Binary file not shown.
Binary file added tests/data/valid_flat_bundle/encounter.parquet
Binary file not shown.
Binary file added tests/data/valid_flat_bundle/patient.parquet
Binary file not shown.
29 changes: 29 additions & 0 deletions tests/test_ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
write_metadata,
checksum,
main,
validate,
)
from fhirflat.resources.encounter import Encounter
from fhirflat.resources.observation import Observation
Expand Down Expand Up @@ -1225,3 +1226,31 @@ def test_convert_data_to_flat_local_mapping_errors():
)

shutil.rmtree(output_folder)


def test_validate_valid(capsys):
    """All resource files in the valid bundle should pass validation."""
    validate("tests/data/valid_flat_bundle")

    output = capsys.readouterr().out
    expected_messages = (
        "encounter.parquet is valid",
        "condition.parquet is valid",
        "patient.parquet is valid",
        "Validation complete",
    )
    for message in expected_messages:
        assert message in output


def test_validate_invalid(capsys):
    """Invalid bundles should report error rows and write *_errors/*_valid files."""
    folder = "tests/data/invalid_flat_bundle"

    validate(folder)

    captured = capsys.readouterr()
    assert "encounter.parquet have validation errors" in captured.out
    assert "condition.parquet have validation errors" in captured.out
    assert "Validation complete" in captured.out

    # Clean up the per-resource output files written by validate().
    # (The original called Path.unlink unbound on a plain str, which only
    # works by accident of os.unlink duck-typing; use real Path objects.)
    for leftover in (
        "encounter_errors.csv",
        "encounter_valid.parquet",
        "condition_errors.csv",
        "condition_valid.parquet",
    ):
        Path(folder, leftover).unlink()

0 comments on commit ca283d3

Please sign in to comment.