diff --git a/fhirflat/__main__.py b/fhirflat/__main__.py
index 7832ca3..7dcec77 100644
--- a/fhirflat/__main__.py
+++ b/fhirflat/__main__.py
@@ -1,6 +1,7 @@
 import sys
 
 from .ingest import main as ingest_to_flat
+from .ingest import validate_cli as validate
 
 
 def main():
@@ -10,16 +11,19 @@ def main():
 
             Available subcommands:
                 transform - Convert raw data into FHIRflat files
+                validate - Validate FHIRflat files against FHIR schemas
             """
         )
         sys.exit(1)
 
     subcommand = sys.argv[1]
-    if subcommand not in ["transform"]:
+    if subcommand not in ["transform", "validate"]:
         print("fhirflat: unrecognised subcommand", subcommand)
         sys.exit(1)
     sys.argv = sys.argv[1:]
     if subcommand == "transform":
         ingest_to_flat()
+    elif subcommand == "validate":
+        validate()
     else:
         pass
diff --git a/fhirflat/ingest.py b/fhirflat/ingest.py
index 5e93bcc..875066a 100644
--- a/fhirflat/ingest.py
+++ b/fhirflat/ingest.py
@@ -580,6 +580,49 @@ def convert_data_to_flat(
         shutil.rmtree(folder_name)
 
 
+def validate(folder_name: str, compress_format: str | None = None):
+    """
+    Takes a folder containing (optionally compressed) FHIRflat files and validates them
+    against the FHIR schema. File names **must** correspond to the FHIR resource types
+    they represent, e.g. a file containing Patient resources must be named "patient.parquet".
+    """
+
+    if compress_format:
+        shutil.unpack_archive(folder_name, compress_format, folder_name)
+        directory = Path(folder_name).parents
+    else:
+        directory = folder_name
+
+    for file in Path(directory).glob("*.parquet"):
+        df = pd.read_parquet(file)
+        resource = file.stem
+        resource_type = get_local_resource(resource, case_insensitive=True)
+
+        valid_flat, errors = resource_type.validate_fhirflat(df, return_files=True)
+
+        if errors is not None:
+
+            valid_flat.to_parquet(os.path.join(directory, f"{resource}_valid.parquet"))
+            errors.to_csv(
+                os.path.join(directory, f"{resource}_errors.csv"), index=False
+            )
+            error_length = len(errors)
+            print(
+                f"{error_length} rows in {file.name} have validation errors. "
+                f"Errors saved to {resource}_errors.csv. "
" + f"Valid rows saved to {resource}_valid.parquet" + ) + else: + print(f"{file.name} is valid") + print("Validation complete") + + if compress_format: + new_directory = directory + "_validated" + shutil.make_archive(new_directory, compress_format, new_directory) + shutil.rmtree(directory) + print(f"Validated files saved as {new_directory}.{compress_format}") + + def main(): parser = argparse.ArgumentParser( description="Convert data to FHIRflat parquet files", @@ -637,5 +680,27 @@ def main(): ) +def validate_cli(): + parser = argparse.ArgumentParser( + description="Validate FHIRflat parquet files against the FHIR schema", + prog="fhirflat validate", + ) + parser.add_argument("folder", help="File path to folder containing FHIRflat files") + + parser.add_argument( + "-c", + "--compress_format", + help="Format the folder is compressed in", + choices=["zip", "tar", "gztar", "bztar", "xztar"], + ) + + args = parser.parse_args() + + validate( + args.folder, + compress_format=args.compress_format, + ) + + if __name__ == "__main__": main() diff --git a/fhirflat/resources/base.py b/fhirflat/resources/base.py index 59690e4..8d1f47d 100644 --- a/fhirflat/resources/base.py +++ b/fhirflat/resources/base.py @@ -88,7 +88,7 @@ def create_fhir_resource( @classmethod def validate_fhirflat( - cls, df: pd.DataFrame + cls, df: pd.DataFrame, return_files: bool = False ) -> tuple[FHIRFlatBase | list[FHIRFlatBase], pd.Series | None]: """ Takes a FHIRflat dataframe and validates the data against the FHIR @@ -100,6 +100,9 @@ def validate_fhirflat( ---------- df: pd.DataFrame Pandas dataframe containing the FHIRflat data + return_files: bool + If True, returns the valid FHIR resources & errors as a parquet file, + even if only one row is present in the dataframe. Returns ------- @@ -115,7 +118,7 @@ def validate_fhirflat( lambda row: row.to_json(date_format="iso", date_unit="s"), axis=1 ).apply(lambda x: cls.create_fhir_resource(x)) - if len(flat_df) == 1: + if len(flat_df) == 1 and return_files is False: resource = flat_df["fhir"].iloc[0] if isinstance(resource, ValidationError): raise resource diff --git a/fhirflat/util.py b/fhirflat/util.py index f778bde..f62c23e 100644 --- a/fhirflat/util.py +++ b/fhirflat/util.py @@ -70,8 +70,13 @@ def get_local_extension_type(t: str): raise AttributeError(f"Could not find {t} in fhirflat extensions") from ae -def get_local_resource(t: str): - return getattr(fhirflat, t) +def get_local_resource(t: str, case_insensitive: bool = False): + if case_insensitive is False: + return getattr(fhirflat, t) + else: + for a in dir(fhirflat): + if a.lower() == t.lower(): + return getattr(fhirflat, a) def find_data_class(data_class, k): diff --git a/tests/data/invalid_flat_bundle/condition.parquet b/tests/data/invalid_flat_bundle/condition.parquet new file mode 100644 index 0000000..f92c847 Binary files /dev/null and b/tests/data/invalid_flat_bundle/condition.parquet differ diff --git a/tests/data/invalid_flat_bundle/encounter.parquet b/tests/data/invalid_flat_bundle/encounter.parquet new file mode 100644 index 0000000..c743369 Binary files /dev/null and b/tests/data/invalid_flat_bundle/encounter.parquet differ diff --git a/tests/data/valid_flat_bundle/condition.parquet b/tests/data/valid_flat_bundle/condition.parquet new file mode 100644 index 0000000..f213484 Binary files /dev/null and b/tests/data/valid_flat_bundle/condition.parquet differ diff --git a/tests/data/valid_flat_bundle/encounter.parquet b/tests/data/valid_flat_bundle/encounter.parquet new file mode 100644 index 0000000..a1cdf7a 
Binary files /dev/null and b/tests/data/valid_flat_bundle/encounter.parquet differ
diff --git a/tests/data/valid_flat_bundle/patient.parquet b/tests/data/valid_flat_bundle/patient.parquet
new file mode 100644
index 0000000..c27c24b
Binary files /dev/null and b/tests/data/valid_flat_bundle/patient.parquet differ
diff --git a/tests/test_ingest.py b/tests/test_ingest.py
index b3eea81..aaa2cbe 100644
--- a/tests/test_ingest.py
+++ b/tests/test_ingest.py
@@ -9,6 +9,7 @@
     write_metadata,
     checksum,
     main,
+    validate,
 )
 from fhirflat.resources.encounter import Encounter
 from fhirflat.resources.observation import Observation
@@ -1225,3 +1226,31 @@ def test_convert_data_to_flat_local_mapping_errors():
     )
 
     shutil.rmtree(output_folder)
+
+
+def test_validate_valid(capsys):
+    folder = "tests/data/valid_flat_bundle"
+
+    validate(folder)
+
+    captured = capsys.readouterr()
+    assert "encounter.parquet is valid" in captured.out
+    assert "condition.parquet is valid" in captured.out
+    assert "patient.parquet is valid" in captured.out
+    assert "Validation complete" in captured.out
+
+
+def test_validate_invalid(capsys):
+    folder = "tests/data/invalid_flat_bundle"
+
+    validate(folder)
+
+    captured = capsys.readouterr()
+    assert "encounter.parquet have validation errors" in captured.out
+    assert "condition.parquet have validation errors" in captured.out
+    assert "Validation complete" in captured.out
+
+    Path.unlink(os.path.join(folder, "encounter_errors.csv"))
+    Path.unlink(os.path.join(folder, "encounter_valid.parquet"))
+    Path.unlink(os.path.join(folder, "condition_errors.csv"))
+    Path.unlink(os.path.join(folder, "condition_valid.parquet"))
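
For quick reference, a minimal usage sketch of the new validation entry points (not part of the diff itself). The folder path is the valid_flat_bundle fixture added above; only the uncompressed path is demonstrated because that is what the new tests exercise, and the -c/--compress_format option is described as documented by validate_cli rather than shown in use.

# Python API, mirroring test_validate_valid:
from fhirflat.ingest import validate

# Checks every *.parquet file in the folder against the FHIR resource type named
# by its file stem, printing a per-file result and "Validation complete" at the end.
# Files containing invalid rows are split into <resource>_valid.parquet and
# <resource>_errors.csv alongside the inputs.
validate("tests/data/valid_flat_bundle")

# CLI equivalent, via the subcommand dispatch added to __main__.py:
#   python -m fhirflat validate tests/data/valid_flat_bundle
# A compressed folder can be passed with -c/--compress_format
# (choices: zip, tar, gztar, bztar, xztar).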