diff --git a/.vscode/cspell.json b/.vscode/cspell.json index 8b6033538..efcb5b9f5 100644 --- a/.vscode/cspell.json +++ b/.vscode/cspell.json @@ -5,6 +5,7 @@ "chardet", "dpcr", "dropna", + "dtype", "ffill", "fillna", "flourescence", diff --git a/src/allotropy/parsers/lines_reader.py b/src/allotropy/parsers/lines_reader.py index bb5c4e8ce..7b70fd202 100644 --- a/src/allotropy/parsers/lines_reader.py +++ b/src/allotropy/parsers/lines_reader.py @@ -130,6 +130,12 @@ def drop_until_empty_inclusive( self.drop_until_empty(empty_pat) return self.pop() + def pop_while(self, match_pat: str) -> Iterator[str]: + while self.current_line_exists() and self.match(match_pat): + line = self.pop() + if line is not None: + yield line + def pop_until(self, match_pat: str) -> Iterator[str]: while self.current_line_exists() and not self.match(match_pat): line = self.pop() diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_parser.py b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_parser.py index 7c9d43855..8d010ca6f 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_parser.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_parser.py @@ -15,6 +15,7 @@ create_metadata, SpectroscopyRow, ) +from allotropy.parsers.utils.pandas import map_rows from allotropy.parsers.vendor_parser import VendorParser @@ -25,8 +26,8 @@ class Nanodrop8000Parser(VendorParser[Data, Model]): SCHEMA_MAPPER = Mapper def create_data(self, named_file_contents: NamedFileContents) -> Data: - data = Nanodrop8000Reader.read(named_file_contents) - rows = SpectroscopyRow.create_rows(data) + reader = Nanodrop8000Reader(named_file_contents) + rows = map_rows(reader.data, SpectroscopyRow.create) return Data( create_metadata(named_file_contents.original_file_name), diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_reader.py b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_reader.py index f8f08fe76..137642ee9 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_reader.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_reader.py @@ -10,19 +10,17 @@ class Nanodrop8000Reader: SUPPORTED_EXTENSIONS = "txt" + data: pd.DataFrame - @classmethod - def read(cls, named_file_contents: NamedFileContents) -> pd.DataFrame: + def __init__(self, named_file_contents: NamedFileContents): all_lines = lines_reader.read_to_lines(named_file_contents) reader = CsvReader(all_lines) lines = reader.pop_csv_block_as_lines() - raw_data = read_csv( + self.data = read_csv( StringIO("\n".join(lines)), sep="\t", dtype={"Plate ID": str, "Sample ID": str}, # Prevent pandas from rounding decimal values, at the cost of some speed. float_precision="round_trip", ) - raw_data = raw_data.rename(columns=lambda x: x.strip().lower()) - - return raw_data + self.data.columns = self.data.columns.str.strip().str.lower() diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_structure.py b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_structure.py index 71b5ae30a..e5d7ffd1a 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_structure.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_8000/nanodrop_8000_structure.py @@ -2,8 +2,6 @@ from dataclasses import dataclass -import pandas as pd - from allotropy.allotrope.models.shared.definitions.units import UNITLESS from allotropy.allotrope.schema_mappers.adm.spectrophotometry.benchling._2023._12.spectrophotometry import ( CalculatedDataItem, @@ -18,7 +16,7 @@ from allotropy.parsers.constants import NOT_APPLICABLE from allotropy.parsers.thermo_fisher_nanodrop_8000 import constants from allotropy.parsers.utils.iterables import get_first_not_none -from allotropy.parsers.utils.pandas import map_rows, SeriesData +from allotropy.parsers.utils.pandas import SeriesData from allotropy.parsers.utils.uuids import random_uuid_str @@ -190,11 +188,6 @@ def create(data: SeriesData) -> SpectroscopyRow: calculated_data, ) - @staticmethod - def create_rows(data: pd.DataFrame) -> list[SpectroscopyRow]: - data.columns = data.columns.str.lower() - return map_rows(data, SpectroscopyRow.create) - def create_metadata(file_name: str) -> Metadata: return Metadata( diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_parser.py b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_parser.py index 9d3f974a5..6d9d00b96 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_parser.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_parser.py @@ -1,3 +1,5 @@ +from functools import partial + from allotropy.allotrope.models.adm.spectrophotometry.benchling._2023._12.spectrophotometry import ( Model, ) @@ -15,6 +17,7 @@ create_metadata, SpectroscopyRow, ) +from allotropy.parsers.utils.pandas import map_rows from allotropy.parsers.vendor_parser import VendorParser @@ -26,13 +29,19 @@ class NanodropEightParser(VendorParser[Data, Model]): SCHEMA_MAPPER = Mapper def create_data(self, named_file_contents: NamedFileContents) -> Data: - data = NanodropEightReader.read(named_file_contents) - rows = SpectroscopyRow.create_rows(data) - metadata = create_metadata(named_file_contents.original_file_name, data) + reader = NanodropEightReader(named_file_contents) + rows = map_rows( + reader.data, partial(SpectroscopyRow.create, header=reader.header) + ) + metadata = create_metadata( + reader.header, named_file_contents.original_file_name + ) return Data( metadata=metadata, - measurement_groups=[create_measurement_group(row) for row in rows], + measurement_groups=[ + create_measurement_group(row, reader.header) for row in rows + ], # NOTE: in current implementation, calculated data is reported at global level for some reason. # TODO(nstender): should we move this inside of measurements? calculated_data=[item for row in rows for item in row.calculated_data], diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_reader.py b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_reader.py index 6b12a025c..40350ab18 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_reader.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_reader.py @@ -2,37 +2,42 @@ import pandas as pd +from allotropy.exceptions import AllotropeConversionError from allotropy.named_file_contents import NamedFileContents -from allotropy.parsers import lines_reader -from allotropy.parsers.lines_reader import CsvReader -from allotropy.parsers.utils.pandas import read_csv +from allotropy.parsers.lines_reader import CsvReader, read_to_lines +from allotropy.parsers.utils.pandas import read_csv, SeriesData class NanodropEightReader: SUPPORTED_EXTENSIONS = "txt,tsv" + header: SeriesData + data: pd.DataFrame - @classmethod - def read(cls, named_file_contents: NamedFileContents) -> pd.DataFrame: - all_lines = lines_reader.read_to_lines(named_file_contents) - reader = CsvReader(all_lines) + def __init__(self, named_file_contents: NamedFileContents) -> None: + reader = CsvReader(read_to_lines(named_file_contents)) - preamble = [*reader.pop_until(".*?Sample Name.*?")] + header_data = {} + # Header lines are expected to have a single 'key: value' pair, while table will have multiple + # tab-separated column headers. So, detect header lines as: + # : + for line in reader.pop_while(match_pat=r"^[^\t]*:[\t]*[^\t]*$"): + key, value = line.split(":") + header_data[key] = value.strip() + + header = pd.Series(header_data) + header.index = header.index.str.strip().str.lower() + self.header = SeriesData(header) lines = reader.pop_csv_block_as_lines() + if not lines: + msg = "Reached end of file without finding table data." + raise AllotropeConversionError(msg) - raw_data = read_csv( + self.data = read_csv( StringIO("\n".join(lines)), sep="\t", dtype={"Sample Name": str, "Sample ID": str}, # Prevent pandas from rounding decimal values, at the cost of some speed. float_precision="round_trip", ) - - for line in preamble: - key, val = line.split("\t") - key = key.replace(":", "").strip() - val = val.strip() - raw_data[key] = val - - raw_data = raw_data.rename(columns=lambda x: x.strip()) - return raw_data + self.data.columns = self.data.columns.str.lower() diff --git a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_structure.py b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_structure.py index b64b26e3a..5a028eb50 100644 --- a/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_structure.py +++ b/src/allotropy/parsers/thermo_fisher_nanodrop_eight/nanodrop_eight_structure.py @@ -2,8 +2,6 @@ from dataclasses import dataclass -import pandas as pd - from allotropy.allotrope.models.shared.definitions.definitions import ( FieldComponentDatatype, ) @@ -26,23 +24,22 @@ ) from allotropy.parsers.thermo_fisher_nanodrop_eight import constants from allotropy.parsers.utils.iterables import get_first_not_none -from allotropy.parsers.utils.pandas import map_rows, SeriesData +from allotropy.parsers.utils.pandas import SeriesData from allotropy.parsers.utils.uuids import random_uuid_str from allotropy.parsers.utils.values import try_float, try_float_or_none @dataclass class SpectroscopyRow: - analyst: str | None timestamp: str experiment_type: str | None measurements: list[Measurement] calculated_data: list[CalculatedDataItem] @staticmethod - def create(data: SeriesData) -> SpectroscopyRow: + def create(data: SeriesData, header: SeriesData) -> SpectroscopyRow: absorbances = read_absorbances(data) - experiment_type = data.get(str, "application") + experiment_type = header.get(str, "application") mass_concentration_capture_wavelength = ( read_mass_concentration_capture_wavelength( data, experiment_type, absorbances @@ -75,7 +72,9 @@ def create(data: SeriesData) -> SpectroscopyRow: measures=[list(spectra_data.values())], ) - sample_id = data.get(str, "sample id", NOT_APPLICABLE, SeriesData.NOT_NAN) + sample_id = data.get( + str, ["sample id", "uid"], NOT_APPLICABLE, SeriesData.NOT_NAN + ) location_id = data.get(str, "location") measurements: list[Measurement] = [] for wavelength, absorbance in absorbances.items(): @@ -118,33 +117,29 @@ def create(data: SeriesData) -> SpectroscopyRow: calculated_data = create_calculated_data(data, measurements) return SpectroscopyRow( - data.get(str, "user name"), - data[str, "date & time"], + data[str, ["date", "date & time"]], experiment_type, measurements, calculated_data, ) - @staticmethod - def create_rows(data: pd.DataFrame) -> list[SpectroscopyRow]: - data.columns = data.columns.str.lower() - return map_rows(data, SpectroscopyRow.create) - -def create_metadata(file_name: str, data: pd.DataFrame) -> Metadata: +def create_metadata(data: SeriesData, file_name: str) -> Metadata: return Metadata( device_identifier=constants.DEVICE_IDENTIFIER, device_type=constants.DEVICE_TYPE, model_number=constants.MODEL_NUBMER, - equipment_serial_number=data.iloc[0]["serial number"], + equipment_serial_number=data[str, "serial number"], file_name=file_name, ) -def create_measurement_group(row: SpectroscopyRow) -> MeasurementGroup: +def create_measurement_group( + row: SpectroscopyRow, header: SeriesData +) -> MeasurementGroup: return MeasurementGroup( measurement_time=row.timestamp, - analyst=row.analyst, + analyst=header.get(str, "user name"), experiment_type=row.experiment_type, measurements=row.measurements, ) diff --git a/tests/parsers/lines_reader_test.py b/tests/parsers/lines_reader_test.py index 966e0af39..e9e9e28ea 100644 --- a/tests/parsers/lines_reader_test.py +++ b/tests/parsers/lines_reader_test.py @@ -176,3 +176,8 @@ def test_reader_pop_until() -> None: def test_reader_pop_until_empty() -> None: test_reader = get_test_reader() assert list(test_reader.pop_until_empty()) == INPUT_LINES[:5] + + +def test_reader_pop_while() -> None: + lines = ["k1: v1", "k2 : v2", "Something else"] + assert list(LinesReader(lines).pop_while(":")) == lines[:2] diff --git a/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.json b/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.json new file mode 100644 index 000000000..93799e2d0 --- /dev/null +++ b/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.json @@ -0,0 +1,279 @@ +{ + "$asm.manifest": "http://purl.allotrope.org/manifests/spectrophotometry/BENCHLING/2023/12/spectrophotometry.manifest", + "spectrophotometry aggregate document": { + "spectrophotometry document": [ + { + "measurement aggregate document": { + "measurement time": "2023-11-09T14:19:00+00:00", + "measurement document": [ + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_0", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 260, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "6a2f673f-a80a-4bfe-b797-6ec1a26d76f4" + }, + "absorbance": { + "value": 17.62, + "unit": "mAU" + } + }, + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_1", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 280, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "6a2f673f-a80a-4bfe-b797-6ec1a26d76f4" + }, + "absorbance": { + "value": 8.51, + "unit": "mAU" + } + } + ], + "experiment type": "RNA" + } + }, + { + "measurement aggregate document": { + "measurement time": "2023-11-09T14:19:00+00:00", + "measurement document": [ + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_4", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 260, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "359b61c9-d124-4132-aaa1-ce85f22ae037" + }, + "absorbance": { + "value": 17.16, + "unit": "mAU" + } + }, + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_5", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 280, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "359b61c9-d124-4132-aaa1-ce85f22ae037" + }, + "absorbance": { + "value": 8.3, + "unit": "mAU" + } + } + ], + "experiment type": "RNA" + } + }, + { + "measurement aggregate document": { + "measurement time": "2023-11-09T14:20:00+00:00", + "measurement document": [ + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_8", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 260, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "5b85e508-31ee-4bc2-807a-ca6355594f3d" + }, + "absorbance": { + "value": 17.75, + "unit": "mAU" + } + }, + { + "measurement identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_9", + "device control aggregate document": { + "device control document": [ + { + "device type": "absorbance detector", + "detector wavelength setting": { + "value": 280, + "unit": "nm" + } + } + ] + }, + "sample document": { + "sample identifier": "5b85e508-31ee-4bc2-807a-ca6355594f3d" + }, + "absorbance": { + "value": 8.54, + "unit": "mAU" + } + } + ], + "experiment type": "RNA" + } + } + ], + "device system document": { + "device identifier": "Nanodrop", + "model number": "Nanodrop Eight", + "equipment serial number": "1234" + }, + "data system document": { + "file name": "thermo_nanodrop_eight_RNA.txt", + "ASM converter name": "allotropy_thermo_fisher_nanodrop_eight", + "ASM converter version": "0.1.61" + }, + "calculated data aggregate document": { + "calculated data document": [ + { + "calculated data name": "A260/280", + "calculated result": { + "value": 2.07, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_0", + "data source feature": "absorbance" + }, + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_1", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_2" + }, + { + "calculated data name": "A260/230", + "calculated result": { + "value": 2.19, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_0", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_3" + }, + { + "calculated data name": "A260/280", + "calculated result": { + "value": 2.07, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_4", + "data source feature": "absorbance" + }, + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_5", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_6" + }, + { + "calculated data name": "A260/230", + "calculated result": { + "value": 1.87, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_4", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_7" + }, + { + "calculated data name": "A260/280", + "calculated result": { + "value": 2.08, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_8", + "data source feature": "absorbance" + }, + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_9", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_10" + }, + { + "calculated data name": "A260/230", + "calculated result": { + "value": 2.0, + "unit": "(unitless)" + }, + "data source aggregate document": { + "data source document": [ + { + "data source identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_8", + "data source feature": "absorbance" + } + ] + }, + "calculated data identifier": "THERMO_FISHER_NANODROP_EIGHT_TEST_ID_11" + } + ] + } + } +} diff --git a/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.txt b/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.txt new file mode 100644 index 000000000..cb4d6472d --- /dev/null +++ b/tests/parsers/thermo_fisher_nanodrop_eight/testdata/thermo_nanodrop_eight_RNA.txt @@ -0,0 +1,6 @@ +Application: RNA +Serial number: 1234 +Date UID Sample Username ng/µL A260/A280 A260/A230 A260 A280 WeirdExtra: AndMore +11/09/2023 14:19 6a2f673f-a80a-4bfe-b797-6ec1a26d76f4 ALP096 PBL1 GN104564 704,986 2,07 2,19 17,62 8,51 Blah Blah +11/09/2023 14:19 359b61c9-d124-4132-aaa1-ce85f22ae037 ALP111 PBL1 GN104564 686,535 2,07 1,87 17,16 8,30 Blah Blah +11/09/2023 14:20 5b85e508-31ee-4bc2-807a-ca6355594f3d 651 PBL1 GN104564 710,130 2,08 2,00 17,75 8,54 Blah Blah