Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Thermo Fisher NanoDrop Eight - handle alternative column names #731

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .vscode/cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"chardet",
"dpcr",
"dropna",
"dtype",
"ffill",
"fillna",
"flourescence",
Expand Down
6 changes: 6 additions & 0 deletions src/allotropy/parsers/lines_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,12 @@ def drop_until_empty_inclusive(
self.drop_until_empty(empty_pat)
return self.pop()

def pop_while(self, match_pat: str) -> Iterator[str]:
    """Pop and yield leading lines for as long as they match ``match_pat``.

    The generator stops at the first line for which ``self.match`` is falsy
    (that line is not popped) or as soon as no current line exists. Lines
    are popped lazily, one per generator step.
    """
    while True:
        if not self.current_line_exists():
            return
        if not self.match(match_pat):
            return
        popped = self.pop()
        # A ``None`` result from ``pop`` is consumed but never yielded.
        if popped is None:
            continue
        yield popped

def pop_until(self, match_pat: str) -> Iterator[str]:
while self.current_line_exists() and not self.match(match_pat):
line = self.pop()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
create_metadata,
SpectroscopyRow,
)
from allotropy.parsers.utils.pandas import map_rows
from allotropy.parsers.vendor_parser import VendorParser


Expand All @@ -25,8 +26,8 @@ class Nanodrop8000Parser(VendorParser[Data, Model]):
SCHEMA_MAPPER = Mapper

def create_data(self, named_file_contents: NamedFileContents) -> Data:
data = Nanodrop8000Reader.read(named_file_contents)
rows = SpectroscopyRow.create_rows(data)
reader = Nanodrop8000Reader(named_file_contents)
rows = map_rows(reader.data, SpectroscopyRow.create)

return Data(
create_metadata(named_file_contents.original_file_name),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,19 +10,17 @@

class Nanodrop8000Reader:
SUPPORTED_EXTENSIONS = "txt"
data: pd.DataFrame

@classmethod
def read(cls, named_file_contents: NamedFileContents) -> pd.DataFrame:
def __init__(self, named_file_contents: NamedFileContents):
all_lines = lines_reader.read_to_lines(named_file_contents)
reader = CsvReader(all_lines)
lines = reader.pop_csv_block_as_lines()
raw_data = read_csv(
self.data = read_csv(
StringIO("\n".join(lines)),
sep="\t",
dtype={"Plate ID": str, "Sample ID": str},
# Prevent pandas from rounding decimal values, at the cost of some speed.
float_precision="round_trip",
)
raw_data = raw_data.rename(columns=lambda x: x.strip().lower())

return raw_data
self.data.columns = self.data.columns.str.strip().str.lower()
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from dataclasses import dataclass

import pandas as pd

from allotropy.allotrope.models.shared.definitions.units import UNITLESS
from allotropy.allotrope.schema_mappers.adm.spectrophotometry.benchling._2023._12.spectrophotometry import (
CalculatedDataItem,
Expand All @@ -18,7 +16,7 @@
from allotropy.parsers.constants import NOT_APPLICABLE
from allotropy.parsers.thermo_fisher_nanodrop_8000 import constants
from allotropy.parsers.utils.iterables import get_first_not_none
from allotropy.parsers.utils.pandas import map_rows, SeriesData
from allotropy.parsers.utils.pandas import SeriesData
from allotropy.parsers.utils.uuids import random_uuid_str


Expand Down Expand Up @@ -190,11 +188,6 @@ def create(data: SeriesData) -> SpectroscopyRow:
calculated_data,
)

@staticmethod
def create_rows(data: pd.DataFrame) -> list[SpectroscopyRow]:
data.columns = data.columns.str.lower()
return map_rows(data, SpectroscopyRow.create)


def create_metadata(file_name: str) -> Metadata:
return Metadata(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from functools import partial

from allotropy.allotrope.models.adm.spectrophotometry.benchling._2023._12.spectrophotometry import (
Model,
)
Expand All @@ -15,6 +17,7 @@
create_metadata,
SpectroscopyRow,
)
from allotropy.parsers.utils.pandas import map_rows
from allotropy.parsers.vendor_parser import VendorParser


Expand All @@ -26,13 +29,19 @@ class NanodropEightParser(VendorParser[Data, Model]):
SCHEMA_MAPPER = Mapper

def create_data(self, named_file_contents: NamedFileContents) -> Data:
data = NanodropEightReader.read(named_file_contents)
rows = SpectroscopyRow.create_rows(data)
metadata = create_metadata(named_file_contents.original_file_name, data)
reader = NanodropEightReader(named_file_contents)
rows = map_rows(
reader.data, partial(SpectroscopyRow.create, header=reader.header)
)
metadata = create_metadata(
reader.header, named_file_contents.original_file_name
)

return Data(
metadata=metadata,
measurement_groups=[create_measurement_group(row) for row in rows],
measurement_groups=[
create_measurement_group(row, reader.header) for row in rows
],
# NOTE: in current implementation, calculated data is reported at global level for some reason.
# TODO(nstender): should we move this inside of measurements?
calculated_data=[item for row in rows for item in row.calculated_data],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,37 +2,42 @@

import pandas as pd

from allotropy.exceptions import AllotropeConversionError
from allotropy.named_file_contents import NamedFileContents
from allotropy.parsers import lines_reader
from allotropy.parsers.lines_reader import CsvReader
from allotropy.parsers.utils.pandas import read_csv
from allotropy.parsers.lines_reader import CsvReader, read_to_lines
from allotropy.parsers.utils.pandas import read_csv, SeriesData


class NanodropEightReader:
    """Split a NanoDrop Eight export into header key/value pairs and a table.

    Instantiating the reader parses the file contents; the results are
    exposed through the ``header`` and ``data`` attributes.
    """

    SUPPORTED_EXTENSIONS = "txt,tsv"
    # Leading 'key: value' lines, keyed by the stripped, lower-cased key.
    header: SeriesData
    # Tab-separated table that follows the header, with stripped,
    # lower-cased column names.
    data: pd.DataFrame

    def __init__(self, named_file_contents: NamedFileContents) -> None:
        """Parse the header lines and the measurement table.

        Raises:
            AllotropeConversionError: if no table lines follow the header.
        """
        reader = CsvReader(read_to_lines(named_file_contents))

        header_data: dict[str, str] = {}
        # Header lines are expected to have a single 'key: value' pair, while table will have multiple
        # tab-separated column headers. So, detect header lines as:
        # <anything but a tab>:<any number of tabs><anything but a tab>
        for line in reader.pop_while(match_pat=r"^[^\t]*:[\t]*[^\t]*$"):
            # Split on the first colon only: the pattern above also accepts
            # values that themselves contain ':' (e.g. a time of day), and
            # unpacking an unbounded split would raise ValueError on them.
            key, _, value = line.partition(":")
            # Normalize keys here instead of through ``Series.index.str``;
            # NOTE(review): with no header lines pandas may build a
            # non-string index on which ``.str`` is unavailable - confirm
            # against the pinned pandas version.
            header_data[key.strip().lower()] = value.strip()

        self.header = SeriesData(pd.Series(header_data))

        lines = reader.pop_csv_block_as_lines()
        if not lines:
            msg = "Reached end of file without finding table data."
            raise AllotropeConversionError(msg)

        self.data = read_csv(
            StringIO("\n".join(lines)),
            sep="\t",
            dtype={"Sample Name": str, "Sample ID": str},
            # Prevent pandas from rounding decimal values, at the cost of some speed.
            float_precision="round_trip",
        )

        # Strip as well as lower-case so lookups are not defeated by padded
        # column names, matching Nanodrop8000Reader.
        self.data.columns = self.data.columns.str.strip().str.lower()
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@

from dataclasses import dataclass

import pandas as pd

from allotropy.allotrope.models.shared.definitions.definitions import (
FieldComponentDatatype,
)
Expand All @@ -26,23 +24,22 @@
)
from allotropy.parsers.thermo_fisher_nanodrop_eight import constants
from allotropy.parsers.utils.iterables import get_first_not_none
from allotropy.parsers.utils.pandas import map_rows, SeriesData
from allotropy.parsers.utils.pandas import SeriesData
from allotropy.parsers.utils.uuids import random_uuid_str
from allotropy.parsers.utils.values import try_float, try_float_or_none


@dataclass
class SpectroscopyRow:
analyst: str | None
timestamp: str
experiment_type: str | None
measurements: list[Measurement]
calculated_data: list[CalculatedDataItem]

@staticmethod
def create(data: SeriesData) -> SpectroscopyRow:
def create(data: SeriesData, header: SeriesData) -> SpectroscopyRow:
absorbances = read_absorbances(data)
experiment_type = data.get(str, "application")
experiment_type = header.get(str, "application")
mass_concentration_capture_wavelength = (
read_mass_concentration_capture_wavelength(
data, experiment_type, absorbances
Expand Down Expand Up @@ -75,7 +72,9 @@ def create(data: SeriesData) -> SpectroscopyRow:
measures=[list(spectra_data.values())],
)

sample_id = data.get(str, "sample id", NOT_APPLICABLE, SeriesData.NOT_NAN)
sample_id = data.get(
str, ["sample id", "uid"], NOT_APPLICABLE, SeriesData.NOT_NAN
)
location_id = data.get(str, "location")
measurements: list[Measurement] = []
for wavelength, absorbance in absorbances.items():
Expand Down Expand Up @@ -118,33 +117,29 @@ def create(data: SeriesData) -> SpectroscopyRow:
calculated_data = create_calculated_data(data, measurements)

return SpectroscopyRow(
data.get(str, "user name"),
data[str, "date & time"],
data[str, ["date", "date & time"]],
experiment_type,
measurements,
calculated_data,
)

@staticmethod
def create_rows(data: pd.DataFrame) -> list[SpectroscopyRow]:
data.columns = data.columns.str.lower()
return map_rows(data, SpectroscopyRow.create)


def create_metadata(file_name: str, data: pd.DataFrame) -> Metadata:
def create_metadata(data: SeriesData, file_name: str) -> Metadata:
return Metadata(
device_identifier=constants.DEVICE_IDENTIFIER,
device_type=constants.DEVICE_TYPE,
model_number=constants.MODEL_NUBMER,
equipment_serial_number=data.iloc[0]["serial number"],
equipment_serial_number=data[str, "serial number"],
file_name=file_name,
)


def create_measurement_group(row: SpectroscopyRow) -> MeasurementGroup:
def create_measurement_group(
row: SpectroscopyRow, header: SeriesData
) -> MeasurementGroup:
return MeasurementGroup(
measurement_time=row.timestamp,
analyst=row.analyst,
analyst=header.get(str, "user name"),
experiment_type=row.experiment_type,
measurements=row.measurements,
)
5 changes: 5 additions & 0 deletions tests/parsers/lines_reader_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,3 +176,8 @@ def test_reader_pop_until() -> None:
def test_reader_pop_until_empty() -> None:
test_reader = get_test_reader()
assert list(test_reader.pop_until_empty()) == INPUT_LINES[:5]


def test_reader_pop_while() -> None:
    """pop_while returns only the leading run of lines matching the pattern."""
    source_lines = ["k1: v1", "k2 : v2", "Something else"]
    reader = LinesReader(source_lines)
    popped = list(reader.pop_while(":"))
    assert popped == source_lines[:2]
Loading
Loading