-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
fix: don't load all of an i2b2 file into memory
The primary change in this commit is to stop loading i2b2 input files all at once and instead stream them in chunks, with the chunk size determined by the --batch-size parameter.

This commit also includes several small fixes:
- Fixes the location of the MS tool during CI.
- Adds comma-formatting to a lot of progress-count prints.
- Continues the ETL even if cTAKES can't process one message (it just logs the error instead).
- Changes the default batch size from 10M to 200k. This works more reliably on small-memory (8G) machines. The previous number was optimized for the size of the resulting parquet files; the new number is optimized for memory use during the run, which feels like a safer default.
- When using --input-format=ndjson and pointing at a local folder, we now still use a temporary folder and copy in just the resource ndjson files we want. This speeds up the MS deid tool, because it no longer has to read all possible ndjson inputs.
- Adds better progress messaging while reading i2b2 files.
- Loading branch information
Showing 14 changed files with 78 additions and 83 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,68 +1,58 @@ | ||
"""Read files into data structures""" | ||
|
||
from typing import List | ||
import logging | ||
from typing import Iterator | ||
|
||
import pandas | ||
from cumulus import common | ||
|
||
from cumulus.loaders.i2b2.schema import ObservationFact, PatientDimension, VisitDimension | ||
|
||
|
||
def extract_csv(path_csv: str, sample=1.0) -> pandas.DataFrame: | ||
def extract_csv(path_csv: str, batch_size: int) -> Iterator[dict]: | ||
""" | ||
:param path_csv: /path/to/i2b2_formatted_file.csv | ||
:param sample: %percentage of file to read | ||
:return: pandas Dataframe | ||
:param batch_size: how many entries to load into memory at once | ||
:return: an iterator over each row from the file | ||
""" | ||
return common.extract_csv(path_csv, sample) | ||
print(f'Reading csv {path_csv}...') | ||
count = 0 | ||
with pandas.read_csv(path_csv, dtype=str, na_filter=False, chunksize=batch_size) as reader: | ||
for chunk in reader: | ||
print(f' Read {count:,} entries...') | ||
for _, row in chunk.iterrows(): | ||
yield dict(row) | ||
count += batch_size | ||
print(f'Done reading {path_csv} .') | ||
|
||
|
||
def extract_csv_observation_facts(path_csv: str, | ||
sample=1.0) -> List[ObservationFact]: | ||
def extract_csv_observation_facts(path_csv: str, batch_size: int) -> Iterator[ObservationFact]: | ||
""" | ||
:param path_csv: /path/to/file.csv | ||
:param sample: %percentage of file to read | ||
:param batch_size: how many entries to load into memory at once | ||
:return: i2b2 ObservationFact table | ||
""" | ||
df = extract_csv(path_csv, sample) | ||
|
||
logging.info('Transforming text into List[ObservationFact]') | ||
facts = [] | ||
for _, row in df.iterrows(): | ||
facts.append(ObservationFact(row)) | ||
|
||
logging.info('Ready List[ObservationFact]') | ||
return facts | ||
for row in extract_csv(path_csv, batch_size): | ||
yield ObservationFact(row) | ||
|
||
|
||
def extract_csv_patients(path_csv: str, sample=1.0) -> List[PatientDimension]: | ||
def extract_csv_patients(path_csv: str, batch_size: int) -> Iterator[PatientDimension]: | ||
""" | ||
:param path_csv: /path/to/file.csv | ||
:param sample: %percentage of file to read | ||
:param batch_size: how many entries to load into memory at once | ||
:return: List i2b2 patient dimension table | ||
""" | ||
df = extract_csv(path_csv, sample) | ||
|
||
logging.info('Transforming text into List[PatientDimension]') | ||
patients = [] | ||
for _, row in df.iterrows(): | ||
patients.append(PatientDimension(row)) | ||
|
||
logging.info('Ready List[PatientDimension]') | ||
return patients | ||
for row in extract_csv(path_csv, batch_size): | ||
yield PatientDimension(row) | ||
|
||
|
||
def extract_csv_visits(path_csv: str, sample=1.0) -> List[VisitDimension]: | ||
def extract_csv_visits(path_csv: str, batch_size: int) -> Iterator[VisitDimension]: | ||
""" | ||
:param path_csv: /path/to/file.csv | ||
:param sample: %percentage of file to read | ||
:param batch_size: how many entries to load into memory at once | ||
:return: List i2b2 visit dimension table | ||
""" | ||
df = extract_csv(path_csv, sample) | ||
|
||
logging.info('Transforming text into List[VisitDimension]') | ||
visits = [] | ||
for _, row in df.iterrows(): | ||
visits.append(VisitDimension(row)) | ||
|
||
logging.info('Ready List[VisitDimension]') | ||
return visits | ||
for row in extract_csv(path_csv, batch_size): | ||
yield VisitDimension(row) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters