fix: don't load all of an i2b2 file into memory

The primary change in this commit is to stop loading i2b2 input files all at once, but rather stream them in, in chunks determined by the --batch-size parameter. But this commit also includes several small fixes: - Fixes location of MS tool during CI - Adds comma-formatting to a lot of progress-count prints - Continues ETL even if cTAKES can't process one message (just logs the error instead) - Changes default batch size from 10M to 200k. This works more reliably for small-memory (8G) machines. The previous number was optimized for the size of the resulting parquet files. This number is optimized for memory during the run, which feels like a safer default. - When using --input-format=ndjson and pointing at a local folder, we now still use a temporary folder and copy in just the resource ndjson files we want. This is to speed up the MS deid tool, so it doesn't have to read all possible ndjson inputs. - Add better progress messaging while reading i2b2 files.
smart-on-fhir · Dec 27, 2022 · 418e8bb · 418e8bb
1 parent f657110
commit 418e8bb
Show file tree

Hide file tree

Showing 14 changed files with 78 additions and 83 deletions.
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -36,7 +36,7 @@ jobs:
             --runtime=linux-x64 \
             --configuration=Release \
             -p:PublishSingleFile=true \
-            --output=~/.local/bin \
+            --output=$HOME/.local/bin \
             mstool/FHIR/src/Microsoft.Health.Fhir.Anonymizer.R4.CommandLineTool
 
       - name: Test with pytest

diff --git a/cumulus/common.py b/cumulus/common.py
@@ -9,7 +9,6 @@
 from urllib.parse import urlparse
 
 import fsspec
-import pandas
 from fhirclient.models.resource import Resource
 from fhirclient.models.fhirabstractbase import FHIRAbstractBase
 
@@ -58,20 +57,6 @@ def find_by_name(folder, path_contains='filemask', progress_bar=1000) -> list:
     return found
 
 
-def extract_csv(path_csv: str, sample=1.0) -> pandas.DataFrame:
-    """
-    :param path_csv: /path/to/file.csv
-    :param sample: %percentage of file to read
-    :return: pandas Dataframe
-    """
-    logging.info('Reading csv %s ...', path_csv)
-    df = pandas.read_csv(path_csv, dtype=str, na_filter=False)
-    if sample != 1.0:
-        df = df.sample(frac=sample)
-    logging.info('Done reading %s .', path_csv)
-    return df
-
-
 def fake_id(category: str) -> str:
     """
     Randomly generate a linked Patient identifier

diff --git a/cumulus/config.py b/cumulus/config.py
@@ -87,7 +87,7 @@ def success_rate(self, show_every=1000 * 10) -> float:
         prct = float(self.success) / float(self.attempt)
 
         if 0 == self.attempt % show_every:
-            print(f'success = {self.success} rate % {prct}')
+            print(f'success = {self.success:,} rate % {prct}')
 
         return prct
 

diff --git a/cumulus/ctakes.py b/cumulus/ctakes.py
@@ -42,7 +42,11 @@ def symptoms(cache: store.Root, docref: DocumentReference) -> List[Observation]:
         logging.warning('No text/plain content for symptoms')  # ideally would print identifier, but it's PHI...
         return []
 
-    ctakes_json = extract(cache, physician_note)
+    try:
+        ctakes_json = extract(cache, physician_note)
+    except Exception as exc:  # pylint: disable=broad-except
+        logging.error('Could not extract symptoms: %s', exc)
+        return []
 
     observations = []
     for match in ctakes_json.list_sign_symptom(ctakesclient.typesystem.Polarity.pos):

diff --git a/cumulus/deid/mstool.py b/cumulus/deid/mstool.py
@@ -8,7 +8,7 @@
 import subprocess  # nosec: B404
 import sys
 
-from cumulus import errors
+from cumulus import common, errors
 
 MSTOOL_CMD = 'Microsoft.Health.Fhir.Anonymizer.R4.CommandLineTool'
 
@@ -23,6 +23,7 @@ def run_mstool(input_dir: str, output_dir: str) -> None:
 
     The input must be in ndjson format. And the output will be as well.
     """
+    common.print_header('De-identifying data...')
     try:
         # The following call only points at some temporary directory names (which we generate),
         # so it should be safe, and we thus disable the security linter warning about validating inputs.

diff --git a/cumulus/etl.py b/cumulus/etl.py
@@ -366,9 +366,9 @@ def main(args: List[str]):
                         help='input format (default is ndjson)')
     parser.add_argument('--output-format', default='parquet', choices=['json', 'ndjson', 'parquet'],
                         help='output format (default is parquet)')
-    parser.add_argument('--batch-size', type=int, metavar='SIZE', default=10000000,
+    parser.add_argument('--batch-size', type=int, metavar='SIZE', default=200000,
                         help='how many entries to process at once and thus '
-                             'how many to put in one output file (default is 10M)')
+                             'how many to put in one output file (default is 200k)')
     parser.add_argument('--comment', help='add the comment to the log file')
     parser.add_argument('--s3-region', help='if using S3 paths (s3://...), this is their region')
     parser.add_argument('--s3-kms-key', help='if using S3 paths (s3://...), this is the KMS key ID to use')
@@ -397,7 +397,7 @@ def main(args: List[str]):
     job_datetime = common.datetime_now()  # grab timestamp before we do anything
 
     if args.input_format == 'i2b2':
-        config_loader = loaders.I2b2Loader(root_input)
+        config_loader = loaders.I2b2Loader(root_input, args.batch_size)
     else:
         config_loader = loaders.FhirNdjsonLoader(root_input, client_id=args.smart_client_id, jwks=args.smart_jwks)
 

diff --git a/cumulus/loaders/fhir/fhir_ndjson.py b/cumulus/loaders/fhir/fhir_ndjson.py
@@ -35,16 +35,18 @@ def load_all(self, resources: List[str]) -> tempfile.TemporaryDirectory:
         if self.root.protocol in ['http', 'https']:
             return self._load_from_bulk_export(resources)
 
-        # Are we reading from a local directory?
-        if self.root.protocol == 'file':
-            # We can actually just re-use the input dir without copying the files, since everything is local.
-            class Dir:
-                name: str = self.root.path
-            return Dir()  # once we drop python3.7, we can have load_all return a Protocol for proper typing
-
-        # Fall back to copying from a remote directory (like S3 buckets) to a local one
+        # Copy the resources we need from the remote directory (like S3 buckets) to a local one.
+        #
+        # We do this even if the files are local, because the next step in our pipeline is the MS deid tool,
+        # and it will just process *everything* in a directory. So if there are other *.ndjson sitting next to our
+        # target resources, they'll get processed by the MS tool and that slows down running a single task with
+        # "--task" a lot.
+        #
+        # This uses more disk space temporarily (copied files will get deleted once the MS tool is done and this
+        # TemporaryDirectory gets discarded), but that seems reasonable.
         tmpdir = tempfile.TemporaryDirectory()  # pylint: disable=consider-using-with
-        self.root.get(self.root.joinpath('*.ndjson'), f'{tmpdir.name}/')
+        for resource in resources:
+            self.root.get(self.root.joinpath(f'*{resource}*.ndjson'), f'{tmpdir.name}/')
         return tmpdir
 
     def _load_from_bulk_export(self, resources: List[str]) -> tempfile.TemporaryDirectory:

diff --git a/cumulus/loaders/i2b2/extract.py b/cumulus/loaders/i2b2/extract.py
@@ -1,68 +1,58 @@
 """Read files into data structures"""
 
-from typing import List
 import logging
+from typing import Iterator
+
 import pandas
-from cumulus import common
+
 from cumulus.loaders.i2b2.schema import ObservationFact, PatientDimension, VisitDimension
 
 
-def extract_csv(path_csv: str, sample=1.0) -> pandas.DataFrame:
+def extract_csv(path_csv: str, batch_size: int) -> Iterator[dict]:
     """
     :param path_csv: /path/to/i2b2_formatted_file.csv
-    :param sample: %percentage of file to read
-    :return: pandas Dataframe
+    :param batch_size: how many entries to load into memory at once
+    :return: an iterator over each row from the file
     """
-    return common.extract_csv(path_csv, sample)
+    print(f'Reading csv {path_csv}...')
+    count = 0
+    with pandas.read_csv(path_csv, dtype=str, na_filter=False, chunksize=batch_size) as reader:
+        for chunk in reader:
+            print(f'  Read {count:,} entries...')
+            for _, row in chunk.iterrows():
+                yield dict(row)
+            count += batch_size
+    print(f'Done reading {path_csv} .')
 
 
-def extract_csv_observation_facts(path_csv: str,
-                                  sample=1.0) -> List[ObservationFact]:
+def extract_csv_observation_facts(path_csv: str, batch_size: int) -> Iterator[ObservationFact]:
     """
     :param path_csv: /path/to/file.csv
-    :param sample: %percentage of file to read
+    :param batch_size: how many entries to load into memory at once
     :return: i2b2 ObservationFact table
     """
-    df = extract_csv(path_csv, sample)
-
     logging.info('Transforming text into List[ObservationFact]')
-    facts = []
-    for _, row in df.iterrows():
-        facts.append(ObservationFact(row))
-
-    logging.info('Ready List[ObservationFact]')
-    return facts
+    for row in extract_csv(path_csv, batch_size):
+        yield ObservationFact(row)
 
 
-def extract_csv_patients(path_csv: str, sample=1.0) -> List[PatientDimension]:
+def extract_csv_patients(path_csv: str, batch_size: int) -> Iterator[PatientDimension]:
     """
     :param path_csv: /path/to/file.csv
-    :param sample: %percentage of file to read
+    :param batch_size: how many entries to load into memory at once
     :return: List i2b2 patient dimension table
     """
-    df = extract_csv(path_csv, sample)
-
     logging.info('Transforming text into List[PatientDimension]')
-    patients = []
-    for _, row in df.iterrows():
-        patients.append(PatientDimension(row))
-
-    logging.info('Ready List[PatientDimension]')
-    return patients
+    for row in extract_csv(path_csv, batch_size):
+        yield PatientDimension(row)
 
 
-def extract_csv_visits(path_csv: str, sample=1.0) -> List[VisitDimension]:
+def extract_csv_visits(path_csv: str, batch_size: int) -> Iterator[VisitDimension]:
     """
     :param path_csv: /path/to/file.csv
-    :param sample: %percentage of file to read
+    :param batch_size: how many entries to load into memory at once
     :return: List i2b2 visit dimension table
     """
-    df = extract_csv(path_csv, sample)
-
     logging.info('Transforming text into List[VisitDimension]')
-    visits = []
-    for _, row in df.iterrows():
-        visits.append(VisitDimension(row))
-
-    logging.info('Ready List[VisitDimension]')
-    return visits
+    for row in extract_csv(path_csv, batch_size):
+        yield VisitDimension(row)
diff --git a/cumulus/loaders/i2b2/loader.py b/cumulus/loaders/i2b2/loader.py
@@ -8,6 +8,7 @@
 
 from fhirclient.models.resource import Resource
 
+from cumulus import store
 from cumulus.loaders.base import Loader
 from cumulus.loaders.i2b2 import extract, schema, transform
 from cumulus.loaders.i2b2.oracle import extract as oracle_extract
@@ -31,6 +32,15 @@ class I2b2Loader(Loader):
     - csv_visit
     """
 
+    def __init__(self, root: store.Root, batch_size: int):
+        """
+        Initialize a new I2b2Loader class
+        :param root: the base location to read data from
+        :param batch_size: the most entries to keep in memory at once
+        """
+        super().__init__(root)
+        self.batch_size = batch_size
+
     def load_all(self, resources: List[str]) -> tempfile.TemporaryDirectory:
         if self.root.protocol in ['tcp']:
             return self._load_all_from_oracle(resources)
@@ -117,13 +127,14 @@ def _load_all_from_csv(self, resources: List[str]) -> tempfile.TemporaryDirector
         return self._load_all_with_extractors(
             resources,
             conditions=partial(extract.extract_csv_observation_facts,
-                               os.path.join(path, 'observation_fact_diagnosis.csv')),
+                               os.path.join(path, 'observation_fact_diagnosis.csv'), self.batch_size),
             observations=partial(extract.extract_csv_observation_facts,
-                                 os.path.join(path, 'observation_fact_lab_views.csv')),
+                                 os.path.join(path, 'observation_fact_lab_views.csv'), self.batch_size),
             documentreferences=partial(extract.extract_csv_observation_facts,
-                                       os.path.join(path, 'observation_fact_notes.csv')),
-            patients=partial(extract.extract_csv_patients, os.path.join(path, 'patient_dimension.csv')),
-            encounters=partial(extract.extract_csv_visits, os.path.join(path, 'visit_dimension.csv')),
+                                       os.path.join(path, 'observation_fact_notes.csv'), self.batch_size),
+            patients=partial(extract.extract_csv_patients, os.path.join(path, 'patient_dimension.csv'),
+                             self.batch_size),
+            encounters=partial(extract.extract_csv_visits, os.path.join(path, 'visit_dimension.csv'), self.batch_size),
         )
 
     ###################################################################################################################

diff --git a/cumulus/loaders/i2b2/resources/external_mappings.py b/cumulus/loaders/i2b2/resources/external_mappings.py
@@ -51,9 +51,13 @@
 
 
 # PHIN VADS Admission statuses {308277006, SNOMED-CT }
-# Subset of codes related to means of admition
+# Subset of codes related to means of admission
 # system: http://snomed.info/sct
+# Other terms seen:
+# - "Recurring Outpatient Series" (AMB?)
+# - "Day Surgery" (AMB?)
 SNOMED_ADMISSION = {
+    'Emergency': 'EMER',
     'Inpatient': 'IMP',
-    'Emergency': 'EMER'
+    'Outpatient': 'AMB',
 }
diff --git a/cumulus/loaders/i2b2/transform.py b/cumulus/loaders/i2b2/transform.py
@@ -115,9 +115,7 @@ def to_fhir_encounter(visit: VisitDimension) -> Encounter:
             'code': external_mappings.SNOMED_ADMISSION.get(visit.inout_cd)
         })
     else:
-        logging.warning(
-            'skipping encounter.class_fhir.code for i2b2 '
-            'INOUT_CD : %s', visit.inout_cd)
+        logging.debug('skipping encounter.class_fhir.code for i2b2 INOUT_CD : %s', visit.inout_cd)
     return encounter
 
 

diff --git a/docs/howtos/run-cumulus-etl.md b/docs/howtos/run-cumulus-etl.md
@@ -179,7 +179,7 @@ docker compose -f $CUMULUS_REPO_PATH/compose.yaml run --rm\
   --comment="Any interesting logging data you like, like which user launched this" \
   --input-format=ndjson \
   --output-format=parquet \
-  --batch-size=10000000 \
+  --batch-size=200000 \
   --s3-region=us-east-2 \
   s3://my-us-east-2-input-bucket/ \
   s3://my-cumulus-prefix-99999999999-us-east-2/subdir1/ \

diff --git a/tests/test_etl.py b/tests/test_etl.py
@@ -204,7 +204,7 @@ def test_unknown_task(self):
 
     def test_single_task(self):
         # Grab all observations before we mock anything
-        observations = loaders.I2b2Loader(store.Root(self.input_path)).load_all(['Observation'])
+        observations = loaders.I2b2Loader(store.Root(self.input_path), 5).load_all(['Observation'])
 
         def fake_load_all(internal_self, resources):
             del internal_self
@@ -221,7 +221,7 @@ def fake_load_all(internal_self, resources):
 
     def test_multiple_tasks(self):
         # Grab all observations before we mock anything
-        loaded = loaders.I2b2Loader(store.Root(self.input_path)).load_all(['Observation', 'Patient'])
+        loaded = loaders.I2b2Loader(store.Root(self.input_path), 5).load_all(['Observation', 'Patient'])
 
         def fake_load_all(internal_self, resources):
             del internal_self
@@ -439,7 +439,7 @@ def test_stores_cached_json(self):
         self.run_etl(output_format='parquet')
 
         notes_csv_path = os.path.join(self.input_path, 'observation_fact_notes.csv')
-        facts = extract.extract_csv_observation_facts(notes_csv_path)
+        facts = list(extract.extract_csv_observation_facts(notes_csv_path, 5))
 
         for index, checksum in enumerate(self.expected_checksums):
             self.assertEqual(

diff --git a/tests/test_i2b2_oracle_extract.py b/tests/test_i2b2_oracle_extract.py
@@ -79,7 +79,7 @@ def test_loader(self, mock_extract):
         mock_extract.list_visit.return_value = [i2b2_mock_data.encounter_dim()]
 
         root = store.Root('tcp://localhost/foo')
-        oracle_loader = loader.I2b2Loader(root)
+        oracle_loader = loader.I2b2Loader(root, 5)
         tmpdir = oracle_loader.load_all(['Condition', 'Encounter', 'Patient'])
 
         # Check results