Merge pull request #89 from smart-on-fhir/mikix/acidlake
feat: add Delta Lake support
Showing 31 changed files with 250 additions and 46 deletions.
@@ -1,5 +1,6 @@
"""Classes that know _how_ to write out results to the target folder""" | ||
|
||
from .deltalake import DeltaLakeFormat | ||
from .json_tree import JsonTreeFormat | ||
from .ndjson import NdjsonFormat | ||
from .parquet import ParquetFormat |
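The new package-level import above is what exposes the Delta Lake writer alongside the other formats. As a rough usage sketch, modeled on how the test suite below constructs it (the output path here is just a placeholder):

from cumulus import formats, store

# Placeholder output location; the tests below use a temporary directory instead.
# s3:// roots are also handled (write_records swaps in the s3a: scheme for hadoop).
output_format = formats.DeltaLakeFormat(store.Root('/tmp/cumulus-output'))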
deltalake.py
@@ -0,0 +1,111 @@
""" | ||
An implementation of Format that writes to a Delta Lake. | ||
See https://delta.io/ | ||
""" | ||
|
||
import contextlib | ||
import logging | ||
import os | ||
|
||
import delta | ||
import pandas | ||
import pyspark | ||
from pyspark.sql.utils import AnalysisException | ||
|
||
from cumulus import store | ||
|
||
from .athena import AthenaFormat | ||
|
||
# This class would be a lot simpler if we could use fsspec & pandas directly, since that's what the rest of our code | ||
# uses and expects (in terms of filesystem writing). | ||
# | ||
# There is a 1st party Delta Lake implementation (`deltalake`) based off native Rust code and which talks to | ||
# fsspec & pandas by default. But it is missing some critical features as of this writing (mostly merges): | ||
# - Merge support in deltalake bindings: https://github.com/delta-io/delta-rs/issues/850 | ||
|
||
|
||
@contextlib.contextmanager | ||
def _suppress_output(): | ||
""" | ||
Totally hides stdout and stderr unless there is an error, and then stderr is printed. | ||
This is a more powerful version of contextlib.redirect_stdout that also works for subprocesses / threads. | ||
""" | ||
stdout = os.dup(1) | ||
stderr = os.dup(2) | ||
silent = os.open(os.devnull, os.O_WRONLY) | ||
os.dup2(silent, 1) | ||
os.dup2(silent, 2) | ||
|
||
try: | ||
yield | ||
finally: | ||
os.dup2(stdout, 1) | ||
os.dup2(stderr, 2) | ||
|
||
|
||
class DeltaLakeFormat(AthenaFormat): | ||
""" | ||
Stores data in a delta lake. | ||
""" | ||
def __init__(self, root: store.Root): | ||
super().__init__(root) | ||
|
||
# This _suppress_output call is because pyspark is SO NOISY during session creation. Like 40 lines of trivial | ||
# output. Progress reports of downloading the jars. Comments about default logging level and the hostname. | ||
# I could not find a way to set the log level before the session is created. So here we just suppress | ||
# stdout/stderr entirely. | ||
with _suppress_output(): | ||
# Prep the builder with various config options | ||
builder = pyspark.sql.SparkSession.builder \ | ||
.appName('cumulus-etl') \ | ||
.config('spark.driver.memory', '2g') \ | ||
.config('spark.sql.catalog.spark_catalog', 'org.apache.spark.sql.delta.catalog.DeltaCatalog') \ | ||
.config('spark.sql.extensions', 'io.delta.sql.DeltaSparkSessionExtension') | ||
|
||
# Now add delta's packages and actually build the session | ||
self.spark = delta.configure_spark_with_delta_pip(builder, extra_packages=[ | ||
'org.apache.hadoop:hadoop-aws:3.3.4', | ||
]).getOrCreate() | ||
|
||
self.spark.sparkContext.setLogLevel('ERROR') | ||
self._configure_fs() | ||
|
||
def write_records(self, job, df: pandas.DataFrame, dbname: str, batch: int) -> None: | ||
"""Writes the whole dataframe to a delta lake""" | ||
job.attempt += len(df) | ||
full_path = self.root.joinpath(dbname).replace('s3://', 's3a://') # hadoop uses the s3a: scheme instead of s3: | ||
|
||
try: | ||
updates = self.spark.createDataFrame(df) | ||
|
||
try: | ||
table = delta.DeltaTable.forPath(self.spark, full_path) | ||
if batch == 0: | ||
table.vacuum() # Clean up unused data files older than retention policy (default 7 days) | ||
table.alias('table') \ | ||
.merge(source=updates.alias('updates'), condition='table.id = updates.id') \ | ||
.whenMatchedUpdateAll() \ | ||
.whenNotMatchedInsertAll() \ | ||
.execute() | ||
except AnalysisException: | ||
# table does not exist yet, let's make an initial version | ||
updates.write.save(path=full_path, format='delta') | ||
|
||
job.success += len(df) | ||
job.success_rate(1) | ||
except Exception: # pylint: disable=broad-except | ||
logging.exception('Could not process data records') | ||
|
||
def _configure_fs(self): | ||
"""Tell spark/hadoop how to talk to S3 for us""" | ||
fsspec_options = self.root.fsspec_options() | ||
self.spark.conf.set('fs.s3a.sse.enabled', 'true') | ||
self.spark.conf.set('fs.s3a.server-side-encryption-algorithm', 'SSE-KMS') | ||
kms_key = fsspec_options.get('s3_additional_kwargs', {}).get('SSEKMSKeyId') | ||
if kms_key: | ||
self.spark.conf.set('fs.s3a.sse.kms.keyId', kms_key) | ||
region_name = fsspec_options.get('client_kwargs', {}).get('region_name') | ||
if region_name: | ||
self.spark.conf.set('fs.s3a.endpoint.region', region_name) |
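_configure_fs reads an fsspec-style options dictionary from store.Root. As an illustrative sketch only, here is a dictionary shaped like the keys the code above looks up; the key ARN and region values are placeholders, not anything from this repository:

# Hypothetical fsspec options; only the dictionary shape matters here.
fsspec_options = {
    's3_additional_kwargs': {
        'SSEKMSKeyId': 'arn:aws:kms:us-east-1:123456789012:key/EXAMPLE',  # placeholder
    },
    'client_kwargs': {
        'region_name': 'us-east-1',  # placeholder
    },
}
# Given options like these, _configure_fs() would set fs.s3a.sse.kms.keyId to the
# SSEKMSKeyId value and fs.s3a.endpoint.region to the region_name value.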
20 files renamed without changes.
@@ -0,0 +1,76 @@
"""Tests for Delta Lake support""" | ||
|
||
import os | ||
import shutil | ||
import tempfile | ||
import unittest | ||
|
||
import pandas | ||
from pyspark.sql.utils import AnalysisException | ||
from cumulus import config, formats, store | ||
|
||
|
||
class TestDeltaLake(unittest.TestCase): | ||
""" | ||
Test case for the Delta Lake format writer. | ||
i.e. tests for deltalake.py | ||
""" | ||
|
||
@classmethod | ||
def setUpClass(cls): | ||
super().setUpClass() | ||
output_tempdir = tempfile.TemporaryDirectory() # pylint: disable=consider-using-with | ||
cls.output_tempdir = output_tempdir | ||
cls.output_dir = output_tempdir.name | ||
|
||
# It is expensive to create a DeltaLakeFormat instance because of all the pyspark jar downloading etc. | ||
# So we only do it once per class suite. (And erase all folder contents per-test) | ||
cls.deltalake = formats.DeltaLakeFormat(store.Root(output_tempdir.name)) | ||
|
||
def setUp(self): | ||
super().setUp() | ||
shutil.rmtree(self.output_dir, ignore_errors=True) | ||
self.job = config.JobSummary() | ||
|
||
@staticmethod | ||
def df(**kwargs) -> pandas.DataFrame: | ||
""" | ||
Creates a dummy DataFrame with ids & values equal to each kwarg provided. | ||
""" | ||
rows = [{'id': k, 'value': v} for k, v in kwargs.items()] | ||
return pandas.DataFrame(rows) | ||
|
||
def store(self, df: pandas.DataFrame, batch: int = 10) -> None: | ||
""" | ||
Writes a single batch of data to the data lake. | ||
:param df: the data to insert | ||
:param batch: which batch number this is, defaulting to 10 to avoid triggering any first/last batch logic | ||
""" | ||
self.deltalake.store_patients(self.job, df, batch) | ||
|
||
def assert_lake_equal(self, df: pandas.DataFrame, when: int = None) -> None: | ||
table_path = os.path.join(self.output_dir, 'patient') | ||
|
||
reader = self.deltalake.spark.read | ||
if when is not None: | ||
reader = reader.option('versionAsOf', when) | ||
|
||
table_df = reader.format('delta').load(table_path).sort('id').toPandas() | ||
self.assertDictEqual(df.to_dict(), table_df.to_dict()) | ||
|
||
def test_creates_if_empty(self): | ||
"""Verify that the lake is created when empty""" | ||
# sanity check that it doesn't exist yet | ||
with self.assertRaises(AnalysisException): | ||
self.assert_lake_equal(self.df()) | ||
|
||
self.store(self.df(a=1)) | ||
self.assert_lake_equal(self.df(a=1)) | ||
|
||
def test_upsert(self): | ||
"""Verify that we can update and insert data""" | ||
self.store(self.df(a=1, b=2)) | ||
self.store(self.df(b=20, c=3)) | ||
self.assert_lake_equal(self.df(a=1, b=20, c=3)) |
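The when parameter of assert_lake_equal relies on Delta Lake's time travel: the versionAsOf read option loads an older snapshot of a table. A minimal sketch of that read outside the test harness, assuming an existing Delta-enabled SparkSession named spark and a placeholder table path:

# Illustrative only: 'spark' is an existing Delta-enabled SparkSession,
# and '/tmp/output/patient' is a placeholder table path.
latest = spark.read.format('delta').load('/tmp/output/patient')
original = spark.read.format('delta').option('versionAsOf', 0).load('/tmp/output/patient')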