From 4311922d2e23a52b1bf0805f6f29b9cf74f42e42 Mon Sep 17 00:00:00 2001
From: Alexandre Gattiker <algattik@microsoft.com>
Date: Sat, 14 Mar 2020 00:18:19 +0100
Subject: [PATCH] databricks-test: Fix for multiple runs (#6)

---
 Python/packages/databricks-test/Dockerfile         | 21 +++++---
 .../databricks_test/__init__.py                    | 18 ++++++-
 Python/packages/databricks-test/setup.py           |  2 +-
 .../tests/multiple_runs_notebook.py                | 19 +++++++
 .../tests/multiple_runs_notebook2.py               | 19 +++++++
 .../tests/multiple_runs_test.py                    | 49 +++++++++++++++++++
 6 files changed, 117 insertions(+), 11 deletions(-)
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_notebook.py
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_notebook2.py
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_test.py

diff --git a/Python/packages/databricks-test/Dockerfile b/Python/packages/databricks-test/Dockerfile
index 9bc9c7e..44eda04 100644
--- a/Python/packages/databricks-test/Dockerfile
+++ b/Python/packages/databricks-test/Dockerfile
@@ -1,16 +1,21 @@
 FROM conda/miniconda3
 
-RUN apt-get update
-RUN apt-get install -y gcc wget openjdk-8-jdk
+ARG SPARK_VERSION=2.4.4
+ARG HADOOP_VERSION=2.7
 
+# Install Spark runtime
 WORKDIR /
-RUN wget -q https://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
-RUN tar -zxf spark-2.4.4-bin-hadoop2.7.tgz
-RUN rm spark-2.4.4-bin-hadoop2.7.tgz
-ENV SPARK_HOME /spark-2.4.4-bin-hadoop2.7
+RUN apt-get update \
+    && apt-get install -y \
+    wget \
+    apt-transport-https \
+    openjdk-8-jdk \
+    && wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+    && tar -zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
 
-RUN pip install pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==2.4.4 pyarrow==0.13.0
+ENV SPARK_HOME /spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
 
-RUN pip install pandas==0.24.2
+RUN pip install pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==${SPARK_VERSION} pyarrow==0.13.0 pandas==0.24.2
 
 CMD ["python"]
diff --git a/Python/packages/databricks-test/databricks_test/__init__.py b/Python/packages/databricks-test/databricks_test/__init__.py
index 66deabd..74c58f1 100644
--- a/Python/packages/databricks-test/databricks_test/__init__.py
+++ b/Python/packages/databricks-test/databricks_test/__init__.py
@@ -2,9 +2,11 @@
 import inspect
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import udf
+from tempfile import TemporaryDirectory
 import importlib
 import sys
 import os
+import pathlib
 
 globalSession = None
 
@@ -80,13 +82,16 @@ def __init__(self):
 
 
 class Session():
 
-    def __init__(self):
+    def __init__(self, hivedir):
         self.display = MagicMock()
         self.displayHTML = MagicMock()
         self.dbutils = DbUtils()
+
+        hivedirUrl = pathlib.Path(hivedir).as_uri()
         self.spark = (SparkSession.builder
                       .master("local")
                       .appName("test-pyspark")
+                      .config("spark.sql.warehouse.dir", hivedirUrl)
                       .enableHiveSupport()
                       .getOrCreate())
@@ -101,6 +106,13 @@ def run_notebook(self, dir, script):
                 importlib.import_module(script)
             else:
                 # If script was already imported, reload it to rerun it
+
+                # Per importlib docs: When a module is reloaded, its
+                # dictionary (global variables) is retained.
+                # Delete dbutils to ensure inject_variables gets called.
+                del sys.modules[script].dbutils
+
+                # Reload the notebook module
                 importlib.reload(sys.modules[script])
         except WorkflowInterrupted:
             pass
@@ -136,12 +148,14 @@ def __enter__(self):
         global globalSession
         if globalSession:
            raise SessionAlreadyExistsException("A session already exists")
-        globalSession = Session()
+        self.tmpdir = TemporaryDirectory()
+        globalSession = Session(self.tmpdir.name)
         return globalSession
 
     def __exit__(self, exc_type, exc_value, traceback):
         global globalSession
         globalSession = None
+        del self.tmpdir
 
 
 class add_path():
diff --git a/Python/packages/databricks-test/setup.py b/Python/packages/databricks-test/setup.py
index 0b2cdf4..40b7c58 100644
--- a/Python/packages/databricks-test/setup.py
+++ b/Python/packages/databricks-test/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='databricks_test',
-    version='0.0.1',
+    version='0.0.4',
     author="Alexandre Gattiker",
     author_email="algattik@microsoft.com",
     description="Unit testing and mocking for Databricks",
diff --git a/Python/packages/databricks-test/tests/multiple_runs_notebook.py b/Python/packages/databricks-test/tests/multiple_runs_notebook.py
new file mode 100644
index 0000000..28eba5f
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_notebook.py
@@ -0,0 +1,19 @@
+# Databricks notebook source
+
+# COMMAND ----------
+
+# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
+if 'dbutils' not in locals():
+    import databricks_test
+    databricks_test.inject_variables()
+
+# COMMAND ----------
+
+# Widgets for interactive development.
+dbutils.widgets.text("input", "")
+dbutils.widgets.text("output", "")
+
+# COMMAND ----------
+
+with open(dbutils.widgets.get('output'), "w") as output_file:
+    output_file.write(dbutils.widgets.get('input'))
diff --git a/Python/packages/databricks-test/tests/multiple_runs_notebook2.py b/Python/packages/databricks-test/tests/multiple_runs_notebook2.py
new file mode 100644
index 0000000..28eba5f
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_notebook2.py
@@ -0,0 +1,19 @@
+# Databricks notebook source
+
+# COMMAND ----------
+
+# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
+if 'dbutils' not in locals():
+    import databricks_test
+    databricks_test.inject_variables()
+
+# COMMAND ----------
+
+# Widgets for interactive development.
+dbutils.widgets.text("input", "")
+dbutils.widgets.text("output", "")
+
+# COMMAND ----------
+
+with open(dbutils.widgets.get('output'), "w") as output_file:
+    output_file.write(dbutils.widgets.get('input'))
diff --git a/Python/packages/databricks-test/tests/multiple_runs_test.py b/Python/packages/databricks-test/tests/multiple_runs_test.py
new file mode 100644
index 0000000..711ff2d
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_test.py
@@ -0,0 +1,49 @@
+import databricks_test
+from tempfile import NamedTemporaryFile
+import uuid
+
+
+def run_notebook(notebook, run_num, dbrickstest):
+
+    input = str(uuid.uuid4())
+
+    with NamedTemporaryFile() as tmp_dir:
+
+        # Provide input and output location as widgets to notebook
+        switch = {
+            "input": input,
+            "output": tmp_dir.name,
+        }
+        dbrickstest.dbutils.widgets.get.side_effect = lambda x: switch.get(
+            x, "")
+
+        # Run notebook
+        dbrickstest.run_notebook(".", notebook)
+
+        # Notebook writes the input parameter as output file
+        with open(tmp_dir.name) as output_file:
+            assert input == output_file.read(), f"Run #{run_num} output"
+
+
+def test_multiple_runs_in_same_session_1():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 1, dbrickstest)
+        run_notebook("multiple_runs_notebook", 2, dbrickstest)
+
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 3, dbrickstest)
+
+
+def test_multiple_runs_in_same_session_and_run_other_session():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 4, dbrickstest)
+
+
+def test_multiple_runs_in_multiple_test_cases():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook2", 5, dbrickstest)
+
+
+def test_multiple_runs_in_multiple_test_cases2():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook2", 6, dbrickstest)
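
Note on the reload fix: per the importlib docs, reload() re-executes a
module's body but retains the module's existing dictionary, so a dbutils
attribute injected on the first run survives into the second run, and the
notebook's "if 'dbutils' not in locals()" guard then skips
inject_variables(). The sketch below illustrates that pitfall in
isolation; it assumes a hypothetical module named "notebook" on sys.path
and is not part of the package.

    import importlib
    import sys

    import notebook                # first run: executes the module body
    notebook.dbutils = object()    # stand-in for inject_variables()

    # reload() keeps the old module dict, so without the del below the
    # guard "if 'dbutils' not in locals()" would still find dbutils and
    # skip re-injection on the second run.
    del sys.modules["notebook"].dbutils
    importlib.reload(sys.modules["notebook"])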