From 4311922d2e23a52b1bf0805f6f29b9cf74f42e42 Mon Sep 17 00:00:00 2001
From: Alexandre Gattiker <algattik@microsoft.com>
Date: Sat, 14 Mar 2020 00:18:19 +0100
Subject: [PATCH] databricks-test: Fix for multiple runs (#6)

---
 Python/packages/databricks-test/Dockerfile         | 21 +++++---
 .../databricks_test/__init__.py                    | 18 ++++++-
 Python/packages/databricks-test/setup.py           |  2 +-
 .../tests/multiple_runs_notebook.py                | 19 +++++++
 .../tests/multiple_runs_notebook2.py               | 19 +++++++
 .../tests/multiple_runs_test.py                    | 49 +++++++++++++++++++
 6 files changed, 117 insertions(+), 11 deletions(-)
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_notebook.py
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_notebook2.py
 create mode 100644 Python/packages/databricks-test/tests/multiple_runs_test.py

diff --git a/Python/packages/databricks-test/Dockerfile b/Python/packages/databricks-test/Dockerfile
index 9bc9c7e..44eda04 100644
--- a/Python/packages/databricks-test/Dockerfile
+++ b/Python/packages/databricks-test/Dockerfile
@@ -1,16 +1,21 @@
 FROM conda/miniconda3
 
-RUN apt-get update
-RUN apt-get install -y gcc wget openjdk-8-jdk
+ARG SPARK_VERSION=2.4.4
+ARG HADOOP_VERSION=2.7
 
+# Install Spark runtime
 WORKDIR /
-RUN wget -q https://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
-RUN tar -zxf spark-2.4.4-bin-hadoop2.7.tgz
-RUN rm spark-2.4.4-bin-hadoop2.7.tgz
-ENV SPARK_HOME /spark-2.4.4-bin-hadoop2.7
+RUN apt-get update \
+    && apt-get install -y \
+    wget \
+    apt-transport-https \
+    openjdk-8-jdk \
+    && wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+    && tar -zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
+    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
 
-RUN pip install pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==2.4.4 pyarrow==0.13.0
+ENV SPARK_HOME /spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}
 
-RUN pip install pandas==0.24.2
+RUN pip install pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==${SPARK_VERSION} pyarrow==0.13.0 pandas==0.24.2
 
 CMD ["python"]
diff --git a/Python/packages/databricks-test/databricks_test/__init__.py b/Python/packages/databricks-test/databricks_test/__init__.py
index 66deabd..74c58f1 100644
--- a/Python/packages/databricks-test/databricks_test/__init__.py
+++ b/Python/packages/databricks-test/databricks_test/__init__.py
@@ -2,9 +2,11 @@
 import inspect
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import udf
+from tempfile import TemporaryDirectory
 import importlib
 import sys
 import os
+import pathlib
 
 globalSession = None
 
@@ -80,13 +82,16 @@ def __init__(self):
 
 
 class Session():
 
-    def __init__(self):
+    def __init__(self, hivedir):
         self.display = MagicMock()
         self.displayHTML = MagicMock()
         self.dbutils = DbUtils()
+
+        hivedirUrl = pathlib.Path(hivedir).as_uri()
         self.spark = (SparkSession.builder
                       .master("local")
                       .appName("test-pyspark")
+                      .config("spark.sql.warehouse.dir", hivedirUrl)
                       .enableHiveSupport()
                       .getOrCreate())
@@ -101,6 +106,13 @@ def run_notebook(self, dir, script):
                 importlib.import_module(script)
             else:
                 # If script was already imported, reload it to rerun it
+
+                # Per importlib docs: When a module is reloaded, its
+                # dictionary (global variables) is retained.
+                # Delete dbutils to ensure inject_variables gets called.
+                del sys.modules[script].dbutils
+
+                # Reload the notebook module
                 importlib.reload(sys.modules[script])
         except WorkflowInterrupted:
             pass
@@ -136,12 +148,14 @@ def __enter__(self):
         global globalSession
         if globalSession:
            raise SessionAlreadyExistsException("A session already exists")
-        globalSession = Session()
+        self.tmpdir = TemporaryDirectory()
+        globalSession = Session(self.tmpdir.name)
         return globalSession
 
     def __exit__(self, exc_type, exc_value, traceback):
         global globalSession
         globalSession = None
+        del self.tmpdir
 
 
 class add_path():
diff --git a/Python/packages/databricks-test/setup.py b/Python/packages/databricks-test/setup.py
index 0b2cdf4..40b7c58 100644
--- a/Python/packages/databricks-test/setup.py
+++ b/Python/packages/databricks-test/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name='databricks_test',
-    version='0.0.1',
+    version='0.0.4',
     author="Alexandre Gattiker",
     author_email="algattik@microsoft.com",
     description="Unit testing and mocking for Databricks",
diff --git a/Python/packages/databricks-test/tests/multiple_runs_notebook.py b/Python/packages/databricks-test/tests/multiple_runs_notebook.py
new file mode 100644
index 0000000..28eba5f
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_notebook.py
@@ -0,0 +1,19 @@
+# Databricks notebook source
+
+# COMMAND ----------
+
+# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
+if 'dbutils' not in locals():
+    import databricks_test
+    databricks_test.inject_variables()
+
+# COMMAND ----------
+
+# Widgets for interactive development.
+dbutils.widgets.text("input", "")
+dbutils.widgets.text("output", "")
+
+# COMMAND ----------
+
+with open(dbutils.widgets.get('output'), "w") as output_file:
+    output_file.write(dbutils.widgets.get('input'))
diff --git a/Python/packages/databricks-test/tests/multiple_runs_notebook2.py b/Python/packages/databricks-test/tests/multiple_runs_notebook2.py
new file mode 100644
index 0000000..28eba5f
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_notebook2.py
@@ -0,0 +1,19 @@
+# Databricks notebook source
+
+# COMMAND ----------
+
+# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
+if 'dbutils' not in locals():
+    import databricks_test
+    databricks_test.inject_variables()
+
+# COMMAND ----------
+
+# Widgets for interactive development.
+dbutils.widgets.text("input", "")
+dbutils.widgets.text("output", "")
+
+# COMMAND ----------
+
+with open(dbutils.widgets.get('output'), "w") as output_file:
+    output_file.write(dbutils.widgets.get('input'))
diff --git a/Python/packages/databricks-test/tests/multiple_runs_test.py b/Python/packages/databricks-test/tests/multiple_runs_test.py
new file mode 100644
index 0000000..711ff2d
--- /dev/null
+++ b/Python/packages/databricks-test/tests/multiple_runs_test.py
@@ -0,0 +1,49 @@
+import databricks_test
+from tempfile import NamedTemporaryFile
+import uuid
+
+
+def run_notebook(notebook, run_num, dbrickstest):
+
+    input = str(uuid.uuid4())
+
+    with NamedTemporaryFile() as tmp_dir:
+
+        # Provide input and output location as widgets to notebook
+        switch = {
+            "input": input,
+            "output": tmp_dir.name,
+        }
+        dbrickstest.dbutils.widgets.get.side_effect = lambda x: switch.get(
+            x, "")
+
+        # Run notebook
+        dbrickstest.run_notebook(".", notebook)
+
+        # Notebook writes the input parameter as output file
+        with open(tmp_dir.name) as output_file:
+            assert input == output_file.read(), f"Run #{run_num} output"
+
+
+def test_multiple_runs_in_same_session_1():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 1, dbrickstest)
+        run_notebook("multiple_runs_notebook", 2, dbrickstest)
+
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 3, dbrickstest)
+
+
+def test_multiple_runs_in_same_session_and_run_other_session():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook", 4, dbrickstest)
+
+
+def test_multiple_runs_in_multiple_test_cases():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook2", 5, dbrickstest)
+
+
+def test_multiple_runs_in_multiple_test_cases2():
+    with databricks_test.session() as dbrickstest:
+        run_notebook("multiple_runs_notebook2", 6, dbrickstest)
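
Note on the reload fix: per the importlib docs, reload() re-executes a
module's body but retains the module's existing dictionary, so a dbutils
attribute injected on the first run survives into the second run, and the
notebook's "if 'dbutils' not in locals()" guard then skips
inject_variables(). The sketch below illustrates that pitfall in
isolation; it assumes a hypothetical module named "notebook" on sys.path
and is not part of the package.

    import importlib
    import sys

    import notebook                # first run: executes the module body
    notebook.dbutils = object()    # stand-in for inject_variables()

    # reload() keeps the old module dict, so without the del below the
    # guard "if 'dbutils' not in locals()" would still find dbutils and
    # skip re-injection on the second run.
    del sys.modules["notebook"].dbutils
    importlib.reload(sys.modules["notebook"])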