Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

Commit

Permalink
databricks-test: Fix for multiple runs (#6)
Browse files Browse the repository at this point in the history
  • Loading branch information
algattik committed Mar 13, 2020
1 parent 7bf4ff3 commit 4311922
Show file tree
Hide file tree
Showing 6 changed files with 117 additions and 11 deletions.
21 changes: 13 additions & 8 deletions Python/packages/databricks-test/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
FROM conda/miniconda3

RUN apt-get update
RUN apt-get install -y gcc wget openjdk-8-jdk
ARG SPARK_VERSION=2.4.4
ARG HADOOP_VERSION=2.7

# Install Spark runtime
WORKDIR /
RUN wget -q https://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
RUN tar -zxf spark-2.4.4-bin-hadoop2.7.tgz
RUN rm spark-2.4.4-bin-hadoop2.7.tgz
ENV SPARK_HOME /spark-2.4.4-bin-hadoop2.7
# Install OS packages and the Spark runtime in a single layer so the apt
# metadata can be removed in the same layer that created it (smaller image).
# ca-certificates is pulled in explicitly because --no-install-recommends
# suppresses it, and wget needs it for the HTTPS download.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        apt-transport-https \
        ca-certificates \
        openjdk-8-jdk \
        wget \
    && wget -q https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && tar -zxf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
    && rm -rf /var/lib/apt/lists/*

RUN pip install pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==2.4.4 pyarrow==0.13.0
# Spark installation root; read by pyspark at runtime.
# key=value form — the space-separated ENV syntax is deprecated.
ENV SPARK_HOME=/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}

RUN pip install pandas==0.24.2
# Test/lint toolchain plus pyspark pinned to the installed Spark runtime.
# --no-cache-dir keeps pip's wheel cache out of the image layer (DL3042).
RUN pip install --no-cache-dir pytest==5.3.1 pytest-mock==1.13.0 flake8==3.7.9 pyspark==${SPARK_VERSION} pyarrow==0.13.0 pandas==0.24.2

CMD ["python"]
18 changes: 16 additions & 2 deletions Python/packages/databricks-test/databricks_test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
import inspect
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from tempfile import TemporaryDirectory
import importlib
import sys
import os
import pathlib


globalSession = None
Expand Down Expand Up @@ -80,13 +82,16 @@ def __init__(self):


class Session():
def __init__(self):
def __init__(self, hivedir):
self.display = MagicMock()
self.displayHTML = MagicMock()
self.dbutils = DbUtils()

hivedirUrl = pathlib.Path(hivedir).as_uri()
self.spark = (SparkSession.builder
.master("local")
.appName("test-pyspark")
.config("spark.sql.warehouse.dir", hivedirUrl)
.enableHiveSupport()
.getOrCreate())

Expand All @@ -101,6 +106,13 @@ def run_notebook(self, dir, script):
importlib.import_module(script)
else:
# If script was already imported, reload it to rerun it

# Per importlib docs: When a module is reloaded, its
# dictionary (global variables) is retained.
# Delete dbutils to ensure inject_variables gets called.
del sys.modules[script].dbutils

# Reload the notebook module
importlib.reload(sys.modules[script])
except WorkflowInterrupted:
pass
Expand Down Expand Up @@ -136,12 +148,14 @@ def __enter__(self):
global globalSession
if globalSession:
raise SessionAlreadyExistsException("A session already exists")
globalSession = Session()
self.tmpdir = TemporaryDirectory()
globalSession = Session(self.tmpdir.name)
return globalSession

def __exit__(self, exc_type, exc_value, traceback):
global globalSession
globalSession = None
del self.tmpdir


class add_path():
Expand Down
2 changes: 1 addition & 1 deletion Python/packages/databricks-test/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name='databricks_test',
version='0.0.1',
version='0.0.4',
author="Alexandre Gattiker",
author_email="[email protected]",
description="Unit testing and mocking for Databricks",
Expand Down
19 changes: 19 additions & 0 deletions Python/packages/databricks-test/tests/multiple_runs_notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Databricks notebook source

# COMMAND ----------

# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
# Under databricks_test, inject_variables() provides the `dbutils` mock into
# this module's globals; on Databricks, `dbutils` already exists.
if 'dbutils' not in locals():
    import databricks_test
    databricks_test.inject_variables()

# COMMAND ----------

# Widgets for interactive development.
dbutils.widgets.text("input", "")
dbutils.widgets.text("output", "")

# COMMAND ----------

# Copy the "input" widget value to the file path named by the "output" widget,
# so tests can assert the value round-trips through a notebook run.
with open(dbutils.widgets.get('output'), "w") as output_file:
    output_file.write(dbutils.widgets.get('input'))
19 changes: 19 additions & 0 deletions Python/packages/databricks-test/tests/multiple_runs_notebook2.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Databricks notebook source

# COMMAND ----------

# Instrument for unit tests. This is only executed in local unit tests, not in Databricks.
# Under databricks_test, inject_variables() provides the `dbutils` mock into
# this module's globals; on Databricks, `dbutils` already exists.
if 'dbutils' not in locals():
    import databricks_test
    databricks_test.inject_variables()

# COMMAND ----------

# Widgets for interactive development.
dbutils.widgets.text("input", "")
dbutils.widgets.text("output", "")

# COMMAND ----------

# Copy the "input" widget value to the file path named by the "output" widget,
# so tests can assert the value round-trips through a notebook run.
with open(dbutils.widgets.get('output'), "w") as output_file:
    output_file.write(dbutils.widgets.get('input'))
49 changes: 49 additions & 0 deletions Python/packages/databricks-test/tests/multiple_runs_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
import os
import uuid
from tempfile import NamedTemporaryFile, TemporaryDirectory

import databricks_test


def run_notebook(notebook, run_num, dbrickstest):
    """Run `notebook` through the databricks_test session and verify output.

    The notebook reads an "input" widget and writes its value to the file
    named by the "output" widget; a random payload is passed in and asserted
    to round-trip.

    :param notebook: module name of the notebook to run (relative to ".")
    :param run_num: ordinal of this run, used only in the failure message
    :param dbrickstest: databricks_test session exposing the mocked dbutils
    """
    payload = str(uuid.uuid4())  # unique marker so runs cannot be confused

    # Use a TemporaryDirectory plus a path inside it rather than an open
    # NamedTemporaryFile: the notebook reopens the output path by name,
    # which fails on Windows while a NamedTemporaryFile is still open.
    with TemporaryDirectory() as tmp_dir:
        out_path = os.path.join(tmp_dir, "output.txt")

        # Provide input and output location as widgets to notebook
        widget_values = {
            "input": payload,
            "output": out_path,
        }
        dbrickstest.dbutils.widgets.get.side_effect = \
            lambda name: widget_values.get(name, "")

        # Run notebook
        dbrickstest.run_notebook(".", notebook)

        # Notebook writes the input parameter as output file
        with open(out_path) as output_file:
            assert payload == output_file.read(), f"Run #{run_num} output"


def test_multiple_runs_in_same_session_1():
    """The same notebook runs twice within one session, then again in a new one."""
    with databricks_test.session() as dbrickstest:
        for run_num in (1, 2):
            run_notebook("multiple_runs_notebook", run_num, dbrickstest)

    with databricks_test.session() as dbrickstest:
        run_notebook("multiple_runs_notebook", 3, dbrickstest)


def test_multiple_runs_in_same_session_and_run_other_session():
    """A notebook already exercised by earlier tests reruns fine in a fresh session."""
    fresh_session = databricks_test.session()
    with fresh_session as dbrickstest:
        run_notebook("multiple_runs_notebook", 4, dbrickstest)


def test_multiple_runs_in_multiple_test_cases():
    """First of two test cases sharing a notebook module across sessions."""
    notebook_session = databricks_test.session()
    with notebook_session as dbrickstest:
        run_notebook("multiple_runs_notebook2", 5, dbrickstest)


def test_multiple_runs_in_multiple_test_cases2():
    """Second test case rerunning the notebook module imported by the previous one."""
    notebook_session = databricks_test.session()
    with notebook_session as dbrickstest:
        run_notebook("multiple_runs_notebook2", 6, dbrickstest)

0 comments on commit 4311922

Please sign in to comment.