diff --git a/benchmarking/conftest.py b/benchmarking/conftest.py new file mode 100644 index 000000000..67504328f --- /dev/null +++ b/benchmarking/conftest.py @@ -0,0 +1,44 @@ +import os +import pytest + + +@pytest.fixture(scope="session") +def benchmarking_host(): + return os.getenv("BENCHMARKING_SERVER_HOSTNAME") + + +@pytest.fixture(scope="session") +def benchmarking_http_path(): + return os.getenv("BENCHMARKING_HTTP_PATH") + + +@pytest.fixture(scope="session") +def benchmarking_access_token(): + return os.getenv("BENCHMARKING_TOKEN") + + +@pytest.fixture(scope="session") +def benchfood_host(): + return os.getenv("BENCHFOOD_SERVER_HOSTNAME") + + +@pytest.fixture(scope="session") +def benchfood_http_path(): + return os.getenv("BENCHFOOD_HTTP_PATH") + + +@pytest.fixture(scope="session") +def benchfood_access_token(): + return os.getenv("BENCHFOOD_TOKEN") + + +@pytest.fixture(scope="session", autouse=True) +def connection_details(benchmarking_host, benchmarking_http_path, benchmarking_access_token, benchfood_host, benchfood_http_path, benchfood_access_token): + return { + "benchmarking_host": benchmarking_host, + "benchmarking_http_path": benchmarking_http_path, + "benchmarking_access_token": benchmarking_access_token, + "benchfood_host": benchfood_host, + "benchfood_http_path": benchfood_http_path, + "benchfood_access_token": benchfood_access_token, + } diff --git a/benchmarking/test_benchmark.py b/benchmarking/test_benchmark.py new file mode 100644 index 000000000..93550b40c --- /dev/null +++ b/benchmarking/test_benchmark.py @@ -0,0 +1,116 @@ +import random +import time +from databricks import sql +import logging +import pytest +from contextlib import contextmanager +from datetime import datetime +log = logging.getLogger(__name__) + + +class TestBenchmarkingSuite: + + # TAG = "PRE-SPLIT" + TAG = "POST-SPLIT" + CATALOG_NAME = "main" + SCHEMA_NAME = "tpcds_sf100_delta" + TABLE_NAME = "catalog_sales" + RESULTS_TABLE = 
"main.pysql_benchmarking_schema.benchmarking_results" + ATTEMPTS = 10 + ROWS = 1000000 + LARGE_QUERY_LIMIT = 1000000 + SMALL_QUERY_LIMIT = 10000 + + @pytest.fixture(autouse=True) + def get_details(self, connection_details): + self.arguments = connection_details.copy() + + self.benchmarking_connection_params = { + "server_hostname": self.arguments["benchmarking_host"], + "http_path": self.arguments["benchmarking_http_path"], + "access_token": self.arguments["benchmarking_access_token"] + } + + self.benchfood_connection_params = { + "server_hostname": self.arguments["benchfood_host"], + "http_path": self.arguments["benchfood_http_path"], + "access_token": self.arguments["benchfood_access_token"] + } + + @contextmanager + def connection(self, connection_params): + log.info("Connecting with args: {}".format(connection_params)) + conn = sql.connect(**connection_params) + + try: + yield conn + finally: + conn.close() + + @contextmanager + def cursor(self, connection_params): + with self.connection(connection_params) as conn: + cursor = conn.cursor() + try: + yield cursor + finally: + cursor.close() + + def removed_outlier_mean(self, data): + total = 0 + for i in range(1, len(data)-1): + total += data[i] + + return total/(len(data)-2) + + def insert_benchmarking_results_data(self, function_name, query_time): + + log.info(f"Inserting results {self.TAG} - {function_name}") + with self.cursor(self.benchfood_connection_params) as cursor: + cursor.execute( + f"INSERT INTO {self.RESULTS_TABLE} (tag, function_name, compute_duration, date_time) VALUES ('{self.TAG}', '{function_name}', {query_time}, '{datetime.now()}')" + ) + + def get_query_time(self, query, expected_num_rows): + start_time = time.time() + with self.cursor(self.benchmarking_connection_params) as cursor: + cursor.execute(query) + result = cursor.fetchall() + log.info("Fetched {} rows".format(len(result))) + + assert len(result) == expected_num_rows + + end_time = time.time() + elapsed_time = end_time - start_time 
+ + return elapsed_time + + def test_large_queries_performance(self): + compute_duration = [] + function_name = "large_query" + + for i in range(0, self.ATTEMPTS): + log.info("Attempt: {}".format(i)) + offset = i * self.LARGE_QUERY_LIMIT + random.randint(1, self.LARGE_QUERY_LIMIT) + + query = "select * from {}.{}.{} LIMIT {} OFFSET {}".format(self.CATALOG_NAME, self.SCHEMA_NAME, self.TABLE_NAME, self.LARGE_QUERY_LIMIT, offset) + compute_duration.append(self.get_query_time(query, self.LARGE_QUERY_LIMIT)) + + compute_duration.sort() + self.insert_benchmarking_results_data(function_name, self.removed_outlier_mean(compute_duration)) + + def test_small_queries_performance(self): + compute_duration = [] + function_name = "small_query" + + for i in range(0, self.ATTEMPTS): + log.info("Attempt: {}".format(i)) + offset = i * self.SMALL_QUERY_LIMIT + random.randint(1, self.SMALL_QUERY_LIMIT) + + query = "select * from {}.{}.{} LIMIT {} OFFSET {}".format(self.CATALOG_NAME, self.SCHEMA_NAME, self.TABLE_NAME, self.SMALL_QUERY_LIMIT, offset) + compute_duration.append(self.get_query_time(query, self.SMALL_QUERY_LIMIT)) + + compute_duration.sort() + self.insert_benchmarking_results_data(function_name, self.removed_outlier_mean(compute_duration)) + + diff --git a/check.py b/check.py index a9aa6c43f..bfbbcf45e 100644 --- a/check.py +++ b/check.py @@ -21,10 +21,10 @@ # Load environment variables from .env file # load_dotenv() -host = "e2-dogfood.staging.cloud.databricks.com" -http_path = "/sql/1.0/warehouses/58aa1b363649e722" +host = os.getenv("MY_SERVER_HOSTNAME") +http_path = os.getenv("MY_HTTP_PATH") +access_token = os.getenv("MY_TOKEN") -access_token = "" connection = sql.connect( server_hostname=host, http_path=http_path, @@ -32,9 +32,9 @@ cursor = connection.cursor() -cursor.execute('SELECT :param `p`, * FROM RANGE(10)', {"param": "foo"}) +cursor.execute("select * from `auto_maintenance_bugbash`.`tpcds_sf1000_naga_testv32`.`store_sales` LIMIT 1000") # cursor.execute('SELECT 1') 
-result = cursor.fetchall() +result = cursor.fetchmany(10) for row in result: print(row) diff --git a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl index 4cd32c830..dee9fe011 100644 Binary files a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl and b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0-py3-none-any.whl differ diff --git a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz index 05718f920..f6855cee8 100644 Binary files a/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz and b/databricks_sql_connector/dist/databricks_sql_connector-1.0.0.tar.gz differ diff --git a/databricks_sql_connector/pyproject.toml b/databricks_sql_connector/pyproject.toml index d0c4aafbf..32b72b54e 100644 --- a/databricks_sql_connector/pyproject.toml +++ b/databricks_sql_connector/pyproject.toml @@ -10,38 +10,22 @@ include = ["CHANGELOG.md"] [tool.poetry.dependencies] python = "^3.8.0" -#thrift = ">=0.16.0,<0.21.0" -#pandas = [ -# { version = ">=1.2.5,<2.2.0", python = ">=3.8" } -#] -#pyarrow = ">=14.0.1,<17" - -#lz4 = "^4.0.2" -#requests = "^2.18.1" -#oauthlib = "^3.1.0" -#numpy = [ -# { version = "^1.16.6", python = ">=3.8,<3.11" }, -# { version = "^1.23.4", python = ">=3.11" }, -#] +# Remaining to add databricks_sql_connector_core databricks_sqlalchemy = { version = ">=1.0.0", optional = true } -#openpyxl = "^3.0.10" -#alembic = { version = "^1.0.11", optional = true } -#urllib3 = ">=1.26" -# + [tool.poetry.extras] databricks_sqlalchemy = ["databricks_sqlalchemy"] -#alembic = ["sqlalchemy", "alembic"] -# -#[tool.poetry.dev-dependencies] -#pytest = "^7.1.2" -#mypy = "^1.10.1" -#pylint = ">=2.12.0" -#black = "^22.3.0" -#pytest-dotenv = "^0.5.2" -#[tool.poetry.urls] -#"Homepage" = 
"https://github.com/databricks/databricks-sql-python" -#"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues" +[tool.poetry.dev-dependencies] +pytest = "^7.1.2" +mypy = "^1.10.1" +pylint = ">=2.12.0" +black = "^22.3.0" +pytest-dotenv = "^0.5.2" + +[tool.poetry.urls] +"Homepage" = "https://github.com/databricks/databricks-sql-python" +"Bug Tracker" = "https://github.com/databricks/databricks-sql-python/issues" [tool.poetry.plugins."sqlalchemy.dialects"] "databricks" = "databricks_sqlalchemy:DatabricksDialect" @@ -49,14 +33,14 @@ databricks_sqlalchemy = ["databricks_sqlalchemy"] [build-system] requires = ["poetry-core>=1.0.0"] build-backend = "poetry.core.masonry.api" -# -#[tool.mypy] -#ignore_missing_imports = "true" -#exclude = ['ttypes\.py$', 'TCLIService\.py$'] -# -#[tool.black] -#exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/' -# + +[tool.mypy] +ignore_missing_imports = "true" +exclude = ['ttypes\.py$', 'TCLIService\.py$'] + +[tool.black] +exclude = '/(\.eggs|\.git|\.hg|\.mypy_cache|\.nox|\.tox|\.venv|\.svn|_build|buck-out|build|dist|thrift_api)/' + [tool.pytest.ini_options] markers = {"reviewed" = "Test case has been reviewed by Databricks"} minversion = "6.0" diff --git a/setup_script.py b/setup_script.py index 27b021cb1..6b12fe9db 100644 --- a/setup_script.py +++ b/setup_script.py @@ -28,4 +28,4 @@ def build_and_install_library(directory_name): if __name__ == "__main__": build_and_install_library("databricks_sql_connector_core") build_and_install_library("databricks_sql_connector") - build_and_install_library("databricks_sqlalchemy") \ No newline at end of file + # build_and_install_library("databricks_sqlalchemy") \ No newline at end of file