databricks · jprakash-db · Nov 26, 2024 · Oct 29, 2024 · Nov 2, 2024 · Nov 2, 2024
@@ -46,7 +46,7 @@
 from databricks.sql.experimental.oauth_persistence import OAuthPersistence
 
 from databricks.sql.thrift_api.TCLIService.ttypes import (
-    TSparkParameter,
+    TSparkParameter, TOperationState,
 )
 
 
@@ -733,6 +733,7 @@ def execute(
         self,
         operation: str,
         parameters: Optional[TParameterCollection] = None,
+        async_op=False,
     ) -> "Cursor":
         """
         Execute a query and wait for execution to complete.
@@ -763,6 +764,11 @@ def execute(
         Both will result in the query equivalent to "SELECT * FROM table WHERE field = 'foo'
         being sent to the server
 
+        async_op:
+        Denotes whether the execute command will execute the request asynchronously or not
+        By default it is set to False, if set True the execution request will be submitted and the code
+        will be non-blocking. User can later poll and request the result when ready
+
         :returns self
         """
 
@@ -796,13 +802,15 @@ def execute(
             cursor=self,
             use_cloud_fetch=self.connection.use_cloud_fetch,
             parameters=prepared_params,
+            async_op=async_op,
         )
         self.active_result_set = ResultSet(
             self.connection,
             execute_response,
             self.thrift_backend,
             self.buffer_size_bytes,
             self.arraysize,
+            async_op,
         )
 
         if execute_response.is_staging_operation:
@@ -812,6 +820,65 @@ def execute(
 
         return self
 
+    def execute_async(
+        self,
+        operation: str,
+        parameters: Optional[TParameterCollection] = None,
+    ) -> "Cursor":
+        """
+        Submit a query for execution without waiting for it to complete.
+
+        Internally this calls execute() with async_op=True.
+
+        :param operation: forwarded unchanged to execute()
+        :param parameters: forwarded unchanged to execute()
+        :return: this cursor
+        """
+        self.execute(operation, parameters, async_op=True)
+        return self
+
+    def get_query_state(self) -> "TOperationState":
+        """
+        Poll the backend for the current state of this cursor's active operation.
+        :return: the operation state reported by the thrift backend
+        """
+        self._check_not_closed()
+        op_handle = self.active_op_handle
+        return self.thrift_backend.get_query_state(op_handle)
+
+    def get_execution_result(self):
+        """
+        Fetch the result of the active asynchronous query once it has finished.
+
+        On success the fetched result replaces active_result_set.
+        :return: this cursor
+        :raises Error: if the polled operation state is not FINISHED_STATE
+        """
+        self._check_not_closed()
+        operation_state = self.get_query_state()
+        # TOperationState is the name this module imports directly from ttypes.
+        if operation_state != TOperationState.FINISHED_STATE:
+            raise Error(
+                f"get_execution_result failed with Operation status {operation_state}"
+            )
+
+        execute_response = self.thrift_backend.get_execution_result(
+            self.active_op_handle, self
+        )
+        self.active_result_set = ResultSet(
+            self.connection,
+            execute_response,
+            self.thrift_backend,
+            self.buffer_size_bytes,
+            self.arraysize,
+        )
+
+        if execute_response.is_staging_operation:
+            self._handle_staging_operation(
+                staging_allowed_local_path=self.thrift_backend.staging_allowed_local_path
+            )
+        return self
+
     def executemany(self, operation, seq_of_parameters):
         """
         Execute the operation once for every set of passed in parameters.
@@ -1097,6 +1164,7 @@ def __init__(
         thrift_backend: ThriftBackend,
         result_buffer_size_bytes: int = DEFAULT_RESULT_BUFFER_SIZE_BYTES,
         arraysize: int = 10000,
+        async_op=False,
     ):
         """
         A ResultSet manages the results of a single command.
@@ -1119,7 +1187,7 @@ def __init__(
         self._arrow_schema_bytes = execute_response.arrow_schema_bytes
         self._next_row_index = 0
 
-        if execute_response.arrow_queue:
+        if execute_response.arrow_queue or async_op:
             # In this case the server has taken the fast path and returned an initial batch of
             # results
             self.results = execute_response.arrow_queue

@@ -7,6 +7,8 @@
 import threading
 from typing import List, Union
 
+from databricks.sql.thrift_api.TCLIService.ttypes import TOperationState
+
 try:
     import pyarrow
 except ImportError:
@@ -769,6 +771,63 @@ def _results_message_to_execute_response(self, resp, operation_state):
             arrow_schema_bytes=schema_bytes,
         )
 
+    def get_execution_result(self, op_handle, cursor):
+        """
+        Fetch the results of a submitted operation and wrap them in an ExecuteResponse.
+
+        :param op_handle: handle of the operation to fetch; asserted not None
+        :param cursor: supplies arraysize and buffer_size_bytes as the fetch limits
+        :return: ExecuteResponse built from the FetchResults response
+        """
+        assert op_handle is not None
+        # Rebuild the handle field by field; the third positional field is forced to False.
+        fetch_handle = ttypes.TOperationHandle(
+            op_handle.operationId,
+            op_handle.operationType,
+            False,
+            op_handle.modifiedRowCount,
+        )
+        req = ttypes.TFetchResultsReq(
+            operationHandle=fetch_handle,
+            maxRows=cursor.arraysize,
+            maxBytes=cursor.buffer_size_bytes,
+            orientation=ttypes.TFetchOrientation.FETCH_NEXT,
+            includeResultSetMetadata=True,
+        )
+        resp = self.make_request(self._client.FetchResults, req)
+        metadata = resp.resultSetMetadata
+        lz4_compressed = metadata.lz4Compressed
+        is_staging_operation = metadata.isStagingOperation
+        has_more_rows = resp.hasMoreRows
+        description = self._hive_schema_to_description(metadata.schema)
+        # Use the Arrow schema from the response if present, else derive it from the hive schema.
+        schema_bytes = metadata.arrowSchema
+        if not schema_bytes:
+            arrow_schema = self._hive_schema_to_arrow_schema(metadata.schema)
+            schema_bytes = arrow_schema.serialize().to_pybytes()
+
+        queue = ResultSetQueueFactory.build_queue(
+            row_set_type=metadata.resultFormat,
+            t_row_set=resp.results,
+            arrow_schema_bytes=schema_bytes,
+            max_download_threads=self.max_download_threads,
+            lz4_compressed=lz4_compressed,
+            description=description,
+            ssl_options=self._ssl_options,
+        )
+
+        return ExecuteResponse(
+            arrow_queue=queue,
+            status=resp.status,
+            has_been_closed_server_side=False,
+            has_more_rows=has_more_rows,
+            lz4_compressed=lz4_compressed,
+            is_staging_operation=is_staging_operation,
+            command_handle=op_handle,
+            description=description,
+            arrow_schema_bytes=schema_bytes,
+        )
+
     def _wait_until_command_done(self, op_handle, initial_operation_status_resp):
         if initial_operation_status_resp:
             self._check_command_not_in_error_or_closed_state(
@@ -787,6 +846,12 @@ def _wait_until_command_done(self, op_handle, initial_operation_status_resp):
             self._check_command_not_in_error_or_closed_state(op_handle, poll_resp)
         return operation_state
 
+    def get_query_state(self, op_handle) -> "TOperationState":
+        """Poll the operation status once and return the reported operation state."""
+        status_resp = self._poll_for_status(op_handle)
+        self._check_command_not_in_error_or_closed_state(op_handle, status_resp)
+        return status_resp.operationState
+
     @staticmethod
     def _check_direct_results_for_error(t_spark_direct_results):
         if t_spark_direct_results:
@@ -817,6 +882,7 @@ def execute_command(
         cursor,
         use_cloud_fetch=True,
         parameters=[],
+        async_op=False,
     ):
         assert session_handle is not None
 
@@ -846,7 +912,11 @@ def execute_command(
             parameters=parameters,
         )
         resp = self.make_request(self._client.ExecuteStatement, req)
-        return self._handle_execute_response(resp, cursor)
+
+        if async_op:
+            return self._handle_execute_response_async(resp, cursor)
+        else:
+            return self._handle_execute_response(resp, cursor)
 
     def get_catalogs(self, session_handle, max_rows, max_bytes, cursor):
         assert session_handle is not None
@@ -945,6 +1015,23 @@ def _handle_execute_response(self, resp, cursor):
 
         return self._results_message_to_execute_response(resp, final_operation_state)
 
+    def _handle_execute_response_async(self, resp, cursor):
+        """Record the operation handle on the cursor and return a result-less ExecuteResponse."""
+        op_handle = resp.operationHandle
+        cursor.active_op_handle = op_handle
+        self._check_direct_results_for_error(resp.directResults)
+        return ExecuteResponse(
+            arrow_queue=None,
+            status=resp.status.statusCode,
+            has_been_closed_server_side=None,
+            has_more_rows=None,
+            lz4_compressed=None,
+            is_staging_operation=None,
+            command_handle=op_handle,
+            description=None,
+            arrow_schema_bytes=None,
+        )
+
     def fetch_results(
         self,
         op_handle,

@@ -36,6 +36,7 @@
     compare_dbr_versions,
     is_thrift_v5_plus,
 )
+from databricks.sql.thrift_api.TCLIService import ttypes
 from tests.e2e.common.core_tests import CoreTestMixin, SmokeTestMixin
 from tests.e2e.common.large_queries_mixin import LargeQueriesMixin
 from tests.e2e.common.timestamp_tests import TimestampTestsMixin
@@ -175,6 +176,27 @@ def test_cloud_fetch(self):
                 for i in range(len(cf_result)):
                     assert cf_result[i] == noop_result[i]
 
+    def test_execute_async(self):
+        def is_executing(state):
+            return not state or state in (
+                ttypes.TOperationState.RUNNING_STATE,
+                ttypes.TOperationState.PENDING_STATE,
+            )
+
+        long_running_query = "SELECT COUNT(*) FROM RANGE(10000 * 16) x JOIN RANGE(10000) y ON FROM_UNIXTIME(x.id * y.id, 'yyyy-MM-dd') LIKE '%not%a%date%'"
+        with self.cursor() as cursor:
+            cursor.execute_async(long_running_query)
+
+            # Poll every 10 seconds while the state is falsy, RUNNING or PENDING.
+            while is_executing(cursor.get_query_state()):
+                time.sleep(10)
+                log.info("Polling the status in test_execute_async")
+
+            cursor.get_execution_result()
+            rows = cursor.fetchall()
+
+            assert rows[0].asDict() == {"count(1)": 0}
+
 
 # Exclude Retry tests because they require specific setups, and LargeQueries too slow for core
 # tests