From 6ede3271ff3e4c82a53a08e0dd18b35e01c4fa4d Mon Sep 17 00:00:00 2001 From: anamitraadhikari <97945303+anamitraadhikari@users.noreply.github.com> Date: Mon, 14 Oct 2024 18:03:28 -0700 Subject: [PATCH] fix(SQL Lab): hang when result set size is too big (#30522) Co-authored-by: aadhikari Co-authored-by: Ville Brofeldt <33317356+villebro@users.noreply.github.com> --- docs/static/resources/openapi.json | 3 +- .../superset-ui-core/src/query/types/Query.ts | 1 + .../src/setup/setupErrorMessages.ts | 4 + superset/config.py | 3 + superset/errors.py | 3 + superset/sql_lab.py | 36 ++++++ tests/unit_tests/sql_lab_test.py | 115 +++++++++++++++++- 7 files changed, 162 insertions(+), 3 deletions(-) diff --git a/docs/static/resources/openapi.json b/docs/static/resources/openapi.json index a039bd3a2a51f..43781ac9658b1 100644 --- a/docs/static/resources/openapi.json +++ b/docs/static/resources/openapi.json @@ -116,7 +116,8 @@ "GENERIC_BACKEND_ERROR", "INVALID_PAYLOAD_FORMAT_ERROR", "INVALID_PAYLOAD_SCHEMA_ERROR", - "REPORT_NOTIFICATION_ERROR" + "REPORT_NOTIFICATION_ERROR", + "RESULT_TOO_LARGE_ERROR" ], "type": "string" }, diff --git a/superset-frontend/packages/superset-ui-core/src/query/types/Query.ts b/superset-frontend/packages/superset-ui-core/src/query/types/Query.ts index 4feba17319e55..a83d17a937f6a 100644 --- a/superset-frontend/packages/superset-ui-core/src/query/types/Query.ts +++ b/superset-frontend/packages/superset-ui-core/src/query/types/Query.ts @@ -227,6 +227,7 @@ export const ErrorTypeEnum = { ASYNC_WORKERS_ERROR: 'ASYNC_WORKERS_ERROR', ADHOC_SUBQUERY_NOT_ALLOWED_ERROR: 'ADHOC_SUBQUERY_NOT_ALLOWED_ERROR', INVALID_SQL_ERROR: 'INVALID_SQL_ERROR', + RESULT_TOO_LARGE_ERROR: 'RESULT_TOO_LARGE_ERROR', // Generic errors GENERIC_COMMAND_ERROR: 'GENERIC_COMMAND_ERROR', diff --git a/superset-frontend/src/setup/setupErrorMessages.ts b/superset-frontend/src/setup/setupErrorMessages.ts index 442cd32152bc9..e4c2380150c9d 100644 --- a/superset-frontend/src/setup/setupErrorMessages.ts +++ b/superset-frontend/src/setup/setupErrorMessages.ts @@ -159,5 +159,9 @@ export default function setupErrorMessages() { ErrorTypeEnum.INVALID_SQL_ERROR, InvalidSQLErrorMessage, ); + errorMessageComponentRegistry.registerValue( + ErrorTypeEnum.RESULT_TOO_LARGE_ERROR, + DatabaseErrorMessage, + ); setupErrorMessagesExtra(); } diff --git a/superset/config.py b/superset/config.py index 7ae46bba76f36..d19e30a5a5282 100644 --- a/superset/config.py +++ b/superset/config.py @@ -960,6 +960,9 @@ class D3TimeFormat(TypedDict, total=False): SQLLAB_SAVE_WARNING_MESSAGE = None SQLLAB_SCHEDULE_WARNING_MESSAGE = None +# Max payload size (MB) for SQL Lab to prevent browser hangs with large results. +SQLLAB_PAYLOAD_MAX_MB = None + # Force refresh while auto-refresh in dashboard DASHBOARD_AUTO_REFRESH_MODE: Literal["fetch", "force"] = "force" # Dashboard auto refresh intervals diff --git a/superset/errors.py b/superset/errors.py index 9f0f5bed05354..4d9c08d14214d 100644 --- a/superset/errors.py +++ b/superset/errors.py @@ -87,6 +87,7 @@ class SupersetErrorType(StrEnum): ASYNC_WORKERS_ERROR = "ASYNC_WORKERS_ERROR" ADHOC_SUBQUERY_NOT_ALLOWED_ERROR = "ADHOC_SUBQUERY_NOT_ALLOWED_ERROR" INVALID_SQL_ERROR = "INVALID_SQL_ERROR" + RESULT_TOO_LARGE_ERROR = "RESULT_TOO_LARGE_ERROR" # Generic errors GENERIC_COMMAND_ERROR = "GENERIC_COMMAND_ERROR" @@ -151,6 +152,7 @@ class SupersetErrorType(StrEnum): 1036: _("The database was deleted."), 1037: _("Custom SQL fields cannot contain sub-queries."), 1040: _("The submitted payload failed validation."), + 1041: _("The result size exceeds the allowed limit."), } @@ -190,6 +192,7 @@ class SupersetErrorType(StrEnum): SupersetErrorType.DATABASE_NOT_FOUND_ERROR: [1011, 1036], SupersetErrorType.CONNECTION_DATABASE_TIMEOUT: [1001, 1009], SupersetErrorType.MARSHMALLOW_ERROR: [1040], + SupersetErrorType.RESULT_TOO_LARGE_ERROR: [1041], } diff --git a/superset/sql_lab.py b/superset/sql_lab.py index 7685b4419ce37..88e1bc1aad858 100644 --- a/superset/sql_lab.py +++ b/superset/sql_lab.py @@ -17,6 +17,7 @@ # pylint: disable=consider-using-transaction import dataclasses import logging +import sys import uuid from contextlib import closing from datetime import datetime @@ -78,6 +79,7 @@ SQLLAB_CTAS_NO_LIMIT = config["SQLLAB_CTAS_NO_LIMIT"] log_query = config["QUERY_LOGGER"] logger = logging.getLogger(__name__) +BYTES_IN_MB = 1024 * 1024 class SqlLabException(Exception): @@ -531,6 +533,7 @@ def execute_sql_statements( log_params, apply_ctas, ) + except SqlLabQueryStoppedException: payload.update({"status": QueryStatus.STOPPED}) return payload @@ -601,6 +604,22 @@ def execute_sql_statements( serialized_payload = _serialize_payload( payload, cast(bool, results_backend_use_msgpack) ) + + # Check the size of the serialized payload + if sql_lab_payload_max_mb := config.get("SQLLAB_PAYLOAD_MAX_MB"): + serialized_payload_size = sys.getsizeof(serialized_payload) + max_bytes = sql_lab_payload_max_mb * BYTES_IN_MB + + if serialized_payload_size > max_bytes: + logger.info("Result size exceeds the allowed limit.") + raise SupersetErrorException( + SupersetError( + message=f"Result size ({serialized_payload_size / BYTES_IN_MB:.2f} MB) exceeds the allowed limit of {sql_lab_payload_max_mb} MB.", + error_type=SupersetErrorType.RESULT_TOO_LARGE_ERROR, + level=ErrorLevel.ERROR, + ) + ) + cache_timeout = database.cache_timeout if cache_timeout is None: cache_timeout = config["CACHE_DEFAULT_TIMEOUT"] @@ -635,6 +654,23 @@ def execute_sql_statements( "expanded_columns": expanded_columns, } ) + # Check the size of the serialized payload (opt-in logic for return_results) + if sql_lab_payload_max_mb := config.get("SQLLAB_PAYLOAD_MAX_MB"): + serialized_payload = _serialize_payload( + payload, cast(bool, results_backend_use_msgpack) + ) + serialized_payload_size = sys.getsizeof(serialized_payload) + max_bytes = sql_lab_payload_max_mb * BYTES_IN_MB + + if serialized_payload_size > max_bytes: + logger.info("Result size exceeds the allowed limit.") + raise SupersetErrorException( + SupersetError( + message=f"Result size ({serialized_payload_size / BYTES_IN_MB:.2f} MB) exceeds the allowed limit of {sql_lab_payload_max_mb} MB.", + error_type=SupersetErrorType.RESULT_TOO_LARGE_ERROR, + level=ErrorLevel.ERROR, + ) + ) return payload return None diff --git a/tests/unit_tests/sql_lab_test.py b/tests/unit_tests/sql_lab_test.py index 83093352bca8f..cc9b146f39be5 100644 --- a/tests/unit_tests/sql_lab_test.py +++ b/tests/unit_tests/sql_lab_test.py @@ -17,8 +17,10 @@ # pylint: disable=import-outside-toplevel, invalid-name, unused-argument, too-many-locals import json +from unittest import mock from uuid import UUID +import pytest import sqlparse from freezegun import freeze_time from pytest_mock import MockerFixture @@ -27,9 +29,9 @@ from superset import db from superset.common.db_query_status import QueryStatus from superset.errors import ErrorLevel, SupersetErrorType -from superset.exceptions import OAuth2Error +from superset.exceptions import OAuth2Error, SupersetErrorException from superset.models.core import Database -from superset.sql_lab import get_sql_results +from superset.sql_lab import execute_sql_statements, get_sql_results from superset.utils.core import override_user from tests.unit_tests.models.core_test import oauth2_client_info @@ -125,6 +127,115 @@ def test_execute_sql_statement_with_rls( SupersetResultSet.assert_called_with([(42,)], cursor.description, db_engine_spec) +@mock.patch.dict( + "superset.sql_lab.config", + {"SQLLAB_PAYLOAD_MAX_MB": 50}, # Set the desired config value for testing +) +def test_execute_sql_statement_exceeds_payload_limit(mocker: MockerFixture) -> None: + """ + Test for `execute_sql_statements` when the result payload size exceeds the limit. + """ + + # Mock the query object and database + query = mocker.MagicMock() + query.limit = 1 + query.database = mocker.MagicMock() + query.database.db_engine_spec.is_select_query.return_value = True + query.database.cache_timeout = 100 + query.status = "RUNNING" + query.select_as_cta = False + query.database.allow_run_async = True + + # Mock get_query to return our mocked query object + mocker.patch("superset.sql_lab.get_query", return_value=query) + + # Mock sys.getsizeof to simulate a large payload size + mocker.patch("sys.getsizeof", return_value=100000000) # 100 MB + + # Mock _serialize_payload + def mock_serialize_payload(payload, use_msgpack): + return "serialized_payload" + + mocker.patch( + "superset.sql_lab._serialize_payload", side_effect=mock_serialize_payload + ) + + # Mock db.session.refresh to avoid AttributeError during session refresh + mocker.patch("superset.sql_lab.db.session.refresh", return_value=None) + + # Mock the results backend to avoid "Results backend is not configured" error + mocker.patch("superset.sql_lab.results_backend", return_value=True) + + # Test that the exception is raised when the payload exceeds the limit + with pytest.raises(SupersetErrorException): + execute_sql_statements( + query_id=1, + rendered_query="SELECT 42 AS answer", + return_results=True, # Simulate that results are being returned + store_results=True, # Not storing results but returning them + start_time=None, + expand_data=False, + log_params={}, + ) + + +@mock.patch.dict( + "superset.sql_lab.config", + {"SQLLAB_PAYLOAD_MAX_MB": 50}, # Set the desired config value for testing +) +def test_execute_sql_statement_within_payload_limit(mocker: MockerFixture) -> None: + """ + Test for `execute_sql_statements` when the result payload size is within the limit, + and check if the flow executes smoothly without raising any exceptions. + """ + + # Mock the query object and database + query = mocker.MagicMock() + query.limit = 1 + query.database = mocker.MagicMock() + query.database.db_engine_spec.is_select_query.return_value = True + query.database.cache_timeout = 100 + query.status = "RUNNING" + query.select_as_cta = False + query.database.allow_run_async = True + + # Mock get_query to return our mocked query object + mocker.patch("superset.sql_lab.get_query", return_value=query) + + # Mock sys.getsizeof to simulate a payload size that is within the limit + mocker.patch("sys.getsizeof", return_value=10000000) # 10 MB (within limit) + + # Mock _serialize_payload + def mock_serialize_payload(payload, use_msgpack): + return "serialized_payload" + + mocker.patch( + "superset.sql_lab._serialize_payload", side_effect=mock_serialize_payload + ) + + # Mock db.session.refresh to avoid AttributeError during session refresh + mocker.patch("superset.sql_lab.db.session.refresh", return_value=None) + + # Mock the results backend to avoid "Results backend is not configured" error + mocker.patch("superset.sql_lab.results_backend", return_value=True) + + # Test that no exception is raised and the function executes smoothly + try: + execute_sql_statements( + query_id=1, + rendered_query="SELECT 42 AS answer", + return_results=True, # Simulate that results are being returned + store_results=True, # Not storing results but returning them + start_time=None, + expand_data=False, + log_params={}, + ) + except SupersetErrorException: + pytest.fail( + "SupersetErrorException should not have been raised for payload within the limit" + ) + + def test_sql_lab_insert_rls_as_subquery( mocker: MockerFixture, session: Session,