Skip to content

perf: Replace expensive len() call with PandasBatches.total_rows in anywidget TableWidget #1937

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions bigframes/display/anywidget.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,10 @@


class TableWidget(WIDGET_BASE):
"""
An interactive, paginated table widget for BigFrames DataFrames.
"""An interactive, paginated table widget for BigFrames DataFrames.

This widget provides a user-friendly way to display and navigate through
large BigQuery DataFrames within a Jupyter environment.
"""

def __init__(self, dataframe: bigframes.dataframe.DataFrame):
Expand Down Expand Up @@ -74,15 +76,18 @@ def __init__(self, dataframe: bigframes.dataframe.DataFrame):
# Initialize data fetching attributes.
self._batches = dataframe.to_pandas_batches(page_size=initial_page_size)

# Access total_rows through type casting (internal use only)
from bigframes.core.blocks import PandasBatches

if isinstance(self._batches, PandasBatches):
self.row_count = self._batches.total_rows or 0
else:
# Fallback for compatibility
self.row_count = 0
Comment on lines +79 to +86
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you need this? to_pandas_batches is implemented in this package. We shouldn't have to protect against that.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I recommend using the cast method, if this is just for the type checker.


# set traitlets properties that trigger observers
self.page_size = initial_page_size

# len(dataframe) is expensive, since it will trigger a
# SELECT COUNT(*) query. It is a must have however.
# TODO(b/428238610): Start iterating over the result of `to_pandas_batches()`
# before we get here so that the count might already be cached.
self.row_count = len(dataframe)

# get the initial page
self._set_table_html()

Expand Down
10 changes: 5 additions & 5 deletions notebooks/dataframes/anywidget_mode.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
{
"data": {
"text/html": [
"Query job c5fcfd5e-1617-49c8-afa3-86ca21019de4 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:c5fcfd5e-1617-49c8-afa3-86ca21019de4&page=queryresults\">Open Job</a>"
"Query job 1ea2b594-2bd7-46de-a3c8-6aeee5884ba2 is DONE. 0 Bytes processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:1ea2b594-2bd7-46de-a3c8-6aeee5884ba2&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand Down Expand Up @@ -141,7 +141,7 @@
{
"data": {
"text/html": [
"Query job ab900a53-5011-4e80-85d5-0ef2958598db is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:ab900a53-5011-4e80-85d5-0ef2958598db&page=queryresults\">Open Job</a>"
"Query job 67e679e9-94da-47f7-8be1-8b4a496fbfbd is DONE. 171.4 MB processed. <a target=\"_blank\" href=\"https://console.cloud.google.com/bigquery?project=bigframes-dev&j=bq:US:67e679e9-94da-47f7-8be1-8b4a496fbfbd&page=queryresults\">Open Job</a>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
Expand All @@ -153,7 +153,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "bda63ba739dc4d5f83a5e18eb27b2686",
"model_id": "e74c3920b93644a0b2afdaa3841cad31",
"version_major": 2,
"version_minor": 1
},
Expand Down Expand Up @@ -204,7 +204,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "9bffeb73549e48419c3dd895edfe30e8",
"model_id": "b4f7a3f86ef54e07b24ef10061088391",
"version_major": 2,
"version_minor": 1
},
Expand Down Expand Up @@ -290,7 +290,7 @@
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "dfd4fa3a1c6f4b3eb1877cb0e9ba7e94",
"model_id": "44a829aca2f24cfdba4b61afd1a259fe",
"version_major": 2,
"version_minor": 1
},
Expand Down
116 changes: 110 additions & 6 deletions tests/system/small/test_anywidget.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import typing

import pandas as pd
import pytest

Expand Down Expand Up @@ -61,11 +63,12 @@ def table_widget(paginated_bf_df: bf.dataframe.DataFrame):
Helper fixture to create a TableWidget instance with a fixed page size.
This reduces duplication across tests that use the same widget configuration.
"""
from bigframes import display

from bigframes.display import TableWidget

with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
# Delay context manager cleanup of `max_rows` until after tests finish.
yield display.TableWidget(paginated_bf_df)
yield TableWidget(paginated_bf_df)


@pytest.fixture(scope="module")
Expand All @@ -90,10 +93,10 @@ def small_bf_df(
@pytest.fixture
def small_widget(small_bf_df):
"""Helper fixture for tests using a DataFrame smaller than the page size."""
from bigframes import display
from bigframes.display import TableWidget

with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 5):
yield display.TableWidget(small_bf_df)
yield TableWidget(small_bf_df)


@pytest.fixture(scope="module")
Expand Down Expand Up @@ -135,10 +138,10 @@ def test_widget_initialization_should_calculate_total_row_count(
paginated_bf_df: bf.dataframe.DataFrame,
):
"""A TableWidget should correctly calculate the total row count on creation."""
from bigframes import display
from bigframes.display import TableWidget

with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
widget = display.TableWidget(paginated_bf_df)
widget = TableWidget(paginated_bf_df)

assert widget.row_count == EXPECTED_ROW_COUNT

Expand Down Expand Up @@ -436,6 +439,107 @@ def test_widget_creation_should_load_css_for_rendering(table_widget):
assert ".bigframes-widget .footer" in css_content


def test_widget_row_count_should_be_immutable_after_creation(
paginated_bf_df: bf.dataframe.DataFrame,
):
"""
Given a widget created with a specific configuration when global display
options are changed later, the widget's original row_count should remain
unchanged.
"""
from bigframes.display import TableWidget

with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
widget = TableWidget(paginated_bf_df)
initial_row_count = widget.row_count
assert initial_row_count == EXPECTED_ROW_COUNT

# Change a global option that could influence row count
bf.options.display.max_rows = 10

# The widget's row count was fixed at creation and should not change.
assert widget.row_count == initial_row_count


def test_widget_should_fallback_to_zero_rows_when_total_rows_is_none(
paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
):
"""
Given an internal component that fails to provide a total row count,
when the widget is created, the row_count should safely fall back to 0.
"""
from bigframes.core.blocks import PandasBatches

# Simulate an internal failure where total_rows returns None
monkeypatch.setattr(PandasBatches, "total_rows", property(lambda self: None))

with bf.option_context("display.repr_mode", "anywidget"):
from bigframes.display import TableWidget

widget = TableWidget(paginated_bf_df)

assert widget.row_count == 0


def test_widget_should_fallback_to_zero_rows_when_batches_are_invalid_type(
paginated_bf_df: bf.dataframe.DataFrame, monkeypatch: pytest.MonkeyPatch
):
"""
Given an internal component that returns an unexpected data type,
when the widget is created, the row_count should safely fall back to 0.
"""
# Simulate internal method returning an unexpected type (a simple iterator)
def mock_to_pandas_batches(self, **kwargs):
return iter([paginated_bf_df.to_pandas().iloc[:2]])

monkeypatch.setattr(
"bigframes.dataframe.DataFrame.to_pandas_batches", mock_to_pandas_batches
)

with bf.option_context("display.repr_mode", "anywidget"):
from bigframes.display import TableWidget

widget = TableWidget(paginated_bf_df)

assert widget.row_count == 0


@pytest.mark.parametrize(
"max_results, expected_rows",
[
(None, EXPECTED_ROW_COUNT),
(3, 3),
(10, EXPECTED_ROW_COUNT),
],
ids=["no_limit", "limit_is_less_than_total", "limit_is_greater_than_total"],
)
def test_widget_row_count_should_respect_max_results_on_creation(
paginated_bf_df: bf.dataframe.DataFrame,
max_results: typing.Optional[int],
expected_rows: int,
):
"""
Given a max_results value, when a TableWidget is created with custom batches,
its row_count should be correctly capped by that value.
"""
with bf.option_context("display.repr_mode", "anywidget", "display.max_rows", 2):
from bigframes.core.blocks import PandasBatches
from bigframes.display import TableWidget

widget = TableWidget(paginated_bf_df)

# Override batches with max_results to test the behavior
widget._batches = paginated_bf_df.to_pandas_batches(
page_size=widget.page_size, max_results=max_results
)

# Re-apply the logic to update row_count
if isinstance(widget._batches, PandasBatches):
widget.row_count = widget._batches.total_rows or 0

assert widget.row_count == expected_rows


# TODO(shuowei): Add tests for custom index and multiindex
# This may not be necessary for the SQL Cell use case but should be
# considered for completeness.