⚡️ Speed up function _format_schema_info by 13%
#611
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 13% (0.13x) speedup for `_format_schema_info` in `marimo/_server/ai/prompts.py`
⏱️ Runtime: 1.02 milliseconds → 904 microseconds (best of 250 runs)
📝 Explanation and details
The optimization replaces inefficient repeated string concatenation with a list-based approach, delivering a roughly 13% speedup (1.02 ms → 904 µs) by avoiding the overhead of rebuilding immutable strings on every concatenation.
Key Changes:
String Building Pattern: Changed from `schema_info += f"..."` to `parts.append(f"...")` followed by `"".join(parts)`. This eliminates the quadratic time complexity of repeated string concatenation, which creates new string objects on each `+=` operation.
Sample Values Processing: Optimized `", ".join(f"{v}" for v in col.sample_values)` to `", ".join(map(str, col.sample_values))`. Using `map(str, ...)` is more efficient than generator expressions with f-strings for simple string conversion.
Performance Analysis:
Why This Works:
String concatenation in Python creates new string objects each time due to immutability. With many schema tables and columns, this becomes O(n²) behavior. The list approach collects all parts first, then performs a single join operation, maintaining O(n) complexity.
The optimization is most effective for schemas with many tables/columns and sample values, which aligns with typical database schema documentation use cases where this function would process substantial metadata.
✅ Correctness verification report:
🌀 Generated Regression Tests and Runtime
from typing import List, Optional
imports
import pytest
from marimo._server.ai.prompts import _format_schema_info
Define minimal SchemaTable and Column classes for testing
class Column:
    """Minimal stand-in for a schema column, used only by these tests.

    Attributes are plain values read by the formatter under test:
    ``name``, ``type`` and ``sample_values``.
    """

    # The constructor must be spelled ``__init__``; a method named ``init`` is
    # never invoked by ``Column(...)`` and every keyword call would raise
    # ``TypeError``.
    def __init__(self, name: str, type: str, sample_values: Optional[List[str]] = None):
        """Store the column name, its type label and optional sample values.

        Args:
            name: Column name.
            type: Type label of the column (parameter name kept because the
                tests pass it by keyword).
            sample_values: Example values; ``None`` is normalised to ``[]``.
        """
        self.name = name
        self.type = type
        self.sample_values = sample_values if sample_values is not None else []
class SchemaTable:
    """Minimal stand-in for a schema table, used only by these tests."""

    # ``__init__`` (not ``init``) so that ``SchemaTable(name=..., columns=...)``
    # actually initialises the instance. The annotation is quoted so it is not
    # evaluated when the class body runs.
    def __init__(self, name: str, columns: "List[Column]"):
        """Store the table name and its list of columns.

        Args:
            name: Table name.
            columns: Column objects belonging to the table (may be empty).
        """
        self.name = name
        self.columns = columns
from marimo._server.ai.prompts import _format_schema_info
------------------- UNIT TESTS -------------------
1. Basic Test Cases
def test_empty_tables_none():
    """Call ``_format_schema_info`` with ``None`` in place of a table list."""
    no_tables = None
    codeflash_output = _format_schema_info(no_tables)  # 328ns -> 345ns (4.93% slower)
def test_empty_tables_empty_list():
    """Call ``_format_schema_info`` with an empty list of tables."""
    no_tables = []
    codeflash_output = _format_schema_info(no_tables)  # 348ns -> 354ns (1.69% slower)
def test_single_table_single_column_no_samples():
    """Format one table holding a single column without sample values."""
    id_column = Column(name="id", type="integer")
    users = SchemaTable(name="users", columns=[id_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: integer\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 1.42μs -> 1.67μs (14.7% slower)
def test_single_table_single_column_with_samples():
    """Format one table holding a single column that has two sample values."""
    name_column = Column(name="name", type="text", sample_values=["Alice", "Bob"])
    users = SchemaTable(name="users", columns=[name_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: name\n",
            " - Type: text\n",
            " - Sample values: Alice, Bob\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 2.50μs -> 2.84μs (11.8% slower)
def test_single_table_multiple_columns_mixed_samples():
    """Format one table whose columns are a mix of sampled and unsampled."""
    id_column = Column(name="id", type="integer")
    email_column = Column(
        name="email",
        type="text",
        sample_values=["[email protected]", "[email protected]"],
    )
    users = SchemaTable(name="users", columns=[id_column, email_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: integer\n",
            " - Column: email\n",
            " - Type: text\n",
            " - Sample values: [email protected], [email protected]\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 2.65μs -> 2.85μs (6.98% slower)
def test_multiple_tables():
    """Format two tables, each with one column; only the second has samples."""
    users = SchemaTable(
        name="users",
        columns=[Column(name="id", type="integer")],
    )
    orders = SchemaTable(
        name="orders",
        columns=[Column(name="order_id", type="integer", sample_values=["1", "2"])],
    )
    tables = [users, orders]
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: integer\n",
            "- Table: orders\n",
            " - Column: order_id\n",
            " - Type: integer\n",
            " - Sample values: 1, 2\n",
        ]
    )
    codeflash_output = _format_schema_info(tables)  # 2.84μs -> 2.89μs (1.66% slower)
2. Edge Test Cases
def test_column_with_empty_sample_values():
    """Format a column whose ``sample_values`` is an explicit empty list."""
    sku_column = Column(name="sku", type="text", sample_values=[])
    products = SchemaTable(name="products", columns=[sku_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: products\n",
            " - Column: sku\n",
            " - Type: text\n",
        ]
    )
    codeflash_output = _format_schema_info([products])  # 1.34μs -> 1.53μs (12.4% slower)
def test_column_with_none_sample_values():
    """Format a column constructed with ``sample_values=None``."""
    sku_column = Column(name="sku", type="text", sample_values=None)
    products = SchemaTable(name="products", columns=[sku_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: products\n",
            " - Column: sku\n",
            " - Type: text\n",
        ]
    )
    codeflash_output = _format_schema_info([products])  # 1.30μs -> 1.49μs (12.7% slower)
def test_table_with_no_columns():
    """Format a single table that has an empty column list."""
    empty = SchemaTable(name="empty_table", columns=[])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: empty_table\n",
        ]
    )
    codeflash_output = _format_schema_info([empty])  # 864ns -> 1.05μs (17.7% slower)
def test_column_with_special_characters():
    """Format table, column and type names that contain hyphens."""
    mail_column = Column(
        name="e-mail",
        type="strange-type",
        sample_values=["[email protected]"],
    )
    user_data = SchemaTable(name="user-data", columns=[mail_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: user-data\n",
            " - Column: e-mail\n",
            " - Type: strange-type\n",
            " - Sample values: [email protected]\n",
        ]
    )
    codeflash_output = _format_schema_info([user_data])  # 2.33μs -> 2.65μs (12.4% slower)
def test_column_with_numeric_sample_values():
    """Format a column whose sample values are integers rather than strings."""
    value_column = Column(name="value", type="integer", sample_values=[1, 2, 3])
    numbers = SchemaTable(name="numbers", columns=[value_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: numbers\n",
            " - Column: value\n",
            " - Type: integer\n",
            " - Sample values: 1, 2, 3\n",
        ]
    )
    codeflash_output = _format_schema_info([numbers])  # 2.46μs -> 2.87μs (14.3% slower)
def test_column_with_empty_string_sample_value():
    """Format a column whose sample values include an empty string."""
    field_column = Column(name="field", type="text", sample_values=["", "abc"])
    test_table = SchemaTable(name="test", columns=[field_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: test\n",
            " - Column: field\n",
            " - Type: text\n",
            " - Sample values: , abc\n",
        ]
    )
    codeflash_output = _format_schema_info([test_table])  # 2.26μs -> 2.60μs (13.1% slower)
def test_column_with_single_sample_value():
    """Format a column that carries exactly one sample value."""
    only_column = Column(name="col", type="text", sample_values=["only"])
    single = SchemaTable(name="single", columns=[only_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: single\n",
            " - Column: col\n",
            " - Type: text\n",
            " - Sample values: only\n",
        ]
    )
    codeflash_output = _format_schema_info([single])  # 2.19μs -> 2.35μs (6.73% slower)
def test_column_with_non_string_sample_values():
    """Format a column whose samples mix an int, a float and a string."""
    data_column = Column(name="data", type="mixed", sample_values=[1, 2.5, "three"])
    mixed = SchemaTable(name="mixed", columns=[data_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: mixed\n",
            " - Column: data\n",
            " - Type: mixed\n",
            " - Sample values: 1, 2.5, three\n",
        ]
    )
    codeflash_output = _format_schema_info([mixed])  # 4.38μs -> 4.14μs (5.80% faster)
3. Large Scale Test Cases
def test_large_number_of_tables_and_columns():
    """Format 10 tables of 10 integer columns, each with two sample values."""
    num_tables = 10
    num_columns = 10
    tables = [
        SchemaTable(
            name=f"table{t}",
            columns=[
                Column(
                    name=f"col{c}",
                    type="integer",
                    sample_values=[str(c), str(c + 1)],
                )
                for c in range(num_columns)
            ],
        )
        for t in range(num_tables)
    ]
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 44.5μs -> 37.2μs (19.5% faster)
    # The per-table / per-column loop bodies are empty in the visible source.
    for _table_idx in range(num_tables):
        for _column_idx in range(num_columns):
            pass
def test_large_table_with_many_columns_no_samples():
    """Format one table with 100 text columns, none carrying sample values."""
    num_columns = 100
    columns = []
    for i in range(num_columns):
        columns.append(Column(name=f"c{i}", type="text"))
    big_table = SchemaTable(name="bigtable", columns=columns)
    codeflash_output = _format_schema_info([big_table])
    result = codeflash_output  # 15.9μs -> 14.7μs (8.53% faster)
    # The per-column loop body is empty in the visible source.
    for _column_idx in range(num_columns):
        pass
def test_large_table_with_many_columns_and_samples():
    """Format one table with 100 text columns, each with one sample value."""
    num_columns = 100
    columns = []
    for i in range(num_columns):
        columns.append(Column(name=f"c{i}", type="text", sample_values=[str(i)]))
    big_table = SchemaTable(name="bigtable", columns=columns)
    codeflash_output = _format_schema_info([big_table])
    result = codeflash_output  # 41.2μs -> 31.8μs (29.6% faster)
    # The per-column loop body is empty in the visible source.
    for _column_idx in range(num_columns):
        pass
def test_large_number_of_tables_no_columns():
    """Format 100 tables that each have an empty column list."""
    num_tables = 100
    tables = []
    for i in range(num_tables):
        tables.append(SchemaTable(name=f"t{i}", columns=[]))
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 11.1μs -> 10.7μs (3.87% faster)
    # The per-table loop body is empty in the visible source.
    for _table_idx in range(num_tables):
        pass
def test_scalability_with_maximum_elements():
    """Stress the formatter with roughly 1000 columns (10 tables x 100 columns)."""
    num_tables = 10
    num_columns = 100
    tables = [
        SchemaTable(
            name=f"t{t}",
            columns=[
                Column(name=f"c{c}", type="text", sample_values=[str(c)])
                for c in range(num_columns)
            ],
        )
        for t in range(num_tables)
    ]
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 368μs -> 300μs (22.6% faster)
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from typing import List, Optional
imports
import pytest
from marimo._server.ai.prompts import _format_schema_info
Minimal stub for SchemaTable and Column to allow testing
class Column:
    """Minimal stub of a schema column so the formatter can be exercised.

    Exposes the ``name``, ``type`` and ``sample_values`` attributes.
    """

    # Spelled ``__init__`` so that ``Column(name=..., type_=...)`` runs this
    # initialiser; a method called ``init`` is never used as a constructor.
    def __init__(self, name: str, type_: str, sample_values: Optional[List[str]] = None):
        """Store the column name, type label and optional sample values.

        Args:
            name: Column name.
            type_: Type label; stored on the instance as ``type``.
            sample_values: Example values; ``None`` is normalised to ``[]``.
        """
        self.name = name
        self.type = type_
        self.sample_values = sample_values if sample_values is not None else []
class SchemaTable:
    """Minimal stub of a schema table so the formatter can be exercised."""

    # ``__init__`` (not ``init``) so keyword construction works. The annotation
    # is quoted so it is not evaluated when the class body runs.
    def __init__(self, name: str, columns: "List[Column]"):
        """Store the table name and its list of columns.

        Args:
            name: Table name.
            columns: Column objects belonging to the table (may be empty).
        """
        self.name = name
        self.columns = columns
from marimo._server.ai.prompts import _format_schema_info
unit tests
------------------------
Basic Test Cases
------------------------
def test_empty_tables_none():
    """Invoke the formatter with ``None`` instead of a list of tables."""
    missing_tables = None
    codeflash_output = _format_schema_info(missing_tables)  # 322ns -> 346ns (6.94% slower)
def test_empty_tables_list():
    """Invoke the formatter with an empty list of tables."""
    empty_tables = []
    codeflash_output = _format_schema_info(empty_tables)  # 348ns -> 347ns (0.288% faster)
def test_single_table_single_column_no_samples():
    """Format one table with one ``INTEGER`` column and no sample values."""
    id_column = Column(name="id", type_="INTEGER")
    users = SchemaTable(name="users", columns=[id_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: INTEGER\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 1.44μs -> 1.65μs (12.7% slower)
def test_single_table_single_column_with_samples():
    """Format one table with one ``TEXT`` column carrying two sample values."""
    name_column = Column(name="name", type_="TEXT", sample_values=["Alice", "Bob"])
    users = SchemaTable(name="users", columns=[name_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: name\n",
            " - Type: TEXT\n",
            " - Sample values: Alice, Bob\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 2.47μs -> 2.94μs (15.8% slower)
def test_single_table_multiple_columns_mixed_samples():
    """Format one table with three columns: unsampled, sampled, empty samples."""
    id_column = Column(name="id", type_="INTEGER")
    name_column = Column(name="name", type_="TEXT", sample_values=["Alice", "Bob"])
    email_column = Column(name="email", type_="TEXT", sample_values=[])
    users = SchemaTable(
        name="users",
        columns=[id_column, name_column, email_column],
    )
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: INTEGER\n",
            " - Column: name\n",
            " - Type: TEXT\n",
            " - Sample values: Alice, Bob\n",
            " - Column: email\n",
            " - Type: TEXT\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 3.00μs -> 3.29μs (8.64% slower)
def test_multiple_tables_single_column_each():
    """Format two tables with one column each; only the second has samples."""
    users = SchemaTable(
        name="users",
        columns=[Column(name="id", type_="INTEGER")],
    )
    products = SchemaTable(
        name="products",
        columns=[Column(name="sku", type_="TEXT", sample_values=["A1", "B2"])],
    )
    tables = [users, products]
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: INTEGER\n",
            "- Table: products\n",
            " - Column: sku\n",
            " - Type: TEXT\n",
            " - Sample values: A1, B2\n",
        ]
    )
    codeflash_output = _format_schema_info(tables)  # 2.88μs -> 3.05μs (5.54% slower)
------------------------
Edge Test Cases
------------------------
def test_table_with_no_columns():
    """Format a table with no columns; the table name line is still anticipated."""
    empty = SchemaTable(name="empty_table", columns=[])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: empty_table\n",
        ]
    )
    codeflash_output = _format_schema_info([empty])  # 856ns -> 1.11μs (23.1% slower)
def test_column_with_empty_sample_values():
    """Format a column built with ``sample_values=[]``; no samples line anticipated."""
    id_column = Column(name="id", type_="INTEGER", sample_values=[])
    users = SchemaTable(name="users", columns=[id_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: INTEGER\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 1.32μs -> 1.54μs (14.3% slower)
def test_column_with_none_sample_values():
    """Format a column built with ``sample_values=None``; no samples line anticipated."""
    id_column = Column(name="id", type_="INTEGER", sample_values=None)
    users = SchemaTable(name="users", columns=[id_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: users\n",
            " - Column: id\n",
            " - Type: INTEGER\n",
        ]
    )
    codeflash_output = _format_schema_info([users])  # 1.26μs -> 1.46μs (13.5% slower)
def test_table_and_column_names_with_special_characters():
    """Format table and column names containing ``-``, ``_``, digits and ``@``."""
    address_column = Column(
        name="email@address",
        type_="TEXT",
        sample_values=["[email protected]", "[email protected]"],
    )
    user_data = SchemaTable(name="user-data_2024", columns=[address_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: user-data_2024\n",
            " - Column: email@address\n",
            " - Type: TEXT\n",
            " - Sample values: [email protected], [email protected]\n",
        ]
    )
    codeflash_output = _format_schema_info([user_data])  # 2.41μs -> 2.80μs (13.9% slower)
def test_column_type_with_numbers_and_symbols():
    """Format a column whose type label contains digits, a comma and parentheses."""
    price_column = Column(
        name="price",
        type_="DECIMAL(10,2)",
        sample_values=["19.99", "5.00"],
    )
    products = SchemaTable(name="products", columns=[price_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: products\n",
            " - Column: price\n",
            " - Type: DECIMAL(10,2)\n",
            " - Sample values: 19.99, 5.00\n",
        ]
    )
    codeflash_output = _format_schema_info([products])  # 2.32μs -> 2.71μs (14.6% slower)
def test_column_with_numeric_sample_values():
    """Format a column whose sample values are numeric strings."""
    num_column = Column(name="num", type_="INTEGER", sample_values=["1", "2", "3"])
    numbers = SchemaTable(name="numbers", columns=[num_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: numbers\n",
            " - Column: num\n",
            " - Type: INTEGER\n",
            " - Sample values: 1, 2, 3\n",
        ]
    )
    codeflash_output = _format_schema_info([numbers])  # 2.43μs -> 2.59μs (6.03% slower)
def test_column_with_single_sample_value():
    """Format a column that carries exactly one sample value."""
    only_column = Column(name="only", type_="TEXT", sample_values=["unique"])
    single = SchemaTable(name="single", columns=[only_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: single\n",
            " - Column: only\n",
            " - Type: TEXT\n",
            " - Sample values: unique\n",
        ]
    )
    codeflash_output = _format_schema_info([single])  # 2.15μs -> 2.36μs (9.19% slower)
def test_column_with_empty_string_sample_value():
    """Format a column whose only sample value is the empty string."""
    empty_column = Column(name="empty", type_="TEXT", sample_values=[""])
    empties = SchemaTable(name="empties", columns=[empty_column])
    # Anticipated text; no comparison against it is visible in this test.
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: empties\n",
            " - Column: empty\n",
            " - Type: TEXT\n",
            " - Sample values: \n",
        ]
    )
    codeflash_output = _format_schema_info([empties])  # 2.12μs -> 2.43μs (13.0% slower)
------------------------
Large Scale Test Cases
------------------------
def test_large_number_of_tables_and_columns():
    """Format 50 tables of 20 columns, each column with two sample values."""
    num_tables = 50
    num_columns = 20
    tables = []
    for t in range(num_tables):
        columns = []
        for c in range(num_columns):
            columns.append(
                Column(
                    name=f"col{c}",
                    type_=f"TYPE{c}",
                    sample_values=[f"val{c}_1", f"val{c}_2"],
                )
            )
        tables.append(SchemaTable(name=f"table{t}", columns=columns))
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 411μs -> 383μs (7.21% faster)
    # Placeholder for per-table checks (all table names); body is empty in the source.
    for _table_idx in range(num_tables):
        pass
    # Placeholder for first-table column checks; body is empty in the source.
    for _column_idx in range(num_columns):
        pass
def test_large_number_of_sample_values():
    """Format one table whose single column carries 1000 sample values."""
    samples = []
    for i in range(1000):
        samples.append(f"sample{i}")
    big_column = Column(name="big_col", type_="TEXT", sample_values=samples)
    big_table = SchemaTable(name="big_table", columns=[big_column])
    codeflash_output = _format_schema_info([big_table])
    result = codeflash_output  # 30.5μs -> 28.4μs (7.41% faster)
    # Anticipated text lists every sample in order; no comparison is visible here.
    sample_str = ", ".join(samples)
    expected = "".join(
        [
            "\n\n## Available schema:\n",
            "- Table: big_table\n",
            " - Column: big_col\n",
            " - Type: TEXT\n",
            f" - Sample values: {sample_str}\n",
        ]
    )
def test_large_number_of_tables_empty_columns():
    """Format 100 tables, each with an empty column list."""
    tables = []
    for i in range(100):
        tables.append(SchemaTable(name=f"t{i}", columns=[]))
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 11.9μs -> 10.1μs (18.6% faster)
    # Placeholder for per-table name checks; body is empty in the source.
    for _table_idx in range(100):
        pass
def test_large_mixed_schema():
    """Format 10 tables of 10 columns where only even-indexed columns have samples.

    The ``Column`` stub defined for this test module names its type parameter
    ``type_``, so columns are built with ``type_=``. The previous ``type=``
    keyword did not match that signature and would raise ``TypeError``.
    """
    tables = []
    for t in range(10):
        columns = []
        for c in range(10):
            # Even-indexed columns get two sample values; odd ones get none.
            if c % 2 == 0:
                samples = [f"val{c}_a", f"val{c}b"]
            else:
                samples = []
            columns.append(
                Column(name=f"col{c}", type_=f"TYPE{c}", sample_values=samples)
            )
        tables.append(SchemaTable(name=f"table{t}", columns=columns))
    codeflash_output = _format_schema_info(tables)
    result = codeflash_output  # 33.4μs -> 28.6μs (16.8% faster)
    # Placeholder for presence checks on tables/columns; body is empty in the source.
    for t in range(10):
        for c in range(10):
            if c % 2 == 0:
                pass
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from marimo._server.ai.prompts import _format_schema_info
def test__format_schema_info():
    """Smoke-call the formatter with an empty table list; the result is discarded."""
    empty_tables = []
    _format_schema_info(empty_tables)
🔎 Concolic Coverage Tests and Runtime
codeflash_concolic_bps3n5s8/tmpv9qmmpa6/test_concolic_coverage.py::test__format_schema_info
To edit these changes, run `git checkout codeflash/optimize-_format_schema_info-mhviy9qy` and push.