⚡️ Speed up function is_fuzzy_match by 11%
#601
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
📄 11% (0.11x) speedup for `is_fuzzy_match` in `marimo/_utils/fuzzy_match.py`
⏱️ Runtime: 120 microseconds → 108 microseconds (best of 250 runs)
📝 Explanation and details
The optimized code achieves an ~11% speedup (120 µs → 108 µs) through three optimizations that reduce string-processing and type-conversion overhead:
Key Optimizations:
1. Early Exit for Exact Case-Sensitive Matches: Added `if query in name: return True` before performing case-insensitive matching. This eliminates unnecessary `.lower()` calls when the query already matches the name exactly as-is.
2. Eliminated Redundant `bool()` Conversion: Replaced `bool(compiled_pattern.search(name))` with `compiled_pattern.search(name) is not None`, avoiding an extra Python-level type conversion since `search()` already returns a truthy/falsy value.
3. Optimized String Lowercasing: When case-insensitive matching is needed, the code now computes `query.lower()` and `name.lower()` only once and stores them in variables, rather than calling `.lower()` inline in the comparison.

Performance Impact by Test Case:
- Exact case-sensitive substring matches such as `is_fuzzy_match("foo", "foobar")` show 40-50% speedups due to the early exit.
- Queries that match only case-insensitively, or do not match at all, still have to take the `.lower()` path after the extra check; in the timings below they are typically 10-45% slower.
- Regex matches are roughly 0-15% faster in the timings below, from dropping the `bool()` conversion.

The optimization is particularly effective for workloads with many exact case-sensitive matches, which appear common based on the test results showing substantial speedups for direct substring matches.
✅ Correctness verification report:
⚙️ Existing Unit Tests and Runtime
_utils/test_fuzzy_match.py::test_is_fuzzy_match_case_insensitive
_utils/test_fuzzy_match.py::test_is_fuzzy_match_with_regex
_utils/test_fuzzy_match.py::test_is_fuzzy_match_without_regex
🌀 Generated Regression Tests and Runtime
import re
# imports
import pytest
from marimo._utils.fuzzy_match import is_fuzzy_match
# unit tests
# ===================
# Basic Test Cases
# ===================
def test_basic_substring_match_case_insensitive():
    """Run plain (non-regex) substring lookups whose query and name differ in case."""
    no_pattern, regex_mode = None, False
    codeflash_output = is_fuzzy_match("foo", "FooBar", no_pattern, regex_mode)  # 766ns -> 845ns (9.35% slower)
    codeflash_output = is_fuzzy_match("BAR", "foobar", no_pattern, regex_mode)  # 338ns -> 435ns (22.3% slower)
    codeflash_output = is_fuzzy_match("oBa", "FOOBAR", no_pattern, regex_mode)  # 208ns -> 295ns (29.5% slower)
    # Query absent from the name; the result is negated before being recorded.
    codeflash_output = not is_fuzzy_match("baz", "foobar", no_pattern, regex_mode)  # 229ns -> 283ns (19.1% slower)
def test_basic_regex_match():
    """Run regex mode with a precompiled ``ba.`` pattern against two names."""
    query = "ba."
    regex = re.compile(r"ba.")
    codeflash_output = is_fuzzy_match(query, "foobar", regex, True)  # 1.14μs -> 1.02μs (11.6% faster)
    codeflash_output = not is_fuzzy_match(query, "foocar", regex, True)  # 416ns -> 394ns (5.58% faster)
def test_basic_regex_match_case_insensitive():
    """Run regex mode with a pattern compiled using ``re.IGNORECASE``."""
    query = "BaR"
    regex = re.compile(r"BaR", re.IGNORECASE)
    codeflash_output = is_fuzzy_match(query, "fooBAR", regex, True)  # 1.27μs -> 1.15μs (9.71% faster)
    codeflash_output = is_fuzzy_match(query, "bar", regex, True)  # 510ns -> 485ns (5.15% faster)
    codeflash_output = not is_fuzzy_match(query, "baz", regex, True)  # 413ns -> 396ns (4.29% faster)
def test_basic_regex_match_with_special_characters():
    """Run regex mode with a pattern built from digit classes and repetition counts."""
    ssn_source = r"\d{3}-\d{2}-\d{4}"
    ssn_regex = re.compile(ssn_source)
    codeflash_output = is_fuzzy_match(ssn_source, "My SSN is 123-45-6789", ssn_regex, True)  # 1.61μs -> 1.56μs (2.88% faster)
    codeflash_output = not is_fuzzy_match(ssn_source, "No SSN here", ssn_regex, True)  # 547ns -> 542ns (0.923% faster)
# ===================
# Edge Test Cases
# ===================
def test_empty_query_and_name():
    """Run non-regex mode over the empty/non-empty combinations of query and name."""
    empty = ""
    # Both sides empty.
    codeflash_output = is_fuzzy_match(empty, empty, None, False)  # 623ns -> 442ns (41.0% faster)
    # Only the query is empty.
    codeflash_output = is_fuzzy_match(empty, "foo", None, False)  # 376ns -> 191ns (96.9% faster)
    # Only the name is empty; the result is negated before being recorded.
    codeflash_output = not is_fuzzy_match("foo", empty, None, False)  # 266ns -> 548ns (51.5% slower)
def test_empty_query_and_name_regex():
    """Run regex mode with an empty pattern against an empty and a non-empty name."""
    empty_regex = re.compile("")
    codeflash_output = is_fuzzy_match("", "", empty_regex, True)  # 945ns -> 943ns (0.212% faster)
    codeflash_output = is_fuzzy_match("", "foobar", empty_regex, True)  # 424ns -> 429ns (1.17% slower)
def test_none_compiled_pattern_with_is_regex_true():
    """Pass ``is_regex=True`` without a compiled pattern.

    The listing's own comment states that this case should fall back to
    substring matching.
    """
    name = "foobar"
    codeflash_output = is_fuzzy_match("foo", name, None, True)  # 746ns -> 481ns (55.1% faster)
    codeflash_output = not is_fuzzy_match("baz", name, None, True)  # 378ns -> 616ns (38.6% slower)
def test_regex_pattern_none_and_is_regex_false():
    """Run the ordinary substring path: no compiled pattern and ``is_regex=False``."""
    name = "foobar"
    codeflash_output = is_fuzzy_match("foo", name, None, False)  # 669ns -> 433ns (54.5% faster)
    codeflash_output = not is_fuzzy_match("baz", name, None, False)  # 333ns -> 539ns (38.2% slower)
def test_query_is_longer_than_name():
    """Use a query that is longer than the name it is searched in (non-regex mode)."""
    query, name = "foobar", "foo"
    codeflash_output = not is_fuzzy_match(query, name, None, False)  # 630ns -> 657ns (4.11% slower)
def test_query_equals_name():
    """Use a query that is identical to the name (non-regex mode)."""
    same = "foo"
    codeflash_output = is_fuzzy_match(same, same, None, False)  # 634ns -> 438ns (44.7% faster)
def test_regex_special_chars_literal_vs_regex():
    """Run the query ``f.o`` first as a literal substring, then as a regex."""
    query = "f.o"
    # is_regex=False: per the listing, the dot is treated as a literal character.
    codeflash_output = is_fuzzy_match(query, "f.o", None, False)  # 658ns -> 469ns (40.3% faster)
    codeflash_output = not is_fuzzy_match(query, "foo", None, False)  # 351ns -> 575ns (39.0% slower)
    # is_regex=True with a compiled pattern: the same text is used as a regex.
    dot_regex = re.compile("f.o")
    codeflash_output = is_fuzzy_match(query, "foo", dot_regex, True)  # 1.11μs -> 1.01μs (9.81% faster)
    codeflash_output = not is_fuzzy_match(query, "faoo", dot_regex, True)  # 352ns -> 340ns (3.53% faster)
def test_unicode_and_non_ascii_characters():
    """Use non-ASCII text in both substring mode and regex mode."""
    # Accented letters in query and name, substring mode.
    codeflash_output = is_fuzzy_match("café", "Le Café du Monde", None, False)  # 1.10μs -> 1.27μs (13.0% slower)
    # Unicode snowman escape, regex mode.
    snowman_source = r"\u2603"
    snowman_regex = re.compile(snowman_source)
    codeflash_output = is_fuzzy_match(snowman_source, "Here is a snowman: \u2603", snowman_regex, True)  # 877ns -> 851ns (3.06% faster)
    codeflash_output = not is_fuzzy_match(snowman_source, "No snowman here", snowman_regex, True)  # 335ns -> 325ns (3.08% faster)
def test_match_at_start_middle_end():
    """Run three substring queries against the same name, ``foobar``."""
    name = "foobar"
    # "foo" is the prefix of the name.
    codeflash_output = is_fuzzy_match("foo", name, None, False)  # 662ns -> 458ns (44.5% faster)
    # "bar" (labelled "match in middle" in the listing).
    codeflash_output = is_fuzzy_match("bar", name, None, False)  # 342ns -> 288ns (18.8% faster)
    # "ar" is the suffix of the name.
    codeflash_output = is_fuzzy_match("ar", name, None, False)  # 306ns -> 205ns (49.3% faster)
def test_match_with_whitespace():
    """Use a query that contains a space, against names with and without one."""
    spaced_query = "foo bar"
    codeflash_output = is_fuzzy_match(spaced_query, "foo bar baz", None, False)  # 730ns -> 491ns (48.7% faster)
    codeflash_output = not is_fuzzy_match(spaced_query, "foobar", None, False)  # 367ns -> 596ns (38.4% slower)
def test_regex_with_multiline():
    """Run regex mode with a ``^bar`` pattern compiled using ``re.MULTILINE``."""
    query = r"^bar"
    line_regex = re.compile(query, re.MULTILINE)
    three_lines = "foo\nbar\nbaz"
    codeflash_output = is_fuzzy_match(query, three_lines, line_regex, True)  # 1.38μs -> 1.24μs (11.0% faster)
    codeflash_output = not is_fuzzy_match(query, "foo\nbaz", line_regex, True)  # 638ns -> 564ns (13.1% faster)
def test_regex_anchors():
    """Run regex mode with a start-anchored and an end-anchored pattern."""
    starts_with_foo = re.compile(r"^foo")
    codeflash_output = is_fuzzy_match(r"^foo", "foobar", starts_with_foo, True)  # 1.11μs -> 1.04μs (6.63% faster)
    codeflash_output = not is_fuzzy_match(r"^foo", "barfoo", starts_with_foo, True)  # 636ns -> 580ns (9.66% faster)
    ends_with_bar = re.compile(r"bar$")
    codeflash_output = is_fuzzy_match(r"bar$", "foobar", ends_with_bar, True)  # 515ns -> 463ns (11.2% faster)
    codeflash_output = not is_fuzzy_match(r"bar$", "barfoo", ends_with_bar, True)  # 367ns -> 346ns (6.07% faster)
# ===================
# Large Scale Test Cases
# ===================
def test_large_name_substring_match():
    """Run substring lookups against a 1002-character name ending in ``xyz``."""
    haystack = "a" * 999 + "xyz"
    codeflash_output = is_fuzzy_match("xyz", haystack, None, False)  # 1.54μs -> 727ns (112% faster)
    codeflash_output = not is_fuzzy_match("xyZz", haystack, None, False)  # 913ns -> 1.48μs (38.5% slower)
def test_large_name_regex_match():
    """Run end-anchored digit patterns against a 1000-character name."""
    haystack = "a" * 995 + "12345"
    five_digits = re.compile(r"\d{5}$")
    codeflash_output = is_fuzzy_match(r"\d{5}$", haystack, five_digits, True)  # 9.65μs -> 9.52μs (1.43% faster)
    six_digits = re.compile(r"\d{6}$")
    codeflash_output = not is_fuzzy_match(r"\d{6}$", haystack, six_digits, True)  # 8.92μs -> 8.88μs (0.451% faster)
def test_large_query_no_match():
    """Use a 1000-character query that differs from the name only in its last character."""
    haystack = "a" * 1000
    needle = "a" * 999 + "b"
    codeflash_output = not is_fuzzy_match(needle, haystack, None, False)  # 2.25μs -> 2.74μs (17.9% slower)
def test_large_scale_multiple_patterns():
    """Run five compiled patterns, one after another, against one long name.

    The ``expected`` list is paired with the patterns but is not compared
    against the results inside this function.
    """
    haystack = "abc" * 333 + "xyz"
    regexes = [
        re.compile("a"),
        re.compile("b"),
        re.compile("c"),
        re.compile("xyz$"),
        re.compile("123"),
    ]
    expected = [True, True, True, True, False]
    for regex, _expected in zip(regexes, expected):
        codeflash_output = is_fuzzy_match(regex.pattern, haystack, regex, True)  # 2.79μs -> 2.75μs (1.42% faster)
def test_performance_large_inputs():
    """Run long substring and regex queries against a 1000-character name."""
    haystack = "ab" * 500  # 1000 chars
    only_b = "b" * 500
    # "b" * 500 never occurs in the alternating name, so the result is negated.
    codeflash_output = not is_fuzzy_match(only_b, haystack, None, False)  # 1.94μs -> 2.30μs (15.5% slower)
    # "ab" * 250 does occur in the name.
    codeflash_output = is_fuzzy_match("ab" * 250, haystack, None, False)  # 1.44μs -> 778ns (85.6% faster)
    # Same name, matched with a counted-repetition regex.
    repeated = re.compile(r"(ab){500}")
    codeflash_output = is_fuzzy_match(r"(ab){500}", haystack, repeated, True)  # 8.13μs -> 7.47μs (8.72% faster)
# ===================
# Mutation Testing Guards
# ===================
def test_mutation_guard_regex_vs_substring():
    """Run the three combinations of ``is_regex`` and ``compiled_pattern`` the listing distinguishes."""
    # is_regex=True without a compiled pattern (listing: falls back to substring).
    codeflash_output = is_fuzzy_match("foo", "foobar", None, True)  # 755ns -> 489ns (54.4% faster)
    # is_regex=False with a compiled pattern supplied (listing: pattern is ignored).
    plain_regex = re.compile("foo")
    codeflash_output = is_fuzzy_match("foo", "foobar", plain_regex, False)  # 360ns -> 264ns (36.4% faster)
    # is_regex=True with a compiled pattern (listing: the regex must be used).
    anchored_regex = re.compile("foo$")
    codeflash_output = is_fuzzy_match("foo$", "barfoo", anchored_regex, True)  # 979ns -> 896ns (9.26% faster)
    codeflash_output = not is_fuzzy_match("foo$", "foobar", anchored_regex, True)  # 475ns -> 480ns (1.04% slower)
def test_mutation_guard_case_insensitivity():
    """Run substring lookups where query and name differ only in letter case."""
    no_pattern, regex_mode = None, False
    codeflash_output = is_fuzzy_match("FOO", "foo", no_pattern, regex_mode)  # 627ns -> 790ns (20.6% slower)
    codeflash_output = is_fuzzy_match("foo", "FOO", no_pattern, regex_mode)  # 310ns -> 373ns (16.9% slower)
    codeflash_output = not is_fuzzy_match("baz", "FOO", no_pattern, regex_mode)  # 241ns -> 271ns (11.1% slower)
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
import re
# imports
import pytest
from marimo._utils.fuzzy_match import is_fuzzy_match
# -------------------------
# Basic Test Cases
# -------------------------
def test_basic_exact_match_substring():
    """Use a same-case query that is a substring of the name (non-regex mode)."""
    query, name = "foo", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 936ns -> 519ns (80.3% faster)
def test_basic_exact_match_case_insensitive():
    """Use an upper-case query against a lower-case name (non-regex mode)."""
    query, name = "FOO", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 741ns -> 872ns (15.0% slower)
def test_basic_no_match_substring():
    """Use a query that does not occur in the name (non-regex mode)."""
    query, name = "baz", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 700ns -> 781ns (10.4% slower)
def test_basic_regex_simple_match():
    """Run regex mode with the literal pattern ``foo``."""
    regex = re.compile(r"foo")
    codeflash_output = is_fuzzy_match("foo", "foobar", regex, True)  # 924ns -> 835ns (10.7% faster)
def test_basic_regex_no_match():
    """Run regex mode with the pattern ``baz``, which does not occur in the name."""
    regex = re.compile(r"baz")
    codeflash_output = is_fuzzy_match("baz", "foobar", regex, True)  # 774ns -> 749ns (3.34% faster)
def test_basic_regex_case_sensitive():
    """Run regex mode with an upper-case pattern compiled without flags."""
    regex = re.compile(r"FOO")
    codeflash_output = is_fuzzy_match("FOO", "foobar", regex, True)  # 713ns -> 698ns (2.15% faster)
def test_basic_regex_case_insensitive():
    """Run regex mode with an upper-case pattern compiled using ``re.IGNORECASE``."""
    regex = re.compile(r"FOO", re.IGNORECASE)
    codeflash_output = is_fuzzy_match("FOO", "foobar", regex, True)  # 1.18μs -> 1.10μs (7.39% faster)
def test_basic_empty_query():
    """Use an empty query against a non-empty name (non-regex mode)."""
    query, name = "", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 723ns -> 439ns (64.7% faster)
def test_basic_empty_name():
    """Use a non-empty query against an empty name (non-regex mode)."""
    query, name = "foo", ""
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 624ns -> 764ns (18.3% slower)
def test_basic_both_empty():
    """Use an empty query against an empty name (non-regex mode)."""
    empty = ""
    codeflash_output = is_fuzzy_match(empty, empty, None, False)  # 623ns -> 438ns (42.2% faster)
# -------------------------
# Edge Test Cases
# -------------------------
def test_edge_query_longer_than_name():
    """Use a query that is longer than the name (non-regex mode)."""
    query, name = "foobar", "foo"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 678ns -> 664ns (2.11% faster)
def test_edge_special_regex_characters_in_query():
    """Pass queries containing regex metacharacters while regex mode is off."""
    no_pattern, regex_mode = None, False
    codeflash_output = is_fuzzy_match("foo.*", "foo.bar", no_pattern, regex_mode)  # 741ns -> 493ns (50.3% faster)
    codeflash_output = is_fuzzy_match("foo.", "foobar", no_pattern, regex_mode)  # 360ns -> 571ns (37.0% slower)
def test_edge_regex_with_special_characters():
    """Run regex mode with the wildcard pattern ``foo.*bar`` against three names."""
    query = "foo.*bar"
    wildcard = re.compile(r"foo.*bar")
    codeflash_output = is_fuzzy_match(query, "foo123bar", wildcard, True)  # 1.34μs -> 1.23μs (9.01% faster)
    codeflash_output = is_fuzzy_match(query, "foobar", wildcard, True)  # 619ns -> 542ns (14.2% faster)
    codeflash_output = is_fuzzy_match(query, "foobaz", wildcard, True)  # 497ns -> 503ns (1.19% slower)
def test_edge_regex_with_anchors():
    """Run regex mode with the fully anchored pattern ``^foo$``."""
    query = "^foo$"
    whole_word = re.compile(r"^foo$")
    codeflash_output = is_fuzzy_match(query, "foo", whole_word, True)  # 1.10μs -> 1.10μs (0.546% faster)
    codeflash_output = is_fuzzy_match(query, "foobar", whole_word, True)  # 716ns -> 644ns (11.2% faster)
def test_edge_regex_empty_compiled_pattern():
    """Pass ``is_regex=True`` with ``compiled_pattern=None``.

    The listing's own comment states that this case should fall back to
    substring matching.
    """
    name = "foobar"
    codeflash_output = is_fuzzy_match("foo", name, None, True)  # 776ns -> 470ns (65.1% faster)
    codeflash_output = is_fuzzy_match("baz", name, None, True)  # 357ns -> 641ns (44.3% slower)
def test_edge_is_regex_false_but_compiled_pattern_present():
    """Supply a compiled pattern while ``is_regex`` is False.

    The listing's own comment states that the pattern should be ignored in
    favour of substring matching.
    """
    unused_regex = re.compile(r"baz")
    name = "foobar"
    codeflash_output = is_fuzzy_match("foo", name, unused_regex, False)  # 638ns -> 444ns (43.7% faster)
    codeflash_output = is_fuzzy_match("baz", name, unused_regex, False)  # 348ns -> 543ns (35.9% slower)
def test_edge_unicode_characters():
    """Use accented characters in substring mode and in regex mode."""
    # Substring mode, query and name differing in case.
    codeflash_output = is_fuzzy_match("café", "Le Café Bleu", None, False)  # 1.03μs -> 1.21μs (14.5% slower)
    codeflash_output = is_fuzzy_match("CAFÉ", "Le café bleu", None, False)  # 382ns -> 473ns (19.2% slower)
    # Regex mode with a case-insensitive compiled pattern.
    cafe_regex = re.compile(r"café", re.IGNORECASE)
    codeflash_output = is_fuzzy_match("café", "Le Café Bleu", cafe_regex, True)  # 1.24μs -> 1.08μs (14.9% faster)
def test_edge_whitespace_handling():
    """Use a query and name that both contain spaces (non-regex mode)."""
    query, name = "foo bar", "foo bar baz"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 768ns -> 500ns (53.6% faster)
    # NOTE(review): as listed, this call has the same arguments as the one
    # above, yet its timing moves in the opposite direction. The original
    # literals may have differed in whitespace that was collapsed when the
    # page was extracted -- confirm against the generated test file.
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 406ns -> 610ns (33.4% slower)
def test_edge_only_whitespace_query():
    """Use a query consisting of a single space (non-regex mode)."""
    blank = " "
    codeflash_output = is_fuzzy_match(blank, "foo bar", None, False)  # 663ns -> 404ns (64.1% faster)
    codeflash_output = is_fuzzy_match(blank, "foobar", None, False)  # 374ns -> 528ns (29.2% slower)
def test_edge_non_ascii_characters():
    """Use the German sharp s in substring mode and in regex mode."""
    word = "straße"
    codeflash_output = is_fuzzy_match("ß", word, None, False)  # 1.16μs -> 447ns (159% faster)
    word_regex = re.compile(r"straße")
    codeflash_output = is_fuzzy_match(word, word, word_regex, True)  # 835ns -> 766ns (9.01% faster)
def test_edge_query_is_substring_of_name_at_end():
    """Use a query that is the trailing part of the name (non-regex mode)."""
    query, name = "bar", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 732ns -> 466ns (57.1% faster)
def test_edge_query_is_substring_of_name_at_start():
    """Use a query that is the leading part of the name (non-regex mode)."""
    query, name = "foo", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 686ns -> 457ns (50.1% faster)
def test_edge_query_is_substring_of_name_in_middle():
    """Use a query that sits in the interior of the name (non-regex mode)."""
    query, name = "oob", "foobar"
    codeflash_output = is_fuzzy_match(query, name, None, False)  # 674ns -> 455ns (48.1% faster)
def test_edge_query_is_entire_name():
    """Use a query equal to the whole name (non-regex mode)."""
    same = "foobar"
    codeflash_output = is_fuzzy_match(same, same, None, False)  # 667ns -> 484ns (37.8% faster)
# -------------------------
# Large Scale Test Cases
# -------------------------
def test_large_scale_long_name_substring():
    """Run short queries against a 1002-character name ending in ``foo``."""
    haystack = "a" * 999 + "foo"
    codeflash_output = is_fuzzy_match("foo", haystack, None, False)  # 1.54μs -> 702ns (119% faster)
    codeflash_output = is_fuzzy_match("bar", haystack, None, False)  # 1.56μs -> 2.72μs (42.8% slower)
def test_large_scale_long_query_and_name():
    """Use a 500-character query that forms the tail of a 999-character name."""
    needle = "b" * 500
    haystack = "a" * 499 + needle
    codeflash_output = is_fuzzy_match(needle, haystack, None, False)  # 2.77μs -> 1.82μs (51.8% faster)
def test_large_scale_regex_match():
    """Run the end-anchored pattern ``foo$`` against a long name ending in ``foo``."""
    tail_regex = re.compile(r"foo$")
    haystack = "a" * 995 + "foo"
    codeflash_output = is_fuzzy_match("foo$", haystack, tail_regex, True)  # 1.41μs -> 1.27μs (10.9% faster)
def test_large_scale_regex_no_match():
    """Run the end-anchored pattern ``bar$`` against a long name ending in ``foo``."""
    tail_regex = re.compile(r"bar$")
    haystack = "a" * 999 + "foo"
    codeflash_output = is_fuzzy_match("bar$", haystack, tail_regex, True)  # 924ns -> 909ns (1.65% faster)
def test_large_scale_multiple_matches():
    """Search a name made of 200 repetitions of ``foobar`` in both modes."""
    haystack = ("foo" + "bar") * 200
    codeflash_output = is_fuzzy_match("foo", haystack, None, False)  # 1.55μs -> 477ns (224% faster)
    foo_regex = re.compile(r"foo")
    codeflash_output = is_fuzzy_match("foo", haystack, foo_regex, True)  # 800ns -> 742ns (7.82% faster)
def test_large_scale_unicode():
    """Search a name made of 250 repetitions of ``café`` in both modes."""
    haystack = "café" * 250
    codeflash_output = is_fuzzy_match("café", haystack, None, False)  # 3.08μs -> 507ns (508% faster)
    cafe_regex = re.compile(r"café")
    codeflash_output = is_fuzzy_match("café", haystack, cafe_regex, True)  # 791ns -> 760ns (4.08% faster)
def test_large_scale_all_different_characters():
    """Run three queries against ten repetitions of the ASCII alphabet."""
    import string

    haystack = string.ascii_letters * 10  # 520 chars
    no_pattern, regex_mode = None, False
    codeflash_output = is_fuzzy_match("abc", haystack, no_pattern, regex_mode)  # 903ns -> 412ns (119% faster)
    codeflash_output = is_fuzzy_match("xyz", haystack, no_pattern, regex_mode)  # 553ns -> 273ns (103% faster)
    # Digits never occur in ascii_letters.
    codeflash_output = is_fuzzy_match("123", haystack, no_pattern, regex_mode)  # 593ns -> 1.11μs (46.5% slower)
def test_large_scale_query_not_found():
    """Use a one-character query that does not occur in a 1000-character name."""
    haystack = "a" * 1000
    codeflash_output = is_fuzzy_match("b", haystack, None, False)  # 1.40μs -> 1.34μs (4.03% faster)
def test_large_scale_empty_query_large_name():
    """Use an empty query against a 1000-character name (non-regex mode)."""
    haystack = "x" * 1000
    codeflash_output = is_fuzzy_match("", haystack, None, False)  # 1.20μs -> 417ns (187% faster)
def test_large_scale_regex_with_many_matches():
    """Run a three-digit pattern against a name built from 300 zero-padded numbers."""
    digits_regex = re.compile(r"\d{3}")
    # 000, 001, ..., 299 concatenated into one string.
    haystack = "".join(str(number).zfill(3) for number in range(300))
    codeflash_output = is_fuzzy_match(r"\d{3}", haystack, digits_regex, True)  # 1.34μs -> 1.18μs (14.2% faster)
codeflash_output is used to check that the output of the original code is the same as that of the optimized code.
#------------------------------------------------
from marimo._utils.fuzzy_match import is_fuzzy_match
def test_is_fuzzy_match():
    """Call with an empty query, a one-character non-ASCII name, no pattern and ``is_regex=True``."""
    query, name = '', 'ć'
    # The return value is intentionally discarded, as in the listing.
    is_fuzzy_match(query, name, None, True)
def test_is_fuzzy_match_2():
    """Call with an empty query and name plus an empty compiled pattern in regex mode."""
    empty_regex = re.compile('')
    # The return value is intentionally discarded, as in the listing.
    is_fuzzy_match('', '', empty_regex, True)
🔎 Concolic Coverage Tests and Runtime
codeflash_concolic_bps3n5s8/tmpje0k7ten/test_concolic_coverage.py::test_is_fuzzy_match
To edit these changes, run `git checkout codeflash/optimize-is_fuzzy_match-mhv9r2s6` and push.