Skip to content

Commit

Permalink
remove short words
Browse files Browse the repository at this point in the history
  • Loading branch information
fabridamicelli committed Apr 1, 2024
1 parent e4cf437 commit fd2e364
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/ficamp/classifier/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,11 @@ def remove_isolated_digits(s: str) -> str:
return " ".join(clean)


def remove_short_words(s: str, *, min_length: int = 2) -> str:
    """Remove words shorter than ``min_length`` characters.

    The input is split on runs of whitespace, so leading/trailing whitespace
    is dropped and the surviving words are re-joined with single spaces.

    Args:
        s: Text to filter.
        min_length: Minimum number of characters a word must have to be
            kept. The default of 2 drops single-character words.

    Returns:
        The kept words joined by single spaces; an empty string when no
        word is long enough.
    """
    return " ".join(word for word in s.split() if len(word) >= min_length)


def preprocess(s: str) -> str:
"Clean up transaction description"
steps = (
Expand All @@ -58,6 +63,7 @@ def preprocess(s: str) -> str:
remove_digits,
remove_punctuation,
remove_isolated_digits,
remove_short_words,
)
out = s
for func in steps:
Expand Down
13 changes: 13 additions & 0 deletions tests/test_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
remove_pipes,
remove_punctuation,
remove_isolated_digits,
remove_short_words,
)


Expand Down Expand Up @@ -82,6 +83,17 @@ def test_remove_isolated_digits(inp, exp):
assert remove_isolated_digits(inp) == exp


@pytest.mark.parametrize(
    ("inp,exp"),
    (
        ("hello a world", "hello world"),
        ("hello aa world", "hello aa world"),
        # Edge cases: empty input, nothing long enough to keep, and
        # irregular whitespace (the result is re-joined with single spaces).
        ("", ""),
        ("a b c", ""),
        ("  hello   a  world ", "hello world"),
    ),
)
def test_remove_short_words(inp, exp):
    """Single-character words are dropped; longer words are kept."""
    assert remove_short_words(inp) == exp


@pytest.mark.parametrize(
("inp,exp"),
(
Expand All @@ -98,6 +110,7 @@ def test_remove_isolated_digits(inp, exp):
("CSID:NL0213324324324 HELLO,world1332", "csid hello"),
("CSID:NL021332432 N26 HELLO,world1332", "csid n26 hello"),
("CSID:NL021332432 4324 HELLO,world1332", "csid hello"),
("CSID:NL021332432 n. HELLO,world1332", "csid hello"),
),
)
def test_preprocess(inp, exp):
Expand Down

0 comments on commit fd2e364

Please sign in to comment.