Skip to content

Commit

Permalink
Restructured batch splitting into new method
Browse files Browse the repository at this point in the history
  • Loading branch information
fexfl committed Dec 26, 2024
1 parent 0c328ae commit 6526c16
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 4 deletions.
16 changes: 12 additions & 4 deletions mailcom/parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,13 +221,21 @@ def pseudonymize_email_addresses(self, sentence):
def concatenate(self, sentences):
    """Join the given sentences into one string, separated by single spaces."""
    separator = " "
    return separator.join(sentences)

def split_batches(self, sentences):
    """Group sentences into consecutive batches.

    The batch size is read from ``self.n_batch_sentences``. A value of -1
    disables splitting: all sentences are returned together as one batch.
    Otherwise every batch holds up to that many sentences, in their
    original order; the final batch may be shorter.

    Returns:
        A list of batches, each batch being a slice of ``sentences``.
    """
    batch_size = self.n_batch_sentences
    if batch_size == -1:
        # Sentinel value: keep everything in a single batch.
        return [sentences]

    chunks = []
    for start in range(0, len(sentences), batch_size):
        stop = start + batch_size
        chunks.append(sentences[start:stop])
    return chunks

def pseudonymize(self, text: str):
self.reset()
sentences = self.get_sentences(text)
batches = [
sentences[n : n + self.n_batch_sentences] # noqa
for n in range(0, len(sentences), self.n_batch_sentences)
]
batches = self.split_batches(sentences)
pseudonymized_batches = []
for batch in batches:
batch = self.concatenate(batch)
Expand Down
57 changes: 57 additions & 0 deletions mailcom/test/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,60 @@ def test_set_sentence_batch_size(get_default_fr):

with pytest.raises(ValueError):
get_default_fr.set_sentence_batch_size(-2)


def test_split_batches_empty_list(get_default_fr):
    """An empty list of sentences produces an empty list of batches."""
    # The fixture's batch size is deliberately left untouched here.
    assert get_default_fr.split_batches([]) == []


def test_split_batches_single_sentence(get_default_fr):
    """One sentence with batch size 1 ends up alone in a single batch."""
    only_sentence = "This is a single sentence."
    get_default_fr.set_sentence_batch_size(1)
    result = get_default_fr.split_batches([only_sentence])
    assert result == [[only_sentence]]


def test_split_batches_multiple_sentences(get_default_fr):
    """Four sentences with batch size 2 are split into two ordered pairs."""
    first = "This is the first sentence."
    second = "This is the second sentence."
    third = "This is the third sentence."
    fourth = "This is the fourth sentence."
    get_default_fr.set_sentence_batch_size(2)
    result = get_default_fr.split_batches([first, second, third, fourth])
    expected = [[first, second], [third, fourth]]
    assert result == expected


def test_split_batches_batch_size_greater_than_sentences(get_default_fr):
    """A batch size larger than the input yields one batch with everything."""
    first = "This is the first sentence."
    second = "This is the second sentence."
    get_default_fr.set_sentence_batch_size(5)
    result = get_default_fr.split_batches([first, second])
    assert result == [[first, second]]


def test_split_batches_batch_size_minus_one(get_default_fr):
    """A batch size of -1 keeps all sentences together in a single batch."""
    originals = (
        "This is the first sentence.",
        "This is the second sentence.",
        "This is the third sentence.",
        "This is the fourth sentence.",
    )
    get_default_fr.set_sentence_batch_size(-1)
    result = get_default_fr.split_batches(list(originals))
    # Compare against a separately built list, not the object passed in.
    expected = [list(originals)]
    assert result == expected

0 comments on commit 6526c16

Please sign in to comment.