@@ -33,22 +33,22 @@ def test_init_with_negative_split_length():

def test_apply_overlap_no_overlap():
    # Test the case where there is no overlap between chunks
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char")
    chunks = ["chunk1", "chunk2", "chunk3"]
    result = splitter._apply_overlap(chunks)
    assert result == ["chunk1", "chunk2", "chunk3"]


def test_apply_overlap_with_overlap():
    # Test the case where there is overlap between chunks
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=4, separators=["."], split_unit="char")
    chunks = ["chunk1", "chunk2", "chunk3"]
    result = splitter._apply_overlap(chunks)
    assert result == ["chunk1", "unk1chunk2", "unk2chunk3"]


def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=6, separators=["."], split_unit="char")
    chunks = ["chunk1", "chunk2", "chunk3", "chunk4"]
    _ = splitter._apply_overlap(chunks)
    assert (
@@ -59,7 +59,7 @@ def test_apply_overlap_with_overlap_capturing_completely_previous_chunk(caplog):

def test_apply_overlap_single_chunk():
    # Test the case where there is only one chunk
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=3, separators=["."], split_unit="char")
    chunks = ["chunk1"]
    result = splitter._apply_overlap(chunks)
    assert result == ["chunk1"]
@@ -74,7 +74,7 @@ def test_chunk_text_smaller_than_chunk_size():


def test_chunk_text_by_period():
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char")
    text = "This is a test. Another sentence. And one more."
    chunks = splitter._chunk_text(text)
    assert len(chunks) == 3
@@ -84,7 +84,7 @@ def test_chunk_text_by_period():


def test_run_multiple_new_lines():
-    splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"])
+    splitter = RecursiveDocumentSplitter(split_length=20, separators=["\n\n", "\n"], split_unit="char")
    text = "This is a test.\n\n\nAnother test.\n\n\n\nFinal test."
    doc = Document(content=text)
    chunks = splitter.run([doc])["documents"]
@@ -110,6 +110,7 @@ def test_run_using_custom_sentence_tokenizer():
    splitter = RecursiveDocumentSplitter(
        split_length=400,
        split_overlap=0,
+        split_unit="char",
        separators=["\n\n", "\n", "sentence", " "],
        sentence_splitter_params={"language": "en", "use_split_rules": True, "keep_white_spaces": False},
    )
@@ -134,8 +135,8 @@ def test_run_using_custom_sentence_tokenizer():
    )  # noqa: E501


-def test_run_split_by_dot_count_page_breaks() -> None:
-    document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0)
+def test_run_split_by_dot_count_page_breaks_split_unit_char() -> None:
+    document_splitter = RecursiveDocumentSplitter(separators=["."], split_length=30, split_overlap=0, split_unit="char")

    text = (
        "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
@@ -181,8 +182,8 @@ def test_run_split_by_dot_count_page_breaks() -> None:
    assert documents[6].meta["split_idx_start"] == text.index(documents[6].content)


-def test_run_split_by_word_count_page_breaks():
-    splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"])
+def test_run_split_by_word_count_page_breaks_split_unit_char():
+    splitter = RecursiveDocumentSplitter(split_length=18, split_overlap=0, separators=["w"], split_unit="char")
    text = "This is some text. \f This text is on another page. \f This is the last pag3."
    doc = Document(content=text)
    doc_chunks = splitter.run([doc])
@@ -216,7 +217,9 @@ def test_run_split_by_word_count_page_breaks():


def test_run_split_by_page_break_count_page_breaks() -> None:
-    document_splitter = RecursiveDocumentSplitter(separators=["\f"], split_length=50, split_overlap=0)
+    document_splitter = RecursiveDocumentSplitter(
+        separators=["\f"], split_length=50, split_overlap=0, split_unit="char"
+    )

    text = (
        "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
@@ -247,8 +250,10 @@ def test_run_split_by_page_break_count_page_breaks() -> None:
    assert chunks_docs[3].meta["split_idx_start"] == text.index(chunks_docs[3].content)


-def test_run_split_by_new_line_count_page_breaks() -> None:
-    document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=21, split_overlap=0)
+def test_run_split_by_new_line_count_page_breaks_split_unit_char() -> None:
+    document_splitter = RecursiveDocumentSplitter(
+        separators=["\n"], split_length=21, split_overlap=0, split_unit="char"
+    )

    text = (
        "Sentence on page 1.\nAnother on page 1.\n\f"
@@ -298,8 +303,10 @@ def test_run_split_by_new_line_count_page_breaks() -> None:
    assert chunks_docs[6].meta["split_idx_start"] == text.index(chunks_docs[6].content)


-def test_run_split_by_sentence_count_page_breaks() -> None:
-    document_splitter = RecursiveDocumentSplitter(separators=["sentence"], split_length=28, split_overlap=0)
+def test_run_split_by_sentence_count_page_breaks_split_unit_char() -> None:
+    document_splitter = RecursiveDocumentSplitter(
+        separators=["sentence"], split_length=28, split_overlap=0, split_unit="char"
+    )

    text = (
        "Sentence on page 1. Another on page 1.\fSentence on page 2. Another on page 2.\f"
@@ -347,7 +354,7 @@ def test_run_split_by_sentence_count_page_breaks() -> None:


def test_run_split_document_with_overlap_character_unit():
-    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "])
+    splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=11, separators=[".", " "], split_unit="char")
    text = """A simple sentence1. A bright sentence2. A clever sentence3. A joyful sentence4"""

    doc = Document(content=text)
@@ -384,18 +391,18 @@ def test_run_split_document_with_overlap_character_unit():


def test_run_separator_exists_but_split_length_too_small_fall_back_to_character_chunking():
-    splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2)
+    splitter = RecursiveDocumentSplitter(separators=[" "], split_length=2, split_unit="char")
    doc = Document(content="This is some text. This is some more text.")
    result = splitter.run(documents=[doc])
    assert len(result["documents"]) == 21
    for doc in result["documents"]:
        assert len(doc.content) == 2


-def test_run_fallback_to_character_chunking():
+def test_run_fallback_to_character_chunking_by_default_length_too_short():
    text = "abczdefzghizjkl"
    separators = ["\n\n", "\n", "z"]
-    splitter = RecursiveDocumentSplitter(split_length=2, separators=separators)
+    splitter = RecursiveDocumentSplitter(split_length=2, separators=separators, split_unit="char")
    doc = Document(content=text)
    chunks = splitter.run([doc])["documents"]
    for chunk in chunks:
@@ -404,7 +411,7 @@ def test_run_fallback_to_character_chunking():

def test_run_custom_sentence_tokenizer_document_and_overlap_char_unit():
    """Test that RecursiveDocumentSplitter works correctly with custom sentence tokenizer and overlap"""
-    splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"])
+    splitter = RecursiveDocumentSplitter(split_length=25, split_overlap=5, separators=["sentence"], split_unit="char")
    text = "This is sentence one. This is sentence two. This is sentence three."

    doc = Document(content=text)
@@ -485,6 +492,10 @@ def test_run_split_by_word_count_page_breaks_word_unit():
    doc_chunks = splitter.run([doc])
    doc_chunks = doc_chunks["documents"]

+    for doc in doc_chunks:
+        print(doc.content)
+        print(doc.meta)
+
    assert len(doc_chunks) == 4
    assert doc_chunks[0].content == "This is some text."
    assert doc_chunks[0].meta["page_number"] == 1
@@ -546,9 +557,7 @@ def test_run_split_by_page_break_count_page_breaks_word_unit() -> None:


def test_run_split_by_new_line_count_page_breaks_word_unit() -> None:
-    document_splitter = RecursiveDocumentSplitter(
-        separators=["\n"], split_length=21, split_overlap=0, split_unit="word"
-    )
+    document_splitter = RecursiveDocumentSplitter(separators=["\n"], split_length=4, split_overlap=0, split_unit="word")

    text = (
        "Sentence on page 1.\nAnother on page 1.\n\f"
@@ -600,7 +609,7 @@ def test_run_split_by_new_line_count_page_breaks_word_unit() -> None:

def test_run_split_by_sentence_count_page_breaks_word_unit() -> None:
    document_splitter = RecursiveDocumentSplitter(
-        separators=["sentence"], split_length=28, split_overlap=0, split_unit="word"
+        separators=["sentence"], split_length=7, split_overlap=0, split_unit="word"
    )

    text = (
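
For readers skimming the diff, here is a minimal standalone sketch of the behaviour these char-unit tests exercise; it is an illustration only, assuming the Haystack 2.x import paths used by this test module rather than anything added in this PR.

from haystack import Document
from haystack.components.preprocessors import RecursiveDocumentSplitter

# With split_unit="char", split_length and split_overlap are counted in
# characters rather than words, so each chunk is expected to stay within
# the 20-character budget (falling back to character chunking if needed).
splitter = RecursiveDocumentSplitter(split_length=20, split_overlap=0, separators=["."], split_unit="char")
doc = Document(content="This is a test. Another sentence. And one more.")
chunks = splitter.run([doc])["documents"]
for chunk in chunks:
    print(len(chunk.content), repr(chunk.content))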