From 37f4a456cda71c881a072f2b5d9d587dda0181b0 Mon Sep 17 00:00:00 2001
From: ARIAS Camila
Date: Mon, 27 Nov 2023 11:13:06 +0000
Subject: [PATCH] Resolve "Create minidocs from an annotated corpus"

---
 docs/_toc.yml                                 |   1 +
 docs/api/text.md                              |   8 +
 docs/examples/text_segmentation/document.md   | 114 +++++++
 medkit/text/postprocessing/__init__.py        |   2 +
 medkit/text/postprocessing/alignment_utils.py |   4 +-
 .../text/postprocessing/document_splitter.py  | 277 ++++++++++++++++++
 medkit/tools/_save_prov_to_dot.py             |   3 +-
 .../test_attribute_duplicator.py              |  13 +
 .../postprocessing/test_document_splitter.py  | 217 ++++++++++++++
 tests/unit/text/utils/__init__.py             |   0
 10 files changed, 637 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/text_segmentation/document.md
 create mode 100644 medkit/text/postprocessing/document_splitter.py
 create mode 100644 tests/unit/text/postprocessing/test_document_splitter.py
 create mode 100644 tests/unit/text/utils/__init__.py

diff --git a/docs/_toc.yml b/docs/_toc.yml
index 60345f2f..e52b630a 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -27,6 +27,7 @@ parts:
       sections:
       - file: examples/text_segmentation/section
       - file: examples/text_segmentation/syntagma
+      - file: examples/text_segmentation/document
       - file: examples/brat_io
       - file: examples/spacy_io
       - file: examples/custom_text_operation

diff --git a/docs/api/text.md b/docs/api/text.md
index f2952d22..fc9990ce 100644
--- a/docs/api/text.md
+++ b/docs/api/text.md
@@ -90,6 +90,8 @@ coder normalizer
   - Translation operation relying on [HuggingFace transformers](https://huggingface.co/docs/transformers/) models
 * - {mod}`AttributeDuplicator`
   - Propagation of attributes based on annotation spans
+* - {mod}`DocumentSplitter`
+  - A component to divide text documents, using their segments as references
 :::

 ## Pre-processing modules

@@ -554,6 +556,12 @@
 For the moment, you can use this module to:
 - duplicate attributes between segments. For example, you can duplicate an attribute from a sentence to its entities.
 - filter overlapping entities: useful when creating named entity recognition (NER) datasets
+- create mini-documents from a {class}`~.core.text.TextDocument`.
+
+
+```{admonition} Examples
+Creating mini-documents from sections: [document splitter](../examples/text_segmentation/document.md)
+```

 :::{note}
 For more details about public API, refer to {mod}`~.text.postprocessing`.

diff --git a/docs/examples/text_segmentation/document.md b/docs/examples/text_segmentation/document.md
new file mode 100644
index 00000000..788e9db2
--- /dev/null
+++ b/docs/examples/text_segmentation/document.md
@@ -0,0 +1,114 @@
---
jupytext:
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.14.5
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
---

# Document splitter

+++

This tutorial shows how to split a document into mini-documents, using its sections as references.

```{seealso}
We combine several operations, such as the **section tokenizer**, the **regexp matcher** and a **custom operation**. Please see the other examples for more information.
```

+++

## Adding annotations in a document

Let's detect the sections and add some annotations using medkit operations.

```{code-cell} ipython3
# You can download the file used in this example from the source code repository
# !wget https://raw.githubusercontent.com/TeamHeka/medkit/main/docs/data/text/1.txt

from pathlib import Path
from medkit.core.text import TextDocument

doc = TextDocument.from_file(Path("../../data/text/1.txt"))
print(doc.text)
```

**Defining the operations**

```{code-cell} ipython3
from medkit.text.ner import RegexpMatcher, RegexpMatcherRule
from medkit.text.segmentation import SectionTokenizer

# Define a section tokenizer
# The section tokenizer uses a dictionary with keywords to identify sections
section_dict = {
    "patient": ["SUBJECTIF"],
    "traitement": ["MÉDICAMENTS", "PLAN"],
    "allergies": ["ALLERGIES"],
    "examen clinique": ["EXAMEN PHYSIQUE"],
    "diagnostique": ["EVALUATION"],
}
section_tokenizer = SectionTokenizer(section_dict=section_dict)

# Define a NER operation to create 'problem' and 'treatment' entities
regexp_rules = [
    RegexpMatcherRule(regexp=r"\ballergies\b", label="problem"),
    RegexpMatcherRule(regexp=r"\basthme\b", label="problem"),
    RegexpMatcherRule(regexp=r"\ballegra\b", label="treatment", case_sensitive=False),
    RegexpMatcherRule(regexp=r"\bvaporisateurs\b", label="treatment"),
    RegexpMatcherRule(regexp=r"\bloratadine\b", label="treatment", case_sensitive=False),
    RegexpMatcherRule(regexp=r"\bnasonex\b", label="treatment", case_sensitive=False),
]
regexp_matcher = RegexpMatcher(rules=regexp_rules)
```

We can now annotate the document:

```{code-cell} ipython3
# Detect annotations
sections = section_tokenizer.run([doc.raw_segment])
entities = regexp_matcher.run([doc.raw_segment])
# Annotate
for ann in sections + entities:
    doc.anns.add(ann)

print(f"The document contains {len(sections)} sections and {len(entities)} entities\n")
```

## Split the document by sections

Once the document is annotated, we can use the {class}`~medkit.text.postprocessing.DocumentSplitter` operation to create a smaller document from each section.

By default, its `entity_labels`, `attr_labels`, and `relation_labels` parameters are set to `None`, so all annotations are propagated to the resulting documents. To keep only some of them, pass the desired labels explicitly.

```{code-cell} ipython3
from medkit.text.postprocessing import DocumentSplitter

doc_splitter = DocumentSplitter(
    segment_label="section",  # segments to use as references
    entity_labels=["treatment", "problem"],  # entities to include
    attr_labels=[],  # without attributes
    relation_labels=[],  # without relations
)
new_docs = doc_splitter.run([doc])
print(f"The document was divided into {len(new_docs)} documents\n")
```
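Before visualizing them, we can take a quick look at the mini-documents. This is only an illustrative sanity check relying on accessors already used in this tutorial; the `name` metadata key is the section name attached by the section tokenizer.

```{code-cell} ipython3
# Quick look at each mini-document: section name, text length and entity texts
for new_doc in new_docs:
    entities = new_doc.anns.get_entities()
    print(
        f"Section '{new_doc.metadata['name']}':"
        f" {len(new_doc.text)} characters,"
        f" entities={[entity.text for entity in entities]}"
    )
```

Each mini-document contains the entities and attributes of its source segment; below, we visualize the new documents via the displacy utils.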
```{code-cell} ipython3
from spacy import displacy
from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy

options_displacy = dict(colors={"treatment": "#85C1E9", "problem": "#ff6961"})

for new_doc in new_docs:
    print(f"New document from the section called '{new_doc.metadata['name']}'")
    # convert the new document to displacy format
    displacy_data = medkit_doc_to_displacy(new_doc)
    displacy.render(displacy_data, manual=True, style="ent", options=options_displacy)
```

diff --git a/medkit/text/postprocessing/__init__.py b/medkit/text/postprocessing/__init__.py
index 8f5b8759..a9ae16b1 100644
--- a/medkit/text/postprocessing/__init__.py
+++ b/medkit/text/postprocessing/__init__.py
@@ -1,9 +1,11 @@
 __all__ = [
     "AttributeDuplicator",
     "compute_nested_segments",
+    "DocumentSplitter",
     "filter_overlapping_entities",
 ]

 from .alignment_utils import compute_nested_segments
 from .attribute_duplicator import AttributeDuplicator
+from .document_splitter import DocumentSplitter
 from .overlapping import filter_overlapping_entities

diff --git a/medkit/text/postprocessing/alignment_utils.py b/medkit/text/postprocessing/alignment_utils.py
index 14b132e5..3c5735ff 100644
--- a/medkit/text/postprocessing/alignment_utils.py
+++ b/medkit/text/postprocessing/alignment_utils.py
@@ -38,6 +38,7 @@ def compute_nested_segments(
     source_segments: List[Segment], target_segments: List[Segment]
 ) -> List[Tuple[Segment, List[Segment]]]:
     """Return source segments aligned with its nested segments.
+    Only nested segments fully contained in the `source_segments` are returned.

     Parameters
     ----------
@@ -58,8 +59,9 @@
         if not normalized_spans:
             continue

         start, end = normalized_spans[0].start, normalized_spans[-1].end
-        children = [child.data for child in tree.overlap(start, end)]
+        # use 'tree.envelop' to keep only fully contained children
+        children = [child.data for child in tree.envelop(start, end)]
         nested.append((parent, children))

     return nested

diff --git a/medkit/text/postprocessing/document_splitter.py b/medkit/text/postprocessing/document_splitter.py
new file mode 100644
index 00000000..56c86201
--- /dev/null
+++ b/medkit/text/postprocessing/document_splitter.py
@@ -0,0 +1,277 @@
__all__ = ["DocumentSplitter"]
# operation to create mini-documents from the segments of annotated documents
from typing import List, Optional

from medkit.core import Attribute, Operation
from medkit.core.text import (
    Entity,
    ModifiedSpan,
    Relation,
    Segment,
    Span,
    TextDocument,
    TextAnnotation,
    span_utils,
)
from medkit.text.postprocessing.alignment_utils import compute_nested_segments


class DocumentSplitter(Operation):
    """Split text documents using their segments as references.

    The resulting "mini-documents" contain the entities belonging to each
    segment, along with their attributes.

    This operation can be used to create datasets from medkit text documents.
    """

    def __init__(
        self,
        segment_label: str,
        entity_labels: Optional[List[str]] = None,
        attr_labels: Optional[List[str]] = None,
        relation_labels: Optional[List[str]] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the document splitter.

        Parameters
        ----------
        segment_label:
            Label of the segments to use as references for the splitter
        entity_labels:
            Labels of the entities to be included in the mini-documents.
            If None, all entities from the document will be included.
        attr_labels:
            Labels of the attributes to be included in the new annotations.
            If None, all attributes will be included.
        relation_labels:
            Labels of the relations to be included in the mini-documents.
            If None, all relations will be included.
        name:
            Name describing the splitter (defaults to the class name).
        uid:
            Identifier of the operation
        """
        # Pass all arguments to super (remove self)
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.segment_label = segment_label
        self.entity_labels = entity_labels
        self.attr_labels = attr_labels
        self.relation_labels = relation_labels

    def run(self, docs: List[TextDocument]) -> List[TextDocument]:
        """Split documents into mini-documents

        Parameters
        ----------
        docs:
            List of text documents to split

        Returns
        -------
        List[TextDocument]
            List of documents created from the selected segments
        """
        segment_docs = []

        for doc in docs:
            segments = doc.anns.get_segments(label=self.segment_label)

            # filter entities
            entities = (
                doc.anns.get_entities()
                if self.entity_labels is None
                else [
                    ent
                    for label in self.entity_labels
                    for ent in doc.anns.get_entities(label=label)
                ]
            )

            # align segments and entities (only fully contained entities are kept)
            segment_and_entities = compute_nested_segments(segments, entities)

            # filter relations in the document
            relations = (
                doc.anns.get_relations()
                if self.relation_labels is None
                else [
                    rel
                    for label in self.relation_labels
                    for rel in doc.anns.get_relations(label=label)
                ]
            )

            # iterate over all segments and their nested entities
            for segment, nested_entities in segment_and_entities:
                # keep only relations whose source and target are both nested
                entities_uid = {ent.uid for ent in nested_entities}
                nested_relations = [
                    relation
                    for relation in relations
                    if relation.source_id in entities_uid
                    and relation.target_id in entities_uid
                ]
                # create a new document from the segment
                segment_doc = self._create_segment_doc(
                    segment=segment,
                    entities=nested_entities,
                    relations=nested_relations,
                    doc_source=doc,
                )
                segment_docs.append(segment_doc)

        return segment_docs

    def _create_segment_doc(
        self,
        segment: Segment,
        entities: List[Entity],
        relations: List[Relation],
        doc_source: TextDocument,
    ) -> TextDocument:
        """Create a TextDocument from a segment and its entities.

        The original zone of the segment becomes the text of the new document.
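        Entity spans (including the replaced spans of any ModifiedSpan) are
        shifted by the segment's start offset so that they remain valid in the
        new document's text.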

        Parameters
        ----------
        segment:
            Segment to use as reference for the new document
        entities:
            Entities inside the segment
        relations:
            Relations inside the segment
        doc_source:
            Initial document from which the annotations were extracted

        Returns
        -------
        TextDocument
            A new document with the selected entities and relations; its
            metadata is the source document's metadata updated with the
            segment's metadata
        """

        normalized_spans = span_utils.normalize_spans(segment.spans)

        # create the mini-doc with the raw text of the segment
        offset, end_span = normalized_spans[0].start, normalized_spans[-1].end
        metadata = doc_source.metadata.copy()
        metadata.update(segment.metadata)

        segment_doc = TextDocument(
            text=doc_source.text[offset:end_span], metadata=metadata
        )

        # handle provenance
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                segment_doc, self.description, source_data_items=[segment]
            )

        # copy segment attributes
        segment_attrs = self._filter_attrs_from_ann(segment)
        for attr in segment_attrs:
            new_doc_attr = attr.copy()
            segment_doc.attrs.add(new_doc_attr)
            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    new_doc_attr,
                    self.description,
                    source_data_items=[attr],
                )

        # add the selected entities
        uid_mapping = {}
        for ent in entities:
            spans = []
            for span in ent.spans:
                # relocate entity spans using the segment offset
                if isinstance(span, Span):
                    spans.append(Span(span.start - offset, span.end - offset))
                else:
                    replaced_spans = [
                        Span(sp.start - offset, sp.end - offset)
                        for sp in span.replaced_spans
                    ]
                    spans.append(
                        ModifiedSpan(length=span.length, replaced_spans=replaced_spans)
                    )
            # define the new entity
            relocated_ent = Entity(
                text=ent.text,
                label=ent.label,
                spans=spans,
                metadata=ent.metadata.copy(),
            )
            # keep a mapping between old and new uids for the relations
            uid_mapping[ent.uid] = relocated_ent.uid

            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    relocated_ent, self.description, source_data_items=[ent]
                )

            # copy entity attributes
            entity_attrs = self._filter_attrs_from_ann(ent)
            for attr in entity_attrs:
                new_ent_attr = attr.copy()
                relocated_ent.attrs.add(new_ent_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(
                        new_ent_attr,
                        self.description,
                        source_data_items=[attr],
                    )

            # add the entity to the new document
            segment_doc.anns.add(relocated_ent)

        for rel in relations:
            relation = Relation(
                label=rel.label,
                source_id=uid_mapping[rel.source_id],
                target_id=uid_mapping[rel.target_id],
                metadata=rel.metadata.copy(),
            )
            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    relation, self.description, source_data_items=[rel]
                )

            # copy relation attributes
            relation_attrs = self._filter_attrs_from_ann(rel)
            for attr in relation_attrs:
                new_rel_attr = attr.copy()
                relation.attrs.add(new_rel_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(
                        new_rel_attr,
                        self.description,
                        source_data_items=[attr],
                    )

            # add the relation to the new document
            segment_doc.anns.add(relation)

        return segment_doc

    def _filter_attrs_from_ann(self, ann: TextAnnotation) -> List[Attribute]:
        """Filter attributes from an annotation using `attr_labels`"""
        attrs = (
            ann.attrs.get()
            if self.attr_labels is None
            else [
                attr
                for label in self.attr_labels
                for attr in ann.attrs.get(label=label)
            ]
        )
        return attrs
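
In short, the splitter is used like any other medkit operation. Here is a minimal sketch, assuming `doc` is a `TextDocument` whose sections were annotated with the label `"section"`, as in the tutorial above:

```python
# Minimal usage sketch: split an annotated document into one mini-document
# per section; with the default None filters, all annotations are kept.
# Assumes `doc` is a TextDocument with segments labeled "section".
from medkit.text.postprocessing import DocumentSplitter

splitter = DocumentSplitter(segment_label="section")
mini_docs = splitter.run([doc])
for mini_doc in mini_docs:
    print(repr(mini_doc.text[:30]), len(mini_doc.anns.get_entities()))
```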
diff --git a/medkit/tools/_save_prov_to_dot.py b/medkit/tools/_save_prov_to_dot.py
index 6350d8af..0c1e3e5f 100644
--- a/medkit/tools/_save_prov_to_dot.py
+++ b/medkit/tools/_save_prov_to_dot.py
@@ -11,7 +11,7 @@
     IdentifiableDataItemWithAttrs,
     Attribute,
 )
-from medkit.core.text import Segment
+from medkit.core.text import Segment, TextDocument


 def save_prov_to_dot(
@@ -63,6 +63,7 @@ def save_prov_to_dot(
 _DEFAULT_DATA_ITEMS_FORMATTERS = {
     Segment: lambda s: f"{s.label}: {s.text}",
     Attribute: lambda a: f"{a.label}: {a.value}",
+    TextDocument: lambda d: f"doc_text: {d.text}",
 }

diff --git a/tests/unit/text/postprocessing/test_attribute_duplicator.py b/tests/unit/text/postprocessing/test_attribute_duplicator.py
index acb7e92e..73f69169 100644
--- a/tests/unit/text/postprocessing/test_attribute_duplicator.py
+++ b/tests/unit/text/postprocessing/test_attribute_duplicator.py
@@ -50,6 +50,19 @@ def test_compute_nested_segments(doc):
     assert nested[1][1][0].uid == "target_1"


+def test_compute_nested_segments_target_not_fully_contained(doc):
+    # align a syntagma with entities:
+    # syntagme_0 spans 0:37 and the target spans 35:40,
+    # so the target should not be returned as a nested segment
+    source = [doc.anns.get_by_id("syntagme_0")]
+    target = [_extract_segment(doc.raw_segment, [(35, 40)], "disease", uid="target_x")]
+
+    nested = compute_nested_segments(source_segments=source, target_segments=target)
+    assert len(nested) == 1
+    assert len(nested[0][1]) == 0
+    assert nested[0][0].uid == "syntagme_0"
+
+
 def test__create_segments_tree(doc):
     targets = doc.anns.get(label="disease")
     tree = _create_segments_tree(target_segments=targets)

diff --git a/tests/unit/text/postprocessing/test_document_splitter.py b/tests/unit/text/postprocessing/test_document_splitter.py
new file mode 100644
index 00000000..fb8f9a46
--- /dev/null
+++ b/tests/unit/text/postprocessing/test_document_splitter.py
@@ -0,0 +1,217 @@
import pytest

from medkit.core import Attribute, ProvTracer
from medkit.core.text import Entity, ModifiedSpan, Segment, Span, TextDocument, Relation
from medkit.text.postprocessing.document_splitter import DocumentSplitter


@pytest.fixture()
def doc():
    text = "The medkit library. This is a    large      entity"
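    # the 4 spaces at offsets 29:33 and the 6 spaces at offsets 38:44 are the
    # characters that the ModifiedSpans below collapse to a single space each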
    # a normal segment
    segment_1 = Segment(
        label="normal_sentence",
        spans=[Span(0, 18)],
        text="The medkit library",
        attrs=[Attribute(label="segment_attr", value=0)],
        metadata={"sent_id": "001"},
    )
    # a segment with modified spans
    segment_2 = Segment(
        label="modified_sentence",
        spans=[
            Span(start=20, end=29),
            ModifiedSpan(length=1, replaced_spans=[Span(start=29, end=33)]),
            Span(start=33, end=38),
            ModifiedSpan(length=1, replaced_spans=[Span(start=38, end=44)]),
            Span(start=44, end=50),
        ],
        text="This is a large entity",
        metadata={"sent_id": "002"},
    )
    entity_1 = Entity(
        uid="e1",
        label="ORG",
        text="medkit",
        spans=[Span(4, 10)],
        attrs=[Attribute(label="entity_attr", value=0)],
    )

    entity_2 = Entity(
        uid="e2",
        label="ENTITY",
        spans=[
            Span(start=33, end=38),
            ModifiedSpan(length=1, replaced_spans=[Span(start=38, end=44)]),
            Span(start=44, end=50),
        ],
        text="large entity",
    )

    entity_3 = Entity(
        uid="e3",
        label="MISC",
        spans=[Span(20, 24)],
        text="This",
    )
    # relations
    relation_1 = Relation(label="not_related", source_id="e1", target_id="e2")
    relation_2 = Relation(label="related", source_id="e2", target_id="e3")

    anns = [segment_1, segment_2, entity_1, entity_2, entity_3, relation_1, relation_2]
    doc = TextDocument(text=text, anns=anns, metadata={"doc_id": "001"})
    return doc


def test_document_splitter_no_attrs(doc):
    splitter = DocumentSplitter(
        segment_label="normal_sentence",
        entity_labels=["ORG"],
        attr_labels=[],
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert isinstance(new_doc, TextDocument)

    assert new_doc.text == "The medkit library"
    assert new_doc.metadata == {"sent_id": "001", "doc_id": "001"}
    entities_doc = new_doc.anns.get_entities()
    assert len(entities_doc) == 1
    assert entities_doc[0].spans == [Span(4, 10)]


def test_document_splitter_attrs(doc):
    splitter = DocumentSplitter(
        segment_label="normal_sentence",
        entity_labels=["ORG"],
        attr_labels=None,
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert new_doc.attrs.get(label="segment_attr")
    assert len(new_doc.attrs.get(label="entity_attr")) == 0

    entity_0 = new_doc.anns.get(label="ORG")[0]
    assert entity_0.attrs.get(label="entity_attr")
    assert len(entity_0.attrs.get(label="segment_attr")) == 0


def test_with_modified_spans(doc):
    splitter = DocumentSplitter(
        segment_label="modified_sentence",
        entity_labels=["ENTITY"],
        attr_labels=[],
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert isinstance(new_doc, TextDocument)

    # the text of the mini-document is the raw slice of the source text,
    # so the whitespace runs replaced by the ModifiedSpans are preserved
    assert new_doc.text == "This is a    large      entity"
    assert new_doc.metadata == {"sent_id": "002", "doc_id": "001"}

    entities_doc = new_doc.anns.get_entities()
    assert len(entities_doc) == 1

    # spans should be offset
    entity_1 = entities_doc[0]
    assert entity_1.spans == [
        Span(start=13, end=18),
        ModifiedSpan(length=1, replaced_spans=[Span(start=18, end=24)]),
        Span(start=24, end=30),
    ]
    assert entity_1.text == "large entity"


def test_with_relations(doc):
    splitter = DocumentSplitter(
        segment_label="modified_sentence",
        entity_labels=["ENTITY", "MISC"],
        attr_labels=[],
        relation_labels=["related"],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    relations = new_doc.anns.get_relations()
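    # of the two fixture relations, only relation_2 matches the "related"
    # label filter and links two entities nested in the segment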
assert len(relations) == 1 + + relation = relations[0] + entity_1 = new_doc.anns.get(label="ENTITY")[0] + entity_2 = new_doc.anns.get(label="MISC")[0] + assert relation.source_id == entity_1.uid + assert relation.target_id == entity_2.uid + + +def test_prov(doc): + splitter = DocumentSplitter( + segment_label="normal_sentence", + entity_labels=None, # include all entities + attr_labels=None, + relation_labels=[], + ) + prov_tracer = ProvTracer() + splitter.set_prov_tracer(prov_tracer) + new_docs = splitter.run([doc]) + new_doc = new_docs[0] + + sentence_1 = doc.anns.get(label="normal_sentence")[0] + prov_1 = prov_tracer.get_prov(new_doc.uid) + assert prov_1.data_item == new_doc + assert prov_1.op_desc == splitter.description + assert prov_1.source_data_items == [sentence_1] + + # check prov doc attr + segment_attr = sentence_1.attrs.get(label="segment_attr")[0] + doc_attr = new_doc.attrs.get(label="segment_attr")[0] + prov_2 = prov_tracer.get_prov(doc_attr.uid) + assert prov_2.data_item == doc_attr + assert prov_2.op_desc == splitter.description + assert prov_2.source_data_items == [segment_attr] + + entity_1 = doc.anns.get(label="ORG")[0] + entity_1_new_doc = new_doc.anns.get(label="ORG")[0] + prov_3 = prov_tracer.get_prov(entity_1_new_doc.uid) + assert prov_3.data_item == entity_1_new_doc + assert prov_3.op_desc == splitter.description + assert prov_3.source_data_items == [entity_1] + + # check prov entity attr + entity_attr = entity_1.attrs.get(label="entity_attr")[0] + new_entity_attr = entity_1_new_doc.attrs.get(label="entity_attr")[0] + prov_4 = prov_tracer.get_prov(new_entity_attr.uid) + assert prov_4.data_item == new_entity_attr + assert prov_4.op_desc == splitter.description + assert prov_4.source_data_items == [entity_attr] + + +def test_prov_with_relations(doc): + splitter = DocumentSplitter( + segment_label="modified_sentence", + entity_labels=None, # include all entities + attr_labels=["segment_attr"], + relation_labels=None, + ) + + prov_tracer = ProvTracer() + splitter.set_prov_tracer(prov_tracer) + new_docs = splitter.run([doc]) + new_doc = new_docs[0] + + # check provenance in the new relation + relation = doc.anns.get(label="related")[0] + new_relation = new_doc.anns.get(label="related")[0] + prov_1 = prov_tracer.get_prov(new_relation.uid) + assert prov_1.data_item == new_relation + assert prov_1.op_desc == splitter.description + assert prov_1.source_data_items == [relation] diff --git a/tests/unit/text/utils/__init__.py b/tests/unit/text/utils/__init__.py new file mode 100644 index 00000000..e69de29b