From 37f4a456cda71c881a072f2b5d9d587dda0181b0 Mon Sep 17 00:00:00 2001
From: ARIAS Camila
Date: Mon, 27 Nov 2023 11:13:06 +0000
Subject: [PATCH] Resolve "Create minidocs from an annotated corpus"

---
 docs/_toc.yml                                 |   1 +
 docs/api/text.md                              |   8 +
 docs/examples/text_segmentation/document.md   | 114 +++++++
 medkit/text/postprocessing/__init__.py        |   2 +
 medkit/text/postprocessing/alignment_utils.py |   4 +-
 .../text/postprocessing/document_splitter.py  | 277 ++++++++++++++++++
 medkit/tools/_save_prov_to_dot.py             |   3 +-
 .../test_attribute_duplicator.py              |  13 +
 .../postprocessing/test_document_splitter.py  | 217 ++++++++++++++
 tests/unit/text/utils/__init__.py             |   0
 10 files changed, 637 insertions(+), 2 deletions(-)
 create mode 100644 docs/examples/text_segmentation/document.md
 create mode 100644 medkit/text/postprocessing/document_splitter.py
 create mode 100644 tests/unit/text/postprocessing/test_document_splitter.py
 create mode 100644 tests/unit/text/utils/__init__.py

diff --git a/docs/_toc.yml b/docs/_toc.yml
index 60345f2f..e52b630a 100644
--- a/docs/_toc.yml
+++ b/docs/_toc.yml
@@ -27,6 +27,7 @@ parts:
       sections:
       - file: examples/text_segmentation/section
       - file: examples/text_segmentation/syntagma
+      - file: examples/text_segmentation/document
       - file: examples/brat_io
       - file: examples/spacy_io
       - file: examples/custom_text_operation

diff --git a/docs/api/text.md b/docs/api/text.md
index f2952d22..fc9990ce 100644
--- a/docs/api/text.md
+++ b/docs/api/text.md
@@ -90,6 +90,8 @@ coder normalizer
   - Translation operation relying on [HuggingFace transformers](https://huggingface.co/docs/transformers/) models
 * - {mod}`AttributeDuplicator`
   - Propagation of attributes based on annotation spans
+* - {mod}`DocumentSplitter`
+  - A component to divide text documents, using their segments as references
 :::

 ## Pre-processing modules

@@ -554,6 +556,12 @@
 For the moment, you can use this module to:
 - duplicate attributes between segments. For example, you can duplicate an attribute from a sentence to its entities.
 - filter overlapping entities: useful when creating named entity recognition (NER) datasets
+- create mini-documents from a {class}`~.core.text.TextDocument`.
+
+
+```{admonition} Examples
+Creating mini-documents from sections: [document splitter](../examples/text_segmentation/document.md)
+```

 :::{note}
 For more details about public API, refer to {mod}`~.text.postprocessing`.

diff --git a/docs/examples/text_segmentation/document.md b/docs/examples/text_segmentation/document.md
new file mode 100644
index 00000000..788e9db2
--- /dev/null
+++ b/docs/examples/text_segmentation/document.md
@@ -0,0 +1,114 @@
---
jupytext:
  text_representation:
    extension: .md
    format_name: myst
    format_version: 0.13
    jupytext_version: 1.14.5
kernelspec:
  display_name: Python 3 (ipykernel)
  language: python
  name: python3
---

# Document splitter

+++

This tutorial shows how to split a document into mini-documents, using its sections as references.

```{seealso}
We combine several operations, such as the **section tokenizer**, the **regexp matcher** and a **custom operation**. Please see the other examples for more information.
```

+++

## Adding annotations in a document

Let's detect the sections and add some annotations using medkit operations.

```{code-cell} ipython3
# You can download the file used in this example from the source code repository
# !wget https://raw.githubusercontent.com/TeamHeka/medkit/main/docs/data/text/1.txt

from pathlib import Path
from medkit.core.text import TextDocument

doc = TextDocument.from_file(Path("../../data/text/1.txt"))
print(doc.text)
```

**Defining the operations**

```{code-cell} ipython3
from medkit.text.ner import RegexpMatcher, RegexpMatcherRule
from medkit.text.segmentation import SectionTokenizer

# Define a section tokenizer
# The section tokenizer uses a dictionary with keywords to identify sections
section_dict = {
    "patient": ["SUBJECTIF"],
    "traitement": ["MÉDICAMENTS", "PLAN"],
    "allergies": ["ALLERGIES"],
    "examen clinique": ["EXAMEN PHYSIQUE"],
    "diagnostique": ["EVALUATION"],
}
section_tokenizer = SectionTokenizer(section_dict=section_dict)

# Define a NER operation to create 'problem' and 'treatment' entities
regexp_rules = [
    RegexpMatcherRule(regexp=r"\ballergies\b", label="problem"),
    RegexpMatcherRule(regexp=r"\basthme\b", label="problem"),
    RegexpMatcherRule(regexp=r"\ballegra\b", label="treatment", case_sensitive=False),
    RegexpMatcherRule(regexp=r"\bvaporisateurs\b", label="treatment"),
    RegexpMatcherRule(regexp=r"\bloratadine\b", label="treatment", case_sensitive=False),
    RegexpMatcherRule(regexp=r"\bnasonex\b", label="treatment", case_sensitive=False),
]
regexp_matcher = RegexpMatcher(rules=regexp_rules)
```

We can now annotate the document:

```{code-cell} ipython3
# Detect annotations
sections = section_tokenizer.run([doc.raw_segment])
entities = regexp_matcher.run([doc.raw_segment])
# Annotate
for ann in sections + entities:
    doc.anns.add(ann)

print(f"The document contains {len(sections)} sections and {len(entities)} entities\n")
```

## Split the document by sections

Once the document is annotated, we can use the {class}`~medkit.text.postprocessing.DocumentSplitter` operation to create a smaller document from each section.

By default, its `entity_labels`, `attr_labels`, and `relation_labels` parameters are set to `None`, so all annotations are propagated to the resulting documents. To keep only some of them, pass the desired labels explicitly.

```{code-cell} ipython3
from medkit.text.postprocessing import DocumentSplitter

doc_splitter = DocumentSplitter(
    segment_label="section",  # segments to use as references
    entity_labels=["treatment", "problem"],  # entities to include
    attr_labels=[],  # without attributes
    relation_labels=[],  # without relations
)
new_docs = doc_splitter.run([doc])
print(f"The document was divided into {len(new_docs)} documents\n")
```
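Before visualizing them, we can take a quick look at the mini-documents. This is only an illustrative sanity check relying on accessors already used in this tutorial; the `name` metadata key is the section name attached by the section tokenizer.

```{code-cell} ipython3
# Quick look at each mini-document: section name, text length and entity texts
for new_doc in new_docs:
    entities = new_doc.anns.get_entities()
    print(
        f"Section '{new_doc.metadata['name']}':"
        f" {len(new_doc.text)} characters,"
        f" entities={[entity.text for entity in entities]}"
    )
```

Each mini-document contains the entities and attributes of its source segment; below, we visualize the new documents via the displacy utils.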
```{code-cell} ipython3
from spacy import displacy
from medkit.text.spacy.displacy_utils import medkit_doc_to_displacy

options_displacy = dict(colors={"treatment": "#85C1E9", "problem": "#ff6961"})

for new_doc in new_docs:
    print(f"New document from the section called '{new_doc.metadata['name']}'")
    # convert the new document to displacy format
    displacy_data = medkit_doc_to_displacy(new_doc)
    displacy.render(displacy_data, manual=True, style="ent", options=options_displacy)
```

diff --git a/medkit/text/postprocessing/__init__.py b/medkit/text/postprocessing/__init__.py
index 8f5b8759..a9ae16b1 100644
--- a/medkit/text/postprocessing/__init__.py
+++ b/medkit/text/postprocessing/__init__.py
@@ -1,9 +1,11 @@
 __all__ = [
     "AttributeDuplicator",
     "compute_nested_segments",
+    "DocumentSplitter",
     "filter_overlapping_entities",
 ]

 from .alignment_utils import compute_nested_segments
 from .attribute_duplicator import AttributeDuplicator
+from .document_splitter import DocumentSplitter
 from .overlapping import filter_overlapping_entities

diff --git a/medkit/text/postprocessing/alignment_utils.py b/medkit/text/postprocessing/alignment_utils.py
index 14b132e5..3c5735ff 100644
--- a/medkit/text/postprocessing/alignment_utils.py
+++ b/medkit/text/postprocessing/alignment_utils.py
@@ -38,6 +38,7 @@ def compute_nested_segments(
     source_segments: List[Segment], target_segments: List[Segment]
 ) -> List[Tuple[Segment, List[Segment]]]:
     """Return source segments aligned with its nested segments.
+    Only nested segments fully contained in the `source_segments` are returned.

     Parameters
     ----------
@@ -58,8 +59,9 @@
         if not normalized_spans:
             continue

         start, end = normalized_spans[0].start, normalized_spans[-1].end
-        children = [child.data for child in tree.overlap(start, end)]
+        # use 'tree.envelop' to keep only fully contained children
+        children = [child.data for child in tree.envelop(start, end)]
         nested.append((parent, children))

     return nested

diff --git a/medkit/text/postprocessing/document_splitter.py b/medkit/text/postprocessing/document_splitter.py
new file mode 100644
index 00000000..56c86201
--- /dev/null
+++ b/medkit/text/postprocessing/document_splitter.py
@@ -0,0 +1,277 @@
__all__ = ["DocumentSplitter"]
# operation to create mini-documents from the segments of annotated documents
from typing import List, Optional

from medkit.core import Attribute, Operation
from medkit.core.text import (
    Entity,
    ModifiedSpan,
    Relation,
    Segment,
    Span,
    TextDocument,
    TextAnnotation,
    span_utils,
)
from medkit.text.postprocessing.alignment_utils import compute_nested_segments


class DocumentSplitter(Operation):
    """Split text documents using their segments as references.

    The resulting "mini-documents" contain the entities belonging to each
    segment, along with their attributes.

    This operation can be used to create datasets from medkit text documents.
    """

    def __init__(
        self,
        segment_label: str,
        entity_labels: Optional[List[str]] = None,
        attr_labels: Optional[List[str]] = None,
        relation_labels: Optional[List[str]] = None,
        name: Optional[str] = None,
        uid: Optional[str] = None,
    ):
        """
        Instantiate the document splitter.

        Parameters
        ----------
        segment_label:
            Label of the segments to use as references for the splitter
        entity_labels:
            Labels of the entities to be included in the mini-documents.
            If None, all entities from the document will be included.
        attr_labels:
            Labels of the attributes to be included in the new annotations.
            If None, all attributes will be included.
        relation_labels:
            Labels of the relations to be included in the mini-documents.
            If None, all relations will be included.
        name:
            Name describing the splitter (defaults to the class name).
        uid:
            Identifier of the operation
        """
        # Pass all arguments to super (remove self)
        init_args = locals()
        init_args.pop("self")
        super().__init__(**init_args)

        self.segment_label = segment_label
        self.entity_labels = entity_labels
        self.attr_labels = attr_labels
        self.relation_labels = relation_labels

    def run(self, docs: List[TextDocument]) -> List[TextDocument]:
        """Split documents into mini-documents

        Parameters
        ----------
        docs:
            List of text documents to split

        Returns
        -------
        List[TextDocument]
            List of documents created from the selected segments
        """
        segment_docs = []

        for doc in docs:
            segments = doc.anns.get_segments(label=self.segment_label)

            # filter entities
            entities = (
                doc.anns.get_entities()
                if self.entity_labels is None
                else [
                    ent
                    for label in self.entity_labels
                    for ent in doc.anns.get_entities(label=label)
                ]
            )

            # align segments and entities (only fully contained entities are kept)
            segment_and_entities = compute_nested_segments(segments, entities)

            # filter relations in the document
            relations = (
                doc.anns.get_relations()
                if self.relation_labels is None
                else [
                    rel
                    for label in self.relation_labels
                    for rel in doc.anns.get_relations(label=label)
                ]
            )

            # iterate over all segments and their nested entities
            for segment, nested_entities in segment_and_entities:
                # keep only relations whose source and target are both nested
                entities_uid = {ent.uid for ent in nested_entities}
                nested_relations = [
                    relation
                    for relation in relations
                    if relation.source_id in entities_uid
                    and relation.target_id in entities_uid
                ]
                # create a new document from the segment
                segment_doc = self._create_segment_doc(
                    segment=segment,
                    entities=nested_entities,
                    relations=nested_relations,
                    doc_source=doc,
                )
                segment_docs.append(segment_doc)

        return segment_docs

    def _create_segment_doc(
        self,
        segment: Segment,
        entities: List[Entity],
        relations: List[Relation],
        doc_source: TextDocument,
    ) -> TextDocument:
        """Create a TextDocument from a segment and its entities.

        The original zone of the segment becomes the text of the new document.
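        Entity spans (including the replaced spans of any ModifiedSpan) are
        shifted by the segment's start offset so that they remain valid in the
        new document's text.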

        Parameters
        ----------
        segment:
            Segment to use as reference for the new document
        entities:
            Entities inside the segment
        relations:
            Relations inside the segment
        doc_source:
            Initial document from which the annotations were extracted

        Returns
        -------
        TextDocument
            A new document with the selected entities and relations; its
            metadata is the source document's metadata updated with the
            segment's metadata
        """

        normalized_spans = span_utils.normalize_spans(segment.spans)

        # create the mini-doc with the raw text of the segment
        offset, end_span = normalized_spans[0].start, normalized_spans[-1].end
        metadata = doc_source.metadata.copy()
        metadata.update(segment.metadata)

        segment_doc = TextDocument(
            text=doc_source.text[offset:end_span], metadata=metadata
        )

        # handle provenance
        if self._prov_tracer is not None:
            self._prov_tracer.add_prov(
                segment_doc, self.description, source_data_items=[segment]
            )

        # copy segment attributes
        segment_attrs = self._filter_attrs_from_ann(segment)
        for attr in segment_attrs:
            new_doc_attr = attr.copy()
            segment_doc.attrs.add(new_doc_attr)
            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    new_doc_attr,
                    self.description,
                    source_data_items=[attr],
                )

        # add the selected entities
        uid_mapping = {}
        for ent in entities:
            spans = []
            for span in ent.spans:
                # relocate entity spans using the segment offset
                if isinstance(span, Span):
                    spans.append(Span(span.start - offset, span.end - offset))
                else:
                    replaced_spans = [
                        Span(sp.start - offset, sp.end - offset)
                        for sp in span.replaced_spans
                    ]
                    spans.append(
                        ModifiedSpan(length=span.length, replaced_spans=replaced_spans)
                    )
            # define the new entity
            relocated_ent = Entity(
                text=ent.text,
                label=ent.label,
                spans=spans,
                metadata=ent.metadata.copy(),
            )
            # keep a mapping between old and new uids for the relations
            uid_mapping[ent.uid] = relocated_ent.uid

            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    relocated_ent, self.description, source_data_items=[ent]
                )

            # copy entity attributes
            entity_attrs = self._filter_attrs_from_ann(ent)
            for attr in entity_attrs:
                new_ent_attr = attr.copy()
                relocated_ent.attrs.add(new_ent_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(
                        new_ent_attr,
                        self.description,
                        source_data_items=[attr],
                    )

            # add the entity to the new document
            segment_doc.anns.add(relocated_ent)

        for rel in relations:
            relation = Relation(
                label=rel.label,
                source_id=uid_mapping[rel.source_id],
                target_id=uid_mapping[rel.target_id],
                metadata=rel.metadata.copy(),
            )
            # handle provenance
            if self._prov_tracer is not None:
                self._prov_tracer.add_prov(
                    relation, self.description, source_data_items=[rel]
                )

            # copy relation attributes
            relation_attrs = self._filter_attrs_from_ann(rel)
            for attr in relation_attrs:
                new_rel_attr = attr.copy()
                relation.attrs.add(new_rel_attr)
                # handle provenance
                if self._prov_tracer is not None:
                    self._prov_tracer.add_prov(
                        new_rel_attr,
                        self.description,
                        source_data_items=[attr],
                    )

            # add the relation to the new document
            segment_doc.anns.add(relation)

        return segment_doc

    def _filter_attrs_from_ann(self, ann: TextAnnotation) -> List[Attribute]:
        """Filter attributes from an annotation using `attr_labels`"""
        attrs = (
            ann.attrs.get()
            if self.attr_labels is None
            else [
                attr
                for label in self.attr_labels
                for attr in ann.attrs.get(label=label)
            ]
        )
        return attrs
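
In short, the splitter is used like any other medkit operation. Here is a minimal sketch, assuming `doc` is a `TextDocument` whose sections were annotated with the label `"section"`, as in the tutorial above:

```python
# Minimal usage sketch: split an annotated document into one mini-document
# per section; with the default None filters, all annotations are kept.
# Assumes `doc` is a TextDocument with segments labeled "section".
from medkit.text.postprocessing import DocumentSplitter

splitter = DocumentSplitter(segment_label="section")
mini_docs = splitter.run([doc])
for mini_doc in mini_docs:
    print(repr(mini_doc.text[:30]), len(mini_doc.anns.get_entities()))
```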
diff --git a/medkit/tools/_save_prov_to_dot.py b/medkit/tools/_save_prov_to_dot.py
index 6350d8af..0c1e3e5f 100644
--- a/medkit/tools/_save_prov_to_dot.py
+++ b/medkit/tools/_save_prov_to_dot.py
@@ -11,7 +11,7 @@
     IdentifiableDataItemWithAttrs,
     Attribute,
 )
-from medkit.core.text import Segment
+from medkit.core.text import Segment, TextDocument


 def save_prov_to_dot(
@@ -63,6 +63,7 @@ def save_prov_to_dot(
 _DEFAULT_DATA_ITEMS_FORMATTERS = {
     Segment: lambda s: f"{s.label}: {s.text}",
     Attribute: lambda a: f"{a.label}: {a.value}",
+    TextDocument: lambda d: f"doc_text: {d.text}",
 }

diff --git a/tests/unit/text/postprocessing/test_attribute_duplicator.py b/tests/unit/text/postprocessing/test_attribute_duplicator.py
index acb7e92e..73f69169 100644
--- a/tests/unit/text/postprocessing/test_attribute_duplicator.py
+++ b/tests/unit/text/postprocessing/test_attribute_duplicator.py
@@ -50,6 +50,19 @@ def test_compute_nested_segments(doc):
     assert nested[1][1][0].uid == "target_1"


+def test_compute_nested_segments_target_not_fully_contained(doc):
+    # align a syntagma with entities:
+    # syntagme_0 spans 0:37 and the target spans 35:40,
+    # so the target should not be returned as a nested segment
+    source = [doc.anns.get_by_id("syntagme_0")]
+    target = [_extract_segment(doc.raw_segment, [(35, 40)], "disease", uid="target_x")]
+
+    nested = compute_nested_segments(source_segments=source, target_segments=target)
+    assert len(nested) == 1
+    assert len(nested[0][1]) == 0
+    assert nested[0][0].uid == "syntagme_0"
+
+
 def test__create_segments_tree(doc):
     targets = doc.anns.get(label="disease")
     tree = _create_segments_tree(target_segments=targets)

diff --git a/tests/unit/text/postprocessing/test_document_splitter.py b/tests/unit/text/postprocessing/test_document_splitter.py
new file mode 100644
index 00000000..fb8f9a46
--- /dev/null
+++ b/tests/unit/text/postprocessing/test_document_splitter.py
@@ -0,0 +1,217 @@
import pytest

from medkit.core import Attribute, ProvTracer
from medkit.core.text import Entity, ModifiedSpan, Segment, Span, TextDocument, Relation
from medkit.text.postprocessing.document_splitter import DocumentSplitter


@pytest.fixture()
def doc():
    text = "The medkit library. This is a    large      entity"
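    # the 4 spaces at offsets 29:33 and the 6 spaces at offsets 38:44 are the
    # characters that the ModifiedSpans below collapse to a single space each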
    # a normal segment
    segment_1 = Segment(
        label="normal_sentence",
        spans=[Span(0, 18)],
        text="The medkit library",
        attrs=[Attribute(label="segment_attr", value=0)],
        metadata={"sent_id": "001"},
    )
    # a segment with modified spans
    segment_2 = Segment(
        label="modified_sentence",
        spans=[
            Span(start=20, end=29),
            ModifiedSpan(length=1, replaced_spans=[Span(start=29, end=33)]),
            Span(start=33, end=38),
            ModifiedSpan(length=1, replaced_spans=[Span(start=38, end=44)]),
            Span(start=44, end=50),
        ],
        text="This is a large entity",
        metadata={"sent_id": "002"},
    )
    entity_1 = Entity(
        uid="e1",
        label="ORG",
        text="medkit",
        spans=[Span(4, 10)],
        attrs=[Attribute(label="entity_attr", value=0)],
    )

    entity_2 = Entity(
        uid="e2",
        label="ENTITY",
        spans=[
            Span(start=33, end=38),
            ModifiedSpan(length=1, replaced_spans=[Span(start=38, end=44)]),
            Span(start=44, end=50),
        ],
        text="large entity",
    )

    entity_3 = Entity(
        uid="e3",
        label="MISC",
        spans=[Span(20, 24)],
        text="This",
    )
    # relations
    relation_1 = Relation(label="not_related", source_id="e1", target_id="e2")
    relation_2 = Relation(label="related", source_id="e2", target_id="e3")

    anns = [segment_1, segment_2, entity_1, entity_2, entity_3, relation_1, relation_2]
    doc = TextDocument(text=text, anns=anns, metadata={"doc_id": "001"})
    return doc


def test_document_splitter_no_attrs(doc):
    splitter = DocumentSplitter(
        segment_label="normal_sentence",
        entity_labels=["ORG"],
        attr_labels=[],
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert isinstance(new_doc, TextDocument)

    assert new_doc.text == "The medkit library"
    assert new_doc.metadata == {"sent_id": "001", "doc_id": "001"}
    entities_doc = new_doc.anns.get_entities()
    assert len(entities_doc) == 1
    assert entities_doc[0].spans == [Span(4, 10)]


def test_document_splitter_attrs(doc):
    splitter = DocumentSplitter(
        segment_label="normal_sentence",
        entity_labels=["ORG"],
        attr_labels=None,
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert new_doc.attrs.get(label="segment_attr")
    assert len(new_doc.attrs.get(label="entity_attr")) == 0

    entity_0 = new_doc.anns.get(label="ORG")[0]
    assert entity_0.attrs.get(label="entity_attr")
    assert len(entity_0.attrs.get(label="segment_attr")) == 0


def test_with_modified_spans(doc):
    splitter = DocumentSplitter(
        segment_label="modified_sentence",
        entity_labels=["ENTITY"],
        attr_labels=[],
        relation_labels=[],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    assert isinstance(new_doc, TextDocument)

    # the text of the mini-document is the raw slice of the source text,
    # so the whitespace runs replaced by the ModifiedSpans are preserved
    assert new_doc.text == "This is a    large      entity"
    assert new_doc.metadata == {"sent_id": "002", "doc_id": "001"}

    entities_doc = new_doc.anns.get_entities()
    assert len(entities_doc) == 1

    # spans should be offset
    entity_1 = entities_doc[0]
    assert entity_1.spans == [
        Span(start=13, end=18),
        ModifiedSpan(length=1, replaced_spans=[Span(start=18, end=24)]),
        Span(start=24, end=30),
    ]
    assert entity_1.text == "large entity"


def test_with_relations(doc):
    splitter = DocumentSplitter(
        segment_label="modified_sentence",
        entity_labels=["ENTITY", "MISC"],
        attr_labels=[],
        relation_labels=["related"],
    )
    new_docs = splitter.run([doc])
    assert len(new_docs) == 1

    new_doc = new_docs[0]
    relations = new_doc.anns.get_relations()
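    # of the two fixture relations, only relation_2 matches the "related"
    # label filter and links two entities nested in the segment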
assert len(relations) == 1 + + relation = relations[0] + entity_1 = new_doc.anns.get(label="ENTITY")[0] + entity_2 = new_doc.anns.get(label="MISC")[0] + assert relation.source_id == entity_1.uid + assert relation.target_id == entity_2.uid + + +def test_prov(doc): + splitter = DocumentSplitter( + segment_label="normal_sentence", + entity_labels=None, # include all entities + attr_labels=None, + relation_labels=[], + ) + prov_tracer = ProvTracer() + splitter.set_prov_tracer(prov_tracer) + new_docs = splitter.run([doc]) + new_doc = new_docs[0] + + sentence_1 = doc.anns.get(label="normal_sentence")[0] + prov_1 = prov_tracer.get_prov(new_doc.uid) + assert prov_1.data_item == new_doc + assert prov_1.op_desc == splitter.description + assert prov_1.source_data_items == [sentence_1] + + # check prov doc attr + segment_attr = sentence_1.attrs.get(label="segment_attr")[0] + doc_attr = new_doc.attrs.get(label="segment_attr")[0] + prov_2 = prov_tracer.get_prov(doc_attr.uid) + assert prov_2.data_item == doc_attr + assert prov_2.op_desc == splitter.description + assert prov_2.source_data_items == [segment_attr] + + entity_1 = doc.anns.get(label="ORG")[0] + entity_1_new_doc = new_doc.anns.get(label="ORG")[0] + prov_3 = prov_tracer.get_prov(entity_1_new_doc.uid) + assert prov_3.data_item == entity_1_new_doc + assert prov_3.op_desc == splitter.description + assert prov_3.source_data_items == [entity_1] + + # check prov entity attr + entity_attr = entity_1.attrs.get(label="entity_attr")[0] + new_entity_attr = entity_1_new_doc.attrs.get(label="entity_attr")[0] + prov_4 = prov_tracer.get_prov(new_entity_attr.uid) + assert prov_4.data_item == new_entity_attr + assert prov_4.op_desc == splitter.description + assert prov_4.source_data_items == [entity_attr] + + +def test_prov_with_relations(doc): + splitter = DocumentSplitter( + segment_label="modified_sentence", + entity_labels=None, # include all entities + attr_labels=["segment_attr"], + relation_labels=None, + ) + + prov_tracer = ProvTracer() + splitter.set_prov_tracer(prov_tracer) + new_docs = splitter.run([doc]) + new_doc = new_docs[0] + + # check provenance in the new relation + relation = doc.anns.get(label="related")[0] + new_relation = new_doc.anns.get(label="related")[0] + prov_1 = prov_tracer.get_prov(new_relation.uid) + assert prov_1.data_item == new_relation + assert prov_1.op_desc == splitter.description + assert prov_1.source_data_items == [relation] diff --git a/tests/unit/text/utils/__init__.py b/tests/unit/text/utils/__init__.py new file mode 100644 index 00000000..e69de29b