Skip to content

Commit

Permalink
Merge pull request #124 from vmenger/detect-multi-token-names
Browse files Browse the repository at this point in the history
Detect multi token names
  • Loading branch information
vmenger authored Nov 22, 2023
2 parents 0a12310 + 3aa0961 commit 29d6b0b
Show file tree
Hide file tree
Showing 10 changed files with 131 additions and 53 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 2.4.4 (2023-11-22)

### Changed
- multi-token lookup for first and last names, so multi-token names are now detected
- some small lookup list additions

## 2.4.3 (2023-11-22)

### Changed
Expand Down
38 changes: 4 additions & 34 deletions config.json
Original file line number Diff line number Diff line change
Expand Up @@ -128,49 +128,19 @@
}
},
"first_name_lookup": {
"annotator_type": "token_pattern",
"annotator_type": "multi_token",
"group": "names",
"args": {
"tag": "voornaam",
"skip": [],
"pattern": [
{
"and": [
{
"lookup": "first_names"
},
{
"neg_lookup": "first_name_exceptions"
},
{
"neg_lookup": "whitelist"
}
]
}
]
"lookup_values": "first_names"
}
},
"surname_lookup": {
"annotator_type": "token_pattern",
"annotator_type": "multi_token",
"group": "names",
"args": {
"tag": "achternaam",
"skip": [],
"pattern": [
{
"and": [
{
"lookup": "surnames"
},
{
"neg_lookup": "surname_exceptions"
},
{
"neg_lookup": "whitelist"
}
]
}
]
"lookup_values": "surnames"
}
},
"person_first_name": {
Expand Down
5 changes: 5 additions & 0 deletions deduce-data/lookup_lists/medical_terms.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3386,6 +3386,7 @@ lange
langs
langzame
langzamer
lap
laquo
largactil
laryngeus
Expand Down Expand Up @@ -4872,6 +4873,7 @@ population
porfyrie
porphyria
portae
pos
positief
positive
postbus
Expand Down Expand Up @@ -5421,6 +5423,7 @@ scheikunde
scheikundig
scheikundige
schele
schep
scherp
scherpe
scherpstelling
Expand Down Expand Up @@ -5500,6 +5503,7 @@ sensitization
sensorieel
sensorisch
sensory
sep
sepsis
septi
septic
Expand Down Expand Up @@ -5681,6 +5685,7 @@ spondylitis
spondylolyse
spongiforme
spontaan
spoor
spoorelement
sporadic
sporadisch
Expand Down
4 changes: 0 additions & 4 deletions deduce-data/lookup_lists/names/first_names.txt
Original file line number Diff line number Diff line change
Expand Up @@ -854,7 +854,6 @@ Anne-Roos
Anne-Ruth
Anne-Sophie
Anne-Wil
Anne-marie
Annebel
Annebelle
Annebet
Expand Down Expand Up @@ -7134,8 +7133,6 @@ Jo-An
Jo-Ann
Jo-Anna
Jo-Anne
Jo-ann
Jo-anne
Joa
Joab
Joachem
Expand Down Expand Up @@ -8009,7 +8006,6 @@ Kwint
Kwinten
Kwok
Ky-Mani
Ky-mani
Kyan
Kyana
Kyandro
Expand Down
1 change: 1 addition & 0 deletions deduce-data/lookup_lists/names/interfixes.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ in het
l'
la
le
lo
op 't
op de
op den
Expand Down
8 changes: 8 additions & 0 deletions deduce-data/lookup_lists/top_1000_terms.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ bui
buiten
bureau
buren
burger
bus
buurman
buurvrouw
Expand Down Expand Up @@ -184,8 +185,10 @@ drinken
drogen
dromen
droog
droog
druk
dubbel
duits
dun
dus
duur
Expand All @@ -207,6 +210,7 @@ elektrisch
elk
elke
en
engels
enkele
enthousiast
er
Expand Down Expand Up @@ -267,6 +271,7 @@ gevaar
gevaarlijk
gevangenis
geven
geven
gevolg
gewicht
gewoon
Expand Down Expand Up @@ -550,6 +555,7 @@ naam
naar
naast
nacht
nader
nat
natuur
natuurlijk
Expand Down Expand Up @@ -621,6 +627,7 @@ opnemen
oranje
orde
oud
oud
ouder
over
overeenkomen
Expand All @@ -637,6 +644,7 @@ park
partner
pas
passeren
pauw
pen
peper
per
Expand Down
22 changes: 22 additions & 0 deletions deduce/lookup_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,17 @@ def _get_first_names() -> dd.ds.LookupSet:
cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
)

first_name_exceptions = _get_first_name_exceptions()

first_names.remove_items_from_iterable(first_name_exceptions)

first_names.add_items_from_self(
cleaning_pipeline=[
FilterBasedOnLookupSet(filter_set=_get_whitelist(), case_sensitive=False),
],
replace=True,
)

return first_names


Expand Down Expand Up @@ -92,6 +103,17 @@ def _get_surnames() -> dd.ds.LookupSet:
cleaning_pipeline=[dd.str.FilterByLength(min_len=2)],
)

surname_exceptions = _get_surname_exceptions()

surnames.remove_items_from_iterable(surname_exceptions)

surnames.add_items_from_self(
cleaning_pipeline=[
FilterBasedOnLookupSet(filter_set=_get_whitelist(), case_sensitive=False),
],
replace=True,
)

return surnames


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "deduce"
version = "2.4.3"
version = "2.4.4"
description = "Deduce: de-identification method for Dutch medical text"
authors = ["Vincent Menger <[email protected]>"]
maintainers = ["Vincent Menger <[email protected]>"]
Expand Down
97 changes: 84 additions & 13 deletions tests/regression/data/names.json
Original file line number Diff line number Diff line change
Expand Up @@ -843,19 +843,6 @@
}
]
},
{
"id": 94,
"text": "Voornaam-Voornaam Achternaam",
"annotations": [
{
"text": "Voornaam-Voornaam Achternaam",
"start_char": 0,
"end_char": 28,
"tag": "persoon"
}
]
},

{
"id": 95,
"text": "de Heer",
Expand Down Expand Up @@ -1043,6 +1030,90 @@
"tag": "persoon"
}
]
},
{
"id": 117,
"text": "Jan-Willem",
"annotations": [
{
"text": "Jan-Willem",
"start_char": 0,
"end_char": 10,
"tag": "persoon"
}
]
},
{
"id": 118,
"text": "Jan-Onbekendenaam",
"annotations": [
{
"text": "Jan-Onbekendenaam",
"start_char": 0,
"end_char": 17,
"tag": "persoon"
}
]
},
{
"id": 119,
"text": "Onbekendenaam-Willem",
"annotations": [
{
"text": "Onbekendenaam-Willem",
"start_char": 0,
"end_char": 20,
"tag": "persoon"
}
]
},
{
"id": 120,
"text": "El Ahmadi",
"annotations": [
{
"text": "Ahmadi",
"start_char": 3,
"end_char": 9,
"tag": "persoon"
}
]
},
{
"id": 121,
"text": "Bruins Slot",
"annotations": [
{
"text": "Bruins Slot",
"start_char": 0,
"end_char": 11,
"tag": "persoon"
}
]
},
{
"id": 122,
"text": "Groot Wassink",
"annotations": [
{
"text": "Groot Wassink",
"start_char": 0,
"end_char": 13,
"tag": "persoon"
}
]
},
{
"id": 123,
"text": "Pieter Oude Nijhuis",
"annotations": [
{
"text": "Pieter Oude Nijhuis",
"start_char": 0,
"end_char": 19,
"tag": "persoon"
}
]
}
]
}
1 change: 0 additions & 1 deletion tests/regression/test_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ def test_regression_name(self, model):
"name_context",
"person_annotation_converter",
},
known_failures={94},
)

def test_regression_location(self, model):
Expand Down

0 comments on commit 29d6b0b

Please sign in to comment.