Skip to content

Commit

Permalink
Merge pull request #117 from vmenger/fix-double-initials
Browse files (browse the repository at this point in the history)
Fix double initials
  • Loading branch information
vmenger authored Nov 15, 2023
2 parents 4f74303 + 380ee2b commit 2b28022
Show file tree
Hide file tree
Showing 5 changed files with 106 additions and 3 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## 2.4.1 (2023-11-15)

### Added
- detection of the multi-letter initials `Ch.`, `Chr.`, `Ph.` and `Th.`

## 2.4.0 (2023-11-15)

### Added
Expand Down
7 changes: 5 additions & 2 deletions deduce/annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,11 @@ def match(cls, pattern_position: dict, **kwargs) -> bool: # pylint: disable=R09
return re.match(value, kwargs.get("token").text) is not None
if func == "is_initial":
return (
len(kwargs.get("token").text) == 1
and kwargs.get("token").text[0].isupper()
(
len(kwargs.get("token").text) == 1
and kwargs.get("token").text[0].isupper()
)
or kwargs.get("token").text in {"Ch", "Chr", "Ph", "Th"}
) == value
if func == "is_initials":
return (
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "deduce"
version = "2.4.0"
version = "2.4.1"
description = "Deduce: de-identification method for Dutch medical text"
authors = ["Vincent Menger <[email protected]>"]
maintainers = ["Vincent Menger <[email protected]>"]
Expand Down
90 changes: 90 additions & 0 deletions tests/regression/data/names.json
Original file line number Diff line number Diff line change
Expand Up @@ -901,6 +901,96 @@
"tag": "persoon"
}
]
},
{
"id": 101,
"text": "Ph. Van der Laan",
"annotations": [
{
"text": "Ph. Van der Laan",
"start_char": 0,
"end_char": 16,
"tag": "persoon"
}
]
},
{
"id": 102,
"text": "A.Th.Chr. Van der Laan",
"annotations": [
{
"text": "A.Th.Chr. Van der Laan",
"start_char": 0,
"end_char": 22,
"tag": "persoon"
}
]
},
{
"id": 103,
"text": "Ah. Van der Laan",
"annotations": [
{
"text": "Van der Laan",
"start_char": 4,
"end_char": 16,
"tag": "persoon"
}
]
},
{
"id": 104,
"text": "J. Th. Bakker",
"annotations": [
{
"text": "J. Th. Bakker",
"start_char": 0,
"end_char": 13,
"tag": "persoon"
}
]
},
{
"id": 105,
"text": "J. Th. A. Bakker",
"annotations": [
{
"text": "J. Th. A. Bakker",
"start_char": 0,
"end_char": 16,
"tag": "persoon"
}
]
},
{
"id": 106,
"text": "Prof. Dr. Th. Bakker",
"annotations": [
{
"text": "Prof. Dr. Th. Bakker",
"start_char": 0,
"end_char": 20,
"tag": "persoon"
}
]
},
{
"id": 107,
"text": "Prof. Dr. Th. Ir. Bakker",
"annotations": [
{
"text": "Prof. Dr. Th",
"start_char": 0,
"end_char": 12,
"tag": "persoon"
},
{
"text": "Ir. Bakker",
"start_char": 14,
"end_char": 24,
"tag": "persoon"
}
]
}
]
}
5 changes: 5 additions & 0 deletions tests/unit/test_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,12 @@ def test_match_is_initial(self):
pattern_position = {"is_initial": True}

assert _PatternPositionMatcher.match(pattern_position, token=token("A"))
assert _PatternPositionMatcher.match(pattern_position, token=token("Ch"))
assert _PatternPositionMatcher.match(pattern_position, token=token("Chr"))
assert _PatternPositionMatcher.match(pattern_position, token=token("Ph"))
assert _PatternPositionMatcher.match(pattern_position, token=token("Th"))
assert not _PatternPositionMatcher.match(pattern_position, token=token("a"))
assert not _PatternPositionMatcher.match(pattern_position, token=token("Ah"))
assert not _PatternPositionMatcher.match(pattern_position, token=token("Abcd"))

def test_match_like_name(self):
Expand Down

0 comments on commit 2b28022

Please sign in to comment.