diff --git a/scibot/extract.py b/scibot/extract.py index 86b26ff..d380bde 100644 --- a/scibot/extract.py +++ b/scibot/extract.py @@ -329,11 +329,14 @@ def find_rrids(text): regex3 = '(.{0,32})(BDSC|BL|Bl|Bloomington)(\s?)(stock)?(\s)?(#|no|no\.)?(\s?)([0-9]{2,10})([^\w].{0,31})' matches3 = re.findall(regex3, text) for prefix, a, b, c, d, e, f, nums, suffix in matches3: + if nums in ['17', '21']: + # special case to avoid false positives + continue yield prefix, f'RRID:BDSC_{nums.strip()}', f'{a}{b}{c}{d}{e}{f}{nums}', suffix # fourth round for SAMN regex4 = '(.{0,32})(SAMN)(\s?)([0-9]{3,15})([^\w].{0,31})' - matches4 = re.findall(regex3, text) + matches4 = re.findall(regex4, text) for prefix, a, b, nums, suffix in matches4: yield prefix, f'RRID:SAMN{nums.strip()}', f'{a}{b}{nums}', suffix