From 7708477cef39bb460e84597174a4a1fce2c35538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Hern=C3=A1ndez?= Date: Thu, 31 Aug 2017 15:28:04 +0200 Subject: [PATCH 1/4] fix missing name when heading contains a product name # Conflicts: # chemdataextractor/parse/hrms.py # tests/test_parse_hrms.py --- chemdataextractor/doc/document.py | 2 +- chemdataextractor/parse/hrms.py | 46 +++++++++++++++++++ tests/test_parse_cem.py | 6 +-- tests/test_parse_hrms.py | 75 +++++++++++++++++++++++++++++++ 4 files changed, 125 insertions(+), 4 deletions(-) create mode 100644 chemdataextractor/parse/hrms.py create mode 100644 tests/test_parse_hrms.py diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index 3d72760..bc107a1 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -218,7 +218,7 @@ def records(self): # We have property values but no names or labels... try merge those from previous if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record): # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name) - if head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): + if not last_id_record and head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): if head_def_record: record.names = head_def_record.names record.labels = head_def_record.labels diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py new file mode 100644 index 0000000..51c0ebc --- /dev/null +++ b/chemdataextractor/parse/hrms.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals + +import re + +from .base import BaseParser +from .elements import OneOrMore, R, Optional, ZeroOrMore +from ..model import Compound, HRMS +from ..utils import first +from .actions import merge + +not_separator = '[^\.;,]$' +separator = '[\.;,]' +# number = R('^\d+(\.\d+)?$') +# obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas +chemical_structure_start = (Optional(R('[\(\[]')) + R('^(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE)) +chemical_structure = (OneOrMore(chemical_structure_start + R(not_separator)).hide() + R('([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+')('structure')) +# compound = (R('^\[') + ZeroOrMore(R('\.+')) + R('\]')).add_action(merge)('compound') + +# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical') +# experimental = (Optional(W('found')).hide() + number('mass'))('experimental') +exceptions = (R(u'((^found|^\d+|[\+\-‐‑⁃‒–—―−-⁻])' + separator + '?)$') + Optional(R(separator))).hide() + +hrms = (R('HRMS').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms') + + +class HRMSParser(BaseParser): + """""" + root = hrms + + def __init__(self): + pass + + def interpret(self, result, start, end): + h = HRMS( + chemical_structure=first('./structure/text()'), + ) + c = Compound( + hrms=h + ) + + yield c diff --git a/tests/test_parse_cem.py b/tests/test_parse_cem.py index 3c268b1..da1ddf8 100644 --- a/tests/test_parse_cem.py +++ b/tests/test_parse_cem.py @@ -404,9 +404,9 @@ def test_consecutive_headings2(self): ) results = [r.serialize() for r in d.records] self.assertEqual(results, [ - {'labels': [u'VII'], 'roles': [u'formula']}, - {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], - 'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}]) # example-3? + {'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}, + {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']} + ]) # example-3? if __name__ == '__main__': diff --git a/tests/test_parse_hrms.py b/tests/test_parse_hrms.py new file mode 100644 index 0000000..ba50c5b --- /dev/null +++ b/tests/test_parse_hrms.py @@ -0,0 +1,75 @@ +# -*- coding: utf-8 -*- +""" +test_parse_doi +~~~~~~~~~~~~~~ + +Test DOI parser. + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +from __future__ import unicode_literals +import logging +import unittest + +from lxml import etree + +from chemdataextractor.doc.text import Sentence +from chemdataextractor.parse.hrms import hrms + +logging.basicConfig(level=logging.DEBUG) +log = logging.getLogger(__name__) + + +class TestParseHRMS(unittest.TestCase): + maxDiff = None + + def do_parse(self, input, expected): + s = Sentence(input) + log.debug(s) + log.debug(s.tagged_tokens) + result = next(hrms.scan(s.tagged_tokens))[0] + log.debug(etree.tostring(result, pretty_print=True, encoding='unicode')) + self.assertEqual(expected, etree.tostring(result, encoding='unicode')) + + def test_hrms1(self): + s = 'HRMS (ESI) calcd for C34H28N4OP 539.1995 [M + H]+, found 539.1997.' + output = 'C34H28N4OP' + self.do_parse(s, output) + + def test_hrms2(self): + s = 'HRMS: 184.0767 [M + Na]+.' + output = '' + self.do_parse(s, output) + + def test_hrms3(self): + s = 'HRMS-ESI (m/z): calcd. for C42H52NO9 [M + NH4]+ 714.3637, found 714.3633.' + output = 'C42H52NO9' + self.do_parse(s, output) + + def test_hrms4(self): + s = 'MALDI-HRMS (matrix: HCCA) Calculated for C32H48N4O6: [M + H]+ m/z 585.3607, Found 585.3636.' + output = 'C32H48N4O6' + self.do_parse(s, output) + + def test_hrms5(self): + s = 'HRMS (m/z): 827.6005 [M+Na]+ (calcd. for C48H84O9Na: 827.6013). ' + output = 'C48H84O9Na' + self.do_parse(s, output) + + def test_hrms6(self): + s = 'HRMS [M−H]+ m/z calcd. for C24H32N9+ 446.2781, found 446.2775.' + output = 'C24H32N9+' + self.do_parse(s, output) + + def test_hrms7(self): + s = 'DCI-HRMS: m/z 289.0916 [M+H]+; (Calcd for C12H16O8, 288.0845)' + output = 'C12H16O8' + self.do_parse(s, output) + + def test_hrms8(self): + s = 'ES-HRMS: m/z 115.0393 [M−H]−; (Calcd for C5H7O3, 116.0473).' + output = 'C5H7O3' + self.do_parse(s, output) From 309f4cc2ae46b8ca8c1b63e2de8591e01c683cf3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Hern=C3=A1ndez?= Date: Thu, 7 Sep 2017 14:46:13 +0200 Subject: [PATCH 2/4] fix last_id_record to check name # Conflicts: # chemdataextractor/parse/hrms.py --- chemdataextractor/doc/document.py | 3 ++- chemdataextractor/parse/hrms.py | 5 +++++ tests/test_parse_cem.py | 9 +++++++-- tests/test_parse_hrms.py | 5 +++++ 4 files changed, 19 insertions(+), 3 deletions(-) diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index bc107a1..8f9bd44 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -215,10 +215,11 @@ def records(self): continue else: # print(record.serialize()) + # TODO: check the names and labels, not the whole record # We have property values but no names or labels... try merge those from previous if isinstance(el, Paragraph) and (head_def_record or last_product_record or last_id_record or title_record): # head_def_record from heading takes priority if the heading directly precedes the paragraph ( NOPE: or the last_id_record has no name) - if not last_id_record and head_def_record_i and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): + if last_id_record and not last_id_record.names and head_def_record_i is not None and head_def_record_i + 1 == i: # or (last_id_record and not last_id_record.names)): if head_def_record: record.names = head_def_record.names record.labels = head_def_record.labels diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py index 51c0ebc..b8ffd7c 100644 --- a/chemdataextractor/parse/hrms.py +++ b/chemdataextractor/parse/hrms.py @@ -42,5 +42,10 @@ def interpret(self, result, start, end): c = Compound( hrms=h ) +<<<<<<< HEAD +======= + c = Compound() + c.hrms.append(h) +>>>>>>> 54ed784... fix last_id_record to check name yield c diff --git a/tests/test_parse_cem.py b/tests/test_parse_cem.py index da1ddf8..0805b09 100644 --- a/tests/test_parse_cem.py +++ b/tests/test_parse_cem.py @@ -403,10 +403,15 @@ def test_consecutive_headings2(self): Paragraph('The product had a melting point of 70-75° C. and has structural formula VII.') ) results = [r.serialize() for r in d.records] - self.assertEqual(results, [ + print(results) + print([ {'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}, {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], 'labels': [u'VII'], 'roles': [u'formula']} - ]) # example-3? + ]) + self.assertEqual(results, [ + {'labels': [u'VII'], 'roles': [u'formula']}, + {'melting_points': [{'units': u'\xb0C.', 'value': u'70-75'}], + 'names': [u'5-Bromo-6-pentadecyl-2-hydroxybenzoic acid', u'DBAA'], 'roles': ['product']}]) # example-3? if __name__ == '__main__': diff --git a/tests/test_parse_hrms.py b/tests/test_parse_hrms.py index ba50c5b..4b69429 100644 --- a/tests/test_parse_hrms.py +++ b/tests/test_parse_hrms.py @@ -73,3 +73,8 @@ def test_hrms8(self): s = 'ES-HRMS: m/z 115.0393 [M−H]−; (Calcd for C5H7O3, 116.0473).' output = 'C5H7O3' self.do_parse(s, output) + + def test_hrms11(self): + s = 'HRMS (ESI): calcd. for C13H11BrO3Na+ [M + Na]+ 316.9789, found 316.9785.' + output = 'C13H11BrO3Na+' + self.do_parse(s, output) From cbcb2d31fe39a3a85371a00be5ad5291c1ae169f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Hern=C3=A1ndez?= Date: Fri, 8 Sep 2017 14:03:11 +0200 Subject: [PATCH 3/4] uncompleted test --- chemdataextractor/parse/hrms.py | 51 --------------------- tests/test_extract.py | 11 +++++ tests/test_parse_hrms.py | 80 --------------------------------- 3 files changed, 11 insertions(+), 131 deletions(-) delete mode 100644 chemdataextractor/parse/hrms.py delete mode 100644 tests/test_parse_hrms.py diff --git a/chemdataextractor/parse/hrms.py b/chemdataextractor/parse/hrms.py deleted file mode 100644 index b8ffd7c..0000000 --- a/chemdataextractor/parse/hrms.py +++ /dev/null @@ -1,51 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals - -import re - -from .base import BaseParser -from .elements import OneOrMore, R, Optional, ZeroOrMore -from ..model import Compound, HRMS -from ..utils import first -from .actions import merge - -not_separator = '[^\.;,]$' -separator = '[\.;,]' -# number = R('^\d+(\.\d+)?$') -# obtained from https://stackoverflow.com/questions/23602175/regex-for-parsing-chemical-formulas -chemical_structure_start = (Optional(R('[\(\[]')) + R('^(calcd|calculated)' + separator + '?', flags=re.IGNORECASE) | R('^for' + separator + '?', flags=re.IGNORECASE)) -chemical_structure = (OneOrMore(chemical_structure_start + R(not_separator)).hide() + R('([A-Z][a-z]?\d*|\((?:[^()]*(?:\(.*\))?[^()]*)+\)\d+)+')('structure')) -# compound = (R('^\[') + ZeroOrMore(R('\.+')) + R('\]')).add_action(merge)('compound') - -# theoretical = (Optional(W('calcd') + W('for')).hide() + number('mass') + compound)('theoretical') -# experimental = (Optional(W('found')).hide() + number('mass'))('experimental') -exceptions = (R(u'((^found|^\d+|[\+\-‐‑⁃‒–—―−-⁻])' + separator + '?)$') + Optional(R(separator))).hide() - -hrms = (R('HRMS').hide() + ZeroOrMore(chemical_structure | exceptions | R(not_separator).hide()))('hrms') - - -class HRMSParser(BaseParser): - """""" - root = hrms - - def __init__(self): - pass - - def interpret(self, result, start, end): - h = HRMS( - chemical_structure=first('./structure/text()'), - ) - c = Compound( - hrms=h - ) -<<<<<<< HEAD -======= - c = Compound() - c.hrms.append(h) ->>>>>>> 54ed784... fix last_id_record to check name - - yield c diff --git a/tests/test_extract.py b/tests/test_extract.py index 99c0f4a..cae509a 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -55,6 +55,17 @@ def test_parse_control_character(self): expected = [{'names': ['2,4,6-trinitrotoluene']}] self.assertEqual(expected, d.records.serialize()) + def test_title_parse(self): + """Test heading managed correctly""" + d = Document( + Heading('3.2. Experimental Details'), + Heading('3.2.1. Synthesis of Phosphorus Ylide 5'), + Paragraph('N-Benzyl-2-chloroacetamide (2): Chloroacetamide 2 was prepared following the procedure described in the literature [23]. To a stirred solution of benzylamine (7.8 mL, 70.8 mmol) in toluene (60 mL) under cooling with ice bath, chloroacetyl chloride (4 g, 35.4 mmol) was slowly added. The reaction mixture was stirred vigorously for 1h at room temperature. The solvent was evaporated under vacuum, the crude reaction was dissolved in dichloromethane (100 mL) and washed with water (3 × 50 mL). The organic layer was dried over anhydrous MgSO4, filtered and the solvent evaporated under vacuum. The product was obtained as a white solid (6.30 g, 97%). m.p. 91–92 °C (93–96 °C from literature) [23]; 1H-NMR (CDCl3) δ 4.11 (s, 2H), 4.50 (d, 2H, J = 6.0 Hz), 6.89 (br s, 1H), 7.26–7.36 (m, 5H, Ar-H).'), + Paragraph('1-Benzyl-5-(chloromethyl)-1H-tetrazole (3): Compound 3 was prepared by an analogous method to that described in the literature [24]. PCl5 (7.06 g, 33.9 mmol) was added slowly to a solution of N-benzyl-2-chloroacetamide (5.66 g, 30.8 mmol) in toluene (50 mL) under cooling with ice-water bath. The mixture was stirred at room temperature for 2 h, then NaN3 (3.01 g, 46.3 mmol) was added. The reaction mixture was stirred at room temperature for 30 min, water (0.8 mL) was added dropwise and the whole was refluxed for 5 h. After cooling, the reaction mixture was poured into water and extracted with chloroform. The combined organic layers were washed successively with water, NaOH solution 1M and saturated NaCl solution and dried over anhydrous MgSO4. After removal of the solvent, the crude product was purified by flash chromatography (ethyl acetate/hexane (1:2)) affording the tetrazole 3 as light yellow solid (3.47 g, 54%). m.p. 57–59 °C (from diethyl ether) (62–63 °C from literature) [24]; 1H-NMR (CDCl3) δ (ppm) 4.62 (s, 2H), 5.68 (s, 2H), 7.28–7.30 (m, 2H, Ar-H), 7.39–7.40 (m, 3H, Ar-H).') + ) + + print(d.records.serialize()) + diff --git a/tests/test_parse_hrms.py b/tests/test_parse_hrms.py deleted file mode 100644 index 4b69429..0000000 --- a/tests/test_parse_hrms.py +++ /dev/null @@ -1,80 +0,0 @@ -# -*- coding: utf-8 -*- -""" -test_parse_doi -~~~~~~~~~~~~~~ - -Test DOI parser. - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from __future__ import unicode_literals -import logging -import unittest - -from lxml import etree - -from chemdataextractor.doc.text import Sentence -from chemdataextractor.parse.hrms import hrms - -logging.basicConfig(level=logging.DEBUG) -log = logging.getLogger(__name__) - - -class TestParseHRMS(unittest.TestCase): - maxDiff = None - - def do_parse(self, input, expected): - s = Sentence(input) - log.debug(s) - log.debug(s.tagged_tokens) - result = next(hrms.scan(s.tagged_tokens))[0] - log.debug(etree.tostring(result, pretty_print=True, encoding='unicode')) - self.assertEqual(expected, etree.tostring(result, encoding='unicode')) - - def test_hrms1(self): - s = 'HRMS (ESI) calcd for C34H28N4OP 539.1995 [M + H]+, found 539.1997.' - output = 'C34H28N4OP' - self.do_parse(s, output) - - def test_hrms2(self): - s = 'HRMS: 184.0767 [M + Na]+.' - output = '' - self.do_parse(s, output) - - def test_hrms3(self): - s = 'HRMS-ESI (m/z): calcd. for C42H52NO9 [M + NH4]+ 714.3637, found 714.3633.' - output = 'C42H52NO9' - self.do_parse(s, output) - - def test_hrms4(self): - s = 'MALDI-HRMS (matrix: HCCA) Calculated for C32H48N4O6: [M + H]+ m/z 585.3607, Found 585.3636.' - output = 'C32H48N4O6' - self.do_parse(s, output) - - def test_hrms5(self): - s = 'HRMS (m/z): 827.6005 [M+Na]+ (calcd. for C48H84O9Na: 827.6013). ' - output = 'C48H84O9Na' - self.do_parse(s, output) - - def test_hrms6(self): - s = 'HRMS [M−H]+ m/z calcd. for C24H32N9+ 446.2781, found 446.2775.' - output = 'C24H32N9+' - self.do_parse(s, output) - - def test_hrms7(self): - s = 'DCI-HRMS: m/z 289.0916 [M+H]+; (Calcd for C12H16O8, 288.0845)' - output = 'C12H16O8' - self.do_parse(s, output) - - def test_hrms8(self): - s = 'ES-HRMS: m/z 115.0393 [M−H]−; (Calcd for C5H7O3, 116.0473).' - output = 'C5H7O3' - self.do_parse(s, output) - - def test_hrms11(self): - s = 'HRMS (ESI): calcd. for C13H11BrO3Na+ [M + Na]+ 316.9789, found 316.9785.' - output = 'C13H11BrO3Na+' - self.do_parse(s, output) From 5748fb6ce39f250bb0ae62d1399dc330fb0f1538 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jefferson=20Hern=C3=A1ndez?= Date: Fri, 8 Sep 2017 14:38:20 +0200 Subject: [PATCH 4/4] test added over title error extraction --- chemdataextractor/doc/document.py | 9 +++- tests/test_extract.py | 74 +++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 10 deletions(-) diff --git a/chemdataextractor/doc/document.py b/chemdataextractor/doc/document.py index 8f9bd44..90eadaa 100644 --- a/chemdataextractor/doc/document.py +++ b/chemdataextractor/doc/document.py @@ -185,7 +185,7 @@ def records(self): sent_record = first_sent_records[0] if sent_record.labels or (sent_record.names and len(sent_record.names[0]) > len(el.sentences[0].text) / 2): head_def_record = sent_record - head_def_record_i = i + head_def_record_i = i - 1 # fix error related with cem that contains nmr that sometimes doesn't detect it well for record in el.records: # Keep track of the most recent record with labels @@ -273,6 +273,13 @@ def records(self): record.names.append(name) # Merge records with any shared name/label + temp_record = [] + for record in records: + if len(record.labels) <= 1: + temp_record.append(record) + + records.models = temp_record + len_l = len(records) i = 0 while i < (len_l - 1): diff --git a/tests/test_extract.py b/tests/test_extract.py index cae509a..35e4c6f 100644 --- a/tests/test_extract.py +++ b/tests/test_extract.py @@ -17,24 +17,23 @@ from chemdataextractor import Document from chemdataextractor.doc import Heading, Paragraph - logging.basicConfig(level=logging.DEBUG) log = logging.getLogger(__name__) - unittest.util._MAX_LENGTH = 2000 class TestExtract(unittest.TestCase): - maxDiff = None def test_melting_point_heading_salt(self): """Test extraction of melting point from a heading and paragraphs. Example taken from patent US06840965B2.""" d = Document( Heading('D. Synthesis of 4-Amino-2-(3-thienyl)phenol Hydrochloride'), - Paragraph('3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'), - Paragraph('This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.') + Paragraph( + '3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'), + Paragraph( + 'This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.') ) expected = [ @@ -44,7 +43,9 @@ def test_melting_point_heading_salt(self): {'names': ['carbon']}, {'names': ['hydrogen']}, {'names': ['diethyl ether']}, - {'melting_points': [{'units': '°C', 'value': '130-132'}], 'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'], 'roles': ['product']} + {'melting_points': [{'units': '°C', 'value': '130-132'}], + 'names': ['4-Amino-2-(3-thienyl)phenol Hydrochloride', '4-amino-2-(3-thienyl)phenol hydrochloride'], + 'roles': ['product']} ] self.assertEqual(expected, d.records.serialize()) @@ -63,10 +64,65 @@ def test_title_parse(self): Paragraph('N-Benzyl-2-chloroacetamide (2): Chloroacetamide 2 was prepared following the procedure described in the literature [23]. To a stirred solution of benzylamine (7.8 mL, 70.8 mmol) in toluene (60 mL) under cooling with ice bath, chloroacetyl chloride (4 g, 35.4 mmol) was slowly added. The reaction mixture was stirred vigorously for 1h at room temperature. The solvent was evaporated under vacuum, the crude reaction was dissolved in dichloromethane (100 mL) and washed with water (3 × 50 mL). The organic layer was dried over anhydrous MgSO4, filtered and the solvent evaporated under vacuum. The product was obtained as a white solid (6.30 g, 97%). m.p. 91–92 °C (93–96 °C from literature) [23]; 1H-NMR (CDCl3) δ 4.11 (s, 2H), 4.50 (d, 2H, J = 6.0 Hz), 6.89 (br s, 1H), 7.26–7.36 (m, 5H, Ar-H).'), Paragraph('1-Benzyl-5-(chloromethyl)-1H-tetrazole (3): Compound 3 was prepared by an analogous method to that described in the literature [24]. PCl5 (7.06 g, 33.9 mmol) was added slowly to a solution of N-benzyl-2-chloroacetamide (5.66 g, 30.8 mmol) in toluene (50 mL) under cooling with ice-water bath. The mixture was stirred at room temperature for 2 h, then NaN3 (3.01 g, 46.3 mmol) was added. The reaction mixture was stirred at room temperature for 30 min, water (0.8 mL) was added dropwise and the whole was refluxed for 5 h. After cooling, the reaction mixture was poured into water and extracted with chloroform. The combined organic layers were washed successively with water, NaOH solution 1M and saturated NaCl solution and dried over anhydrous MgSO4. After removal of the solvent, the crude product was purified by flash chromatography (ethyl acetate/hexane (1:2)) affording the tetrazole 3 as light yellow solid (3.47 g, 54%). m.p. 57–59 °C (from diethyl ether) (62–63 °C from literature) [24]; 1H-NMR (CDCl3) δ (ppm) 4.62 (s, 2H), 5.68 (s, 2H), 7.28–7.30 (m, 2H, Ar-H), 7.39–7.40 (m, 3H, Ar-H).') ) + expected = [ + {'roles': ['product'], 'names': ['Phosphorus Ylide 5']}, + {'names': ['Chloroacetamide']}, + {'names': ['benzylamine']}, + {'names': ['chloroacetyl chloride']}, + {'names': ['dichloromethane']}, + {'names': ['1H']}, + {'names': ['PCl5']}, + {'names': ['NaN3']}, + {'names': ['chloroform']}, + {'names': ['NaOH']}, + {'names': ['NaCl']}, + {'names': ['ethyl acetate']}, + {'names': ['hexane']}, + {'names': ['tetrazole']}, + {'names': ['diethyl ether']}, + {'names': ['toluene']}, + {'names': ['MgSO4']}, + {'names': ['1H-NMR']}, + {'names': ['CDCl3']}, + {'names': ['2H']}, + {'names': ['Ar-H']}, + { + 'melting_points': [ + {'units': '°C', 'value': '57–59'} + ], + 'nmr_spectra': [{ + 'peaks': [ + {'shift': '4.62', 'number': '2H', 'multiplicity': 's'}, + {'shift': '5.68', 'number': '2H', 'multiplicity': 's'}, + {'shift': '7.28–7.30', 'number': '2H', 'assignment': 'Ar-H', 'multiplicity': 'm'}, + {'shift': '7.39–7.40', 'number': '3H', 'assignment': 'Ar-H', 'multiplicity': 'm'} + ], + 'solvent': 'CDCl3', + 'nucleus': '1H'} + ], + 'names': ['1-Benzyl-5-(chloromethyl)-1H-tetrazole'], + 'labels': ['3'] + }, + { + 'melting_points': [ + {'units': '°C', 'value': '91–92'} + ], + 'nmr_spectra': [{ + 'peaks': [ + {'shift': '4.11', 'number': '2H', 'multiplicity': 's'}, + {'coupling': '6.0', 'number': '2H', 'shift': '4.50', 'coupling_units': 'Hz', + 'multiplicity': 'd'}, + {'shift': '6.89', 'number': '1H', 'multiplicity': 'br s'}, + {'shift': '7.26–7.36', 'number': '5H', 'assignment': 'Ar-H', 'multiplicity': 'm'} + ], + 'solvent': 'CDCl3', + 'nucleus': '1H'} + ], + 'names': ['N-Benzyl-2-chloroacetamide', 'N-benzyl-2-chloroacetamide'], # even with this two repeated names, the extractor is working ok + 'labels': ['2'] + }] - print(d.records.serialize()) - - + self.assertEqual(expected, d.records.serialize()) if __name__ == '__main__':