Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion chemdataextractor/doc/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
from ..parse.mp import MpParser
from ..parse.tg import TgParser
from ..parse.nmr import NmrParser
from ..parse.doi import DoiParser
from ..parse.uvvis import UvvisParser
from ..nlp.lexicon import ChemLexicon
from ..nlp.cem import CemTagger, IGNORE_PREFIX, IGNORE_SUFFIX, SPECIALS, SPLITS
Expand Down Expand Up @@ -266,7 +267,8 @@ def _repr_html_(self):

class Paragraph(Text):

parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(), ContextParser()]
parsers = [CompoundParser(), ChemicalLabelParser(), NmrParser(), IrParser(), UvvisParser(), MpParser(), TgParser(),
ContextParser(), DoiParser()]

def _repr_html_(self):
return '<p class="cde-paragraph">' + self.text + '</p>'
Expand Down Expand Up @@ -510,6 +512,7 @@ def records(self):
tagged_tokens = [(CONTROL_RE.sub('', token), tag) for token, tag in self.tagged_tokens]
for parser in self.parsers:
for record in parser.parse(tagged_tokens):
# print(record)
p = record.serialize()
if not p: # TODO: Potential performance issues?
continue
Expand Down
10 changes: 4 additions & 6 deletions chemdataextractor/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,10 @@

from .utils import python_2_unicode_compatible


log = logging.getLogger(__name__)


class BaseType(six.with_metaclass(ABCMeta)):

# This is assigned by ModelMeta to match the attribute on the Model
name = None

Expand Down Expand Up @@ -90,7 +88,6 @@ def process(self, value):


class ModelType(BaseType):

def __init__(self, model, **kwargs):
self.model_class = model
self.model_name = self.model_class.__name__
Expand All @@ -102,7 +99,6 @@ def serialize(self, value, primitive=False):


class ListType(BaseType):

def __init__(self, field, default=None, **kwargs):
super(ListType, self).__init__(**kwargs)
self.field = field
Expand Down Expand Up @@ -394,6 +390,7 @@ class GlassTransition(BaseModel):
concentration = StringType(contextual=True)
concentration_units = StringType(contextual=True)


class QuantumYield(BaseModel):
"""A quantum yield measurement."""
value = StringType()
Expand Down Expand Up @@ -439,6 +436,7 @@ class Compound(BaseModel):
names = ListType(StringType())
labels = ListType(StringType())
roles = ListType(StringType())
doi = ListType(StringType())
nmr_spectra = ListType(ModelType(NmrSpectrum))
ir_spectra = ListType(ModelType(IrSpectrum))
uvvis_spectra = ListType(ModelType(UvvisSpectrum))
Expand Down Expand Up @@ -502,8 +500,8 @@ def is_unidentified(self):
def is_id_only(self):
"""Return True if identifier information only."""
for key, value in self.items():
if key not in {'names', 'labels', 'roles'} and value:
if key not in {'names', 'labels', 'roles', 'doi'} and value:
return False
if self.names or self.labels:
if self.names or self.labels or self.doi:
return True
return False
30 changes: 30 additions & 0 deletions chemdataextractor/parse/doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

from .base import BaseParser
from .elements import W, R, Optional
from ..model import StringType, Compound
from .actions import merge


# Grammar for a DOI reference such as "doi:10.1021/jo101758t".
#
# The leading "doi" marker and the optional ":" token are matched but hidden,
# so the merged result tagged 'doi' contains only the identifier itself:
# the "10.NNNN" registrant prefix, the "/" separator and the suffix.
# The suffix is any run of non-whitespace characters other than " & ' < >.
#
# All patterns are raw strings so that regex escapes such as \S reach the
# regex engine untouched instead of being read as (invalid) Python string
# escapes, which emits a warning on modern interpreters.
#
# NOTE(review): confirm whether R anchors its pattern to the whole token; if it
# only searches, the "doi" marker could also match inside a longer token.
doi = ((R(r'[Dd][Oo][Ii]') + Optional(W(':'))).hide() +
       R(r'10[.][0-9]{4,}(?:[.][0-9]+)*') +
       W('/') +
       R(r'(?:(?!["&\'<>])\S)+')).add_action(merge)('doi')


class DoiParser(BaseParser):
    """Parser that turns each match of the ``doi`` grammar into a Compound record."""

    # Grammar element this parser is rooted at.
    root = doi

    def __init__(self):
        """Create the parser; no state is set up and the parent initialiser is not called."""

    def interpret(self, result, start, end):
        """Yield one Compound carrying the DOI text found in ``result``.

        :param result: Parse result for a ``doi`` match; its direct text nodes
            are read with the XPath ``./text()``.
        :param start: Unused here.
        :param end: Unused here.
        :returns: Generator producing a single ``Compound`` whose ``doi`` field
            is the list of text nodes of ``result``.
        """
        doi_values = result.xpath('./text()')
        yield Compound(doi=doi_values)
49 changes: 49 additions & 0 deletions tests/test_parse_doi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# -*- coding: utf-8 -*-
"""
test_parse_doi
~~~~~~~~~~~~~~

Test DOI parser.

"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import logging
import unittest

from lxml import etree

from chemdataextractor.doc.text import Sentence
from chemdataextractor.parse.doi import doi

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)


class TestParseDOI(unittest.TestCase):
    """Checks that the ``doi`` grammar extracts DOIs from tokenised sentences."""

    # Show full diffs when an assertion fails.
    maxDiff = None

    def do_parse(self, input, expected):
        """Scan ``input`` with the ``doi`` grammar and compare the first match to ``expected``.

        :param input: Sentence text to tokenise and scan.
        :param expected: Expected serialised XML of the first match.
        """
        sentence = Sentence(input)
        log.debug(sentence)
        log.debug(sentence.tagged_tokens)
        # Only the first match is checked; its first item is the result element.
        first_match = next(doi.scan(sentence.tagged_tokens))
        element = first_match[0]
        log.debug(etree.tostring(element, pretty_print=True, encoding='unicode'))
        serialized = etree.tostring(element, encoding='unicode')
        self.assertEqual(expected, serialized)

    def test_doi1(self):
        """DOIs are found with upper/lower-case markers, before trailing text and mid-sentence."""
        cases = (
            ('DOI:10.1021/jo101758t',
             '<doi>10.1021/jo101758t</doi>'),
            ('doi:10.3390/molecules201219848\n hello world',
             '<doi>10.3390/molecules201219848</doi>'),
            ('Molecules 2015, 20(12), 22272-22285; doi:10.3390/molecules201219846',
             '<doi>10.3390/molecules201219846</doi>'),
        )
        for text, xml in cases:
            self.do_parse(text, xml)
2 changes: 1 addition & 1 deletion tests/test_parse_nmr.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def do_parse(self, input, expected):
log.debug(s.tagged_tokens)
result = next(nmr.scan(s.tagged_tokens))[0]
log.debug(etree.tostring(result, pretty_print=True, encoding='unicode'))
self.assertEqual(expected, etree.tostring(result, encoding='unicode'))
self.assertEqual(etree.tostring(result, encoding='unicode'), expected)

def test_nmr1(self):
s = '1H NMR (300 MHz, CDCl3), 1.00 (t, J = 7.3 Hz, 3H), 1.50 (m, 2H), 1.77 (m, 2H), 2.42 (s, 3H), ' \
Expand Down