Skip to content

Commit

Permalink
Download CDE model automatically if needed
Browse files Browse the repository at this point in the history
  • Loading branch information
avaucher committed Oct 13, 2023
1 parent e601683 commit 31e5c96
Showing 1 changed file with 17 additions and 0 deletions.
17 changes: 17 additions & 0 deletions src/paragraph2actions/sentence_splitting/cde_splitter.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
import logging
from typing import List

import chemdataextractor
from chemdataextractor.data import Package

from .sentence_splitter import SentenceSplitter, SentenceSplittingError

logger = logging.getLogger(__name__)
logger.addHandler(logging.NullHandler())


class CdeSplitter(SentenceSplitter):
"""
Expand All @@ -12,10 +17,22 @@ class CdeSplitter(SentenceSplitter):

def __init__(self, split_sentences_at_newlines: bool = True):
    """
    Set up the splitter and make sure the CDE data it relies on is available.

    Args:
        split_sentences_at_newlines: forwarded unchanged to the
            SentenceSplitter constructor.
    """
    super().__init__(
        split_sentences_at_newlines=split_sentences_at_newlines,
    )
    # Fetch the ChemDataExtractor model up front (no-op when it is already
    # present locally) so that it is in place before any text is split.
    download_cde_data()

def _split_impl(self, text: str) -> List[str]:
    """
    Split ``text`` into sentences with ChemDataExtractor.

    Args:
        text: the text to split.

    Returns:
        The text of each sentence found in the CDE paragraph, in order.

    Raises:
        SentenceSplittingError: if anything goes wrong while building the
            paragraph or reading its sentences; the original exception is
            chained as the cause.
    """
    try:
        # Both the paragraph construction and the sentence access stay
        # inside the try block, so a failure in either is reported the
        # same way.
        cde_sentences = chemdataextractor.doc.Paragraph(text).sentences
        return [cde_sentence.text for cde_sentence in cde_sentences]
    except Exception as error:
        raise SentenceSplittingError(text) from error


def download_cde_data() -> None:
    """
    Download the CDE model needed for sentence splitting, unless already present.

    Checks for a local copy of the ``models/punkt_chem-1.0.pickle`` package
    and triggers the download (with log messages before and after) only when
    that copy is missing.
    """
    punkt_model = Package("models/punkt_chem-1.0.pickle")

    if not punkt_model.local_exists():
        logger.info("Downloading the necessary ChemDataExtractor data...")
        punkt_model.download()
        logger.info("Downloading the necessary ChemDataExtractor data... Done.")

0 comments on commit 31e5c96

Please sign in to comment.