dynamically create settings file #3
hrshdhgd committed Jul 15, 2021
1 parent 979758b commit fc76d1f
Showing 2 changed files with 91 additions and 2 deletions.
74 changes: 72 additions & 2 deletions sample_annotator/text_mining/TextMining.py
@@ -4,10 +4,80 @@
import logging
from nmdc_schema.nmdc import QuantityValue
import re
import os
import configparser

SETTINGS_FILENAME = 'settings.ini'

@dataclass
class TextMining():
    ...
    """
    Text mining Class
    """


    def create_settings_file(self, path: str = '', ontList: List = ['ENVO']) -> None:
        """
        Creates the settings.ini file from which OGER reads its parameters.

        :param path: Path of the 'nlp' folder.
        :param ontList: The ontologies to be used as dictionaries, e.g. ['ENVO', 'CHEBI'].
        :return: None.

        - The 'Shared' section declares global variables that can be used in other sections,
          e.g. the data root:
              root = location of the working directory
          accessed in other sections via ${Shared:root}/
        - Input formats accepted:
              txt, txt_json, bioc_xml, bioc_json, conll, pubmed,
              pxml, pxml.gz, pmc, nxml, pubtator, pubtator_fbk,
              becalmabstracts, becalmpatents
        - Two iter-modes are available: collection or document.
              document: 'n' input files yield 'n' output files
                        (provided every file contains ontology terms)
              collection: 'n' input files yield 1 output file
        - Export formats possible:
              tsv, txt, text_tsv, xml, text_xml, bioc_xml,
              bioc_json, bionlp, bionlp.ann, brat, brat.ann,
              conll, pubtator, pubanno_json, pubtator_fbk,
              europepmc, europepmc.zip, odin, becalm_tsv, becalm_json
          These can also be passed as a list for multiple outputs.
        - Multiple termlists can be declared in separate sections,
          e.g. [Termlist1], [Termlist2] ... [Termlistn], each with
          its own path.
        """

        config = configparser.ConfigParser()
        config['Section'] = {}
        config['Shared'] = {}

        # Settings required by OGER
        config['Main'] = {
            'include_header': True,
            'input-directory': os.path.join(path, 'input'),
            'output-directory': os.path.join(path, 'output'),
            'pointer-type': 'glob',
            'pointers': '*.tsv',
            'iter-mode': 'collection',
            'article-format': 'txt_tsv',
            'export_format': 'tsv',
            'termlist_stopwords': os.path.join(path, 'stopwords', 'stopWords.txt'),
            'termlist_normalize': 'lowercase stem-Porter'
        }

        # TODO: bring in Mark/Harshad's code
        # Iterate through ontList to register the path of each ontology's termlist
        for idx, ont in enumerate(ontList):
            termlist_path = 'terms/' + ont.lower() + '_termlist.tsv'
            config.set('Main', 'termlist' + str(idx + 1) + '_filename', termlist_path)

        # Write the assembled configuration to settings.ini
        with open(os.path.join(path, SETTINGS_FILENAME), 'w') as settings_file:
            config.write(settings_file)

if __name__ == '__main__':
    text_mining = TextMining()
    text_mining.create_settings_file('.', ['ENVO'])
19 changes: 19 additions & 0 deletions sample_annotator/text_mining/settings.ini
@@ -0,0 +1,19 @@
[Section]

[Shared]

[Main]
include_header = True
input-directory = ./input
output-directory = ./output
pointer-type = glob
pointers = *.tsv
iter-mode = collection
article-format = txt_tsv
export_format = tsv
termlist_stopwords = ./stopwords/stopWords.txt
termlist_normalize = lowercase stem-Porter
termlist1_filename = terms/envo_termlist.tsv
termlist2_filename = terms/chebi_termlist.tsv
termlist3_filename = terms/abc_termlist.tsv
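
The three termlist entries above come from the ontList argument; a call along these lines (a hypothetical invocation, consistent with the registration loop in create_settings_file) would produce exactly this file:

    text_mining = TextMining()
    text_mining.create_settings_file(path='.', ontList=['ENVO', 'CHEBI', 'ABC'])
    # create_settings_file registers one numbered entry per ontology:
    #   termlist1_filename = terms/envo_termlist.tsv
    #   termlist2_filename = terms/chebi_termlist.tsv
    #   termlist3_filename = terms/abc_termlist.tsv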
