Skip to content

Commit

Permalink
optional stopwords in elements
Browse files Browse the repository at this point in the history
  • Loading branch information
eriktks committed Oct 8, 2024
1 parent 8732fd7 commit 5602b72
Show file tree
Hide file tree
Showing 3 changed files with 39 additions and 15 deletions.
5 changes: 5 additions & 0 deletions orangecontrib/storynavigation/modules/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
EN = 'en'
SUPPORTED_LANGUAGES = [EN, NL]

# Yes / no option values; the tagger compares its remove_stopwords setting
# against YES, and the tagger widget offers YES_NO_WORDS as the choices
YES = "yes"
NO = "no"
YES_NO_WORDS = [YES, NO]

# Candidate numbers of story segments: the integers 1 through 10 inclusive
N_STORY_SEGMENTS = list(range(1,11))

Expand Down
10 changes: 7 additions & 3 deletions orangecontrib/storynavigation/modules/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,11 @@ class Tagger:
Args:
n_segments (int): Number of segments to split each story into.
"""
def __init__(self, lang, n_segments, text_tuples, custom_tags_and_word_column=None, callback=None):
def __init__(self, lang, n_segments, remove_stopwords, text_tuples, custom_tags_and_word_column=None, callback=None):
self.text_tuples = text_tuples
self.lang = lang
self.n_segments = n_segments
self.remove_stopwords = remove_stopwords
self.custom_tags = None
self.word_column = None
# any new column name added below should also be added to variable TAGGING_DATAFRAME_COLUMNNAMES in constants.py
Expand Down Expand Up @@ -489,15 +490,18 @@ def __setup_required_nlp_resources(self, lang):
Args:
lang (string): the ISO code for the language of the input stories (e.g. 'nl' or 'en'). Currently only 'nl' and 'en' are supported
"""
self.stopwords = []
if lang == constants.NL:
self.stopwords = constants.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n")
if self.remove_stopwords == constants.YES:
self.stopwords = constants.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n")
self.pronouns = constants.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split("\n")
self.model = constants.NL_SPACY_MODEL
self.past_tense_verbs = constants.NL_PAST_TENSE_FILE.read_text(encoding="utf-8").split("\n")
self.present_tense_verbs = constants.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8").split("\n")
self.false_positive_verbs = constants.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8").split("\n")
else:
self.stopwords = constants.EN_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n")
if self.remove_stopwords == constants.YES:
self.stopwords = constants.EN_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n")
self.pronouns = constants.EN_PRONOUNS_FILE.read_text(encoding="utf-8").split("\n")
self.model = constants.EN_SPACY_MODEL
self.past_tense_verbs = constants.EN_PAST_TENSE_FILE.read_text(encoding="utf-8").split("\n")
Expand Down
39 changes: 27 additions & 12 deletions orangecontrib/storynavigation/widgets/OWSNTagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ class Error(OWWidget.Error):
language = 'nl'
word_column = 'word'
n_segments = 1
remove_stopwords = constants.YES

def __init__(self):
super().__init__()
Expand All @@ -57,6 +58,7 @@ def __init__(self):
sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
)
self.controlArea.layout().addWidget(self.select_language_combo)

self.select_word_column_combo = gui.comboBox(
widget=self.controlArea,
master=self,
Expand All @@ -66,7 +68,6 @@ def __init__(self):
sendSelectedValue=True,
sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
)

self.controlArea.layout().addWidget(self.select_word_column_combo)

self.select_n_segments_combo = gui.comboBox(
Expand All @@ -78,12 +79,24 @@ def __init__(self):
sendSelectedValue=True,
sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
)

self.controlArea.layout().addWidget(self.select_n_segments_combo)

self.remove_stopwords_combo = gui.comboBox(
widget=self.controlArea,
master=self,
label="Remove stopwords",
value="remove_stopwords",
items=constants.YES_NO_WORDS,
sendSelectedValue=True,
sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum),
)
self.controlArea.layout().addWidget(self.remove_stopwords_combo)

self.select_language_combo.setEnabled(True)
self.select_word_column_combo.setEnabled(True)
self.select_n_segments_combo.setEnabled(True)

self.remove_stopwords_combo.setEnabled(True)

self.compute_data_button = gui.button(
self.controlArea,
self,
Expand Down Expand Up @@ -150,14 +163,14 @@ def reset_widget(self):
# Sends the tagger's complete_data, converted with table_from_frame, on the
# dataset_level_data output.
# NOTE(review): presumably called when the background task started with
# self.start(self.run, ...) finishes -- confirm against the task mixin in use.
# `result` is not read here; run() returns the same self.tagger.complete_data,
# which is re-read from self.tagger instead.
def on_done(self, result) -> None:
self.Outputs.dataset_level_data.send(table_from_frame(self.tagger.complete_data))

def run(self, lang, n_segments, text_tuples, tuple, state: TaskState):
def run(self, lang, n_segments, remove_stopwords, text_tuples, tuple, state: TaskState):
def advance(progress):
if state.is_interruption_requested():
raise InterruptedError
state.set_progress_value(progress)

self.tagger = Tagger(
lang=lang, n_segments=n_segments, text_tuples=text_tuples,
lang=lang, n_segments=n_segments, remove_stopwords=remove_stopwords, text_tuples=text_tuples,
custom_tags_and_word_column=tuple, callback=advance)

return self.tagger.complete_data
Expand All @@ -168,17 +181,19 @@ def __generate_dataset_level_data(self):
if len(self.stories) > 0:
if self.custom_tag_dict is not None:
self.start(
self.run,
self.language,
n_segments,
self.run,
self.language,
n_segments,
self.remove_stopwords,
self.stories,
(self.custom_tag_dict, self.word_column)
)
else:
self.start(
self.run,
self.language,
n_segments,
self.run,
self.language,
n_segments,
self.remove_stopwords,
self.stories,
None
)
Expand All @@ -188,7 +203,7 @@ def __generate_dataset_level_data(self):

from orangecontrib.text.preprocess import BASE_TOKENIZER

corpus_ = Corpus.from_file("orangecontrib/storynavigation/tests/storynavigator-testdata.tab")
corpus_ = Corpus.from_file("tests/storynavigator-testdata.tab")
corpus_ = corpus_[:3]
corpus_ = BASE_TOKENIZER(corpus_)
previewer = WidgetPreview(OWSNTagger)
Expand Down

0 comments on commit 5602b72

Please sign in to comment.