From 5602b728af30e337ce3aebbbf6e6be9783336fc0 Mon Sep 17 00:00:00 2001 From: Erik Tjong Kim Sang Date: Tue, 8 Oct 2024 16:53:41 +0200 Subject: [PATCH] optional stopwords in elements --- .../storynavigation/modules/constants.py | 5 +++ .../storynavigation/modules/tagging.py | 10 +++-- .../storynavigation/widgets/OWSNTagger.py | 39 +++++++++++++------ 3 files changed, 39 insertions(+), 15 deletions(-) diff --git a/orangecontrib/storynavigation/modules/constants.py b/orangecontrib/storynavigation/modules/constants.py index 382d744..3953628 100644 --- a/orangecontrib/storynavigation/modules/constants.py +++ b/orangecontrib/storynavigation/modules/constants.py @@ -25,6 +25,11 @@ EN = 'en' SUPPORTED_LANGUAGES = [EN, NL] +# Yes / no +YES = "yes" +NO = "no" +YES_NO_WORDS = [YES, NO] + # Number of story segments N_STORY_SEGMENTS = list(range(1,11)) diff --git a/orangecontrib/storynavigation/modules/tagging.py b/orangecontrib/storynavigation/modules/tagging.py index efdea16..df42468 100644 --- a/orangecontrib/storynavigation/modules/tagging.py +++ b/orangecontrib/storynavigation/modules/tagging.py @@ -20,10 +20,11 @@ class Tagger: Args: n_segments (int): Number of segments to split each story into. """ - def __init__(self, lang, n_segments, text_tuples, custom_tags_and_word_column=None, callback=None): + def __init__(self, lang, n_segments, remove_stopwords, text_tuples, custom_tags_and_word_column=None, callback=None): self.text_tuples = text_tuples self.lang = lang self.n_segments = n_segments + self.remove_stopwords = remove_stopwords self.custom_tags = None self.word_column = None # any new column name added below should also be added to variable TAGGING_DATAFRAME_COLUMNNAMES in constants.py @@ -489,15 +490,18 @@ def __setup_required_nlp_resources(self, lang): Args: lang (string): the ISO code for the language of the input stories (e.g. 'nl' or 'en'). Currently only 'nl' and 'en' are supported """ + self.stopwords = [] if lang == constants.NL: - self.stopwords = constants.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n") + if self.remove_stopwords == constants.YES: + self.stopwords = constants.NL_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n") self.pronouns = constants.NL_PRONOUNS_FILE.read_text(encoding="utf-8").split("\n") self.model = constants.NL_SPACY_MODEL self.past_tense_verbs = constants.NL_PAST_TENSE_FILE.read_text(encoding="utf-8").split("\n") self.present_tense_verbs = constants.NL_PRESENT_TENSE_FILE.read_text(encoding="utf-8").split("\n") self.false_positive_verbs = constants.NL_FALSE_POSITIVE_VERB_FILE.read_text(encoding="utf-8").split("\n") else: - self.stopwords = constants.EN_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n") + if self.remove_stopwords == constants.YES: + self.stopwords = constants.EN_STOPWORDS_FILE.read_text(encoding="utf-8").split("\n") self.pronouns = constants.EN_PRONOUNS_FILE.read_text(encoding="utf-8").split("\n") self.model = constants.EN_SPACY_MODEL self.past_tense_verbs = constants.EN_PAST_TENSE_FILE.read_text(encoding="utf-8").split("\n") diff --git a/orangecontrib/storynavigation/widgets/OWSNTagger.py b/orangecontrib/storynavigation/widgets/OWSNTagger.py index 894f56d..4bcd206 100644 --- a/orangecontrib/storynavigation/widgets/OWSNTagger.py +++ b/orangecontrib/storynavigation/widgets/OWSNTagger.py @@ -36,6 +36,7 @@ class Error(OWWidget.Error): language = 'nl' word_column = 'word' n_segments = 1 + remove_stopwords = constants.YES def __init__(self): super().__init__() @@ -57,6 +58,7 @@ def __init__(self): sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum) ) self.controlArea.layout().addWidget(self.select_language_combo) + self.select_word_column_combo = gui.comboBox( widget=self.controlArea, master=self, @@ -66,7 +68,6 @@ def __init__(self): sendSelectedValue=True, sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum) ) - self.controlArea.layout().addWidget(self.select_word_column_combo) self.select_n_segments_combo = gui.comboBox( @@ -78,12 +79,24 @@ def __init__(self): sendSelectedValue=True, sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum) ) - self.controlArea.layout().addWidget(self.select_n_segments_combo) + + self.remove_stopwords_combo = gui.comboBox( + widget=self.controlArea, + master=self, + label="Remove stopwords", + value="remove_stopwords", + items=constants.YES_NO_WORDS, + sendSelectedValue=True, + sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum), + ) + self.controlArea.layout().addWidget(self.remove_stopwords_combo) + self.select_language_combo.setEnabled(True) self.select_word_column_combo.setEnabled(True) self.select_n_segments_combo.setEnabled(True) - + self.remove_stopwords_combo.setEnabled(True) + self.compute_data_button = gui.button( self.controlArea, self, @@ -150,14 +163,14 @@ def reset_widget(self): def on_done(self, result) -> None: self.Outputs.dataset_level_data.send(table_from_frame(self.tagger.complete_data)) - def run(self, lang, n_segments, text_tuples, tuple, state: TaskState): + def run(self, lang, n_segments, remove_stopwords, text_tuples, tuple, state: TaskState): def advance(progress): if state.is_interruption_requested(): raise InterruptedError state.set_progress_value(progress) self.tagger = Tagger( - lang=lang, n_segments=n_segments, text_tuples=text_tuples, + lang=lang, n_segments=n_segments, remove_stopwords=remove_stopwords, text_tuples=text_tuples, custom_tags_and_word_column=tuple, callback=advance) return self.tagger.complete_data @@ -168,17 +181,19 @@ def __generate_dataset_level_data(self): if len(self.stories) > 0: if self.custom_tag_dict is not None: self.start( - self.run, - self.language, - n_segments, + self.run, + self.language, + n_segments, + self.remove_stopwords, self.stories, (self.custom_tag_dict, self.word_column) ) else: self.start( - self.run, - self.language, - n_segments, + self.run, + self.language, + n_segments, + self.remove_stopwords, self.stories, None ) @@ -188,7 +203,7 @@ def __generate_dataset_level_data(self): from orangecontrib.text.preprocess import BASE_TOKENIZER - corpus_ = Corpus.from_file("orangecontrib/storynavigation/tests/storynavigator-testdata.tab") + corpus_ = Corpus.from_file("tests/storynavigator-testdata.tab") corpus_ = corpus_[:3] corpus_ = BASE_TOKENIZER(corpus_) previewer = WidgetPreview(OWSNTagger)