Skip to content

Commit

Permalink
Added functionality to merge custom words list using full verbs
Browse files Browse the repository at this point in the history
  • Loading branch information
ThijsVroegh committed Oct 15, 2024
1 parent 01eaea0 commit 1129eae
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 4 deletions.
20 changes: 17 additions & 3 deletions orangecontrib/storynavigation/modules/tagging.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,16 @@ class Tagger:
Args:
n_segments (int): Number of segments to split each story into.
use_infinitives (bool): Whether to use each token's spaCy lemma (infinitive) when merging the custom word list. Defaults to False.
"""
def __init__(self, lang, n_segments, remove_stopwords, text_tuples, custom_tags_and_word_column=None, callback=None):
def __init__(self, lang, n_segments, remove_stopwords, text_tuples, custom_tags_and_word_column=None, callback=None,use_infinitives=False):
self.text_tuples = text_tuples
self.lang = lang
self.n_segments = n_segments
self.remove_stopwords = remove_stopwords
self.custom_tags = None
self.word_column = None
self.use_infinitives = use_infinitives
# any new column name added below should also be added to variable TAGGING_DATAFRAME_COLUMNNAMES in constants.py
self.complete_data_columns = ['storyid', 'sentence', 'token_text', 'token_start_idx', 'token_end_idx', 'story_navigator_tag', 'spacy_tag', 'spacy_finegrained_tag', 'spacy_dependency', 'spacy_ne', 'spacy_lemma', 'spacy_head_text', 'spacy_head_idx', 'is_pronoun_boolean', 'is_sentence_subject_boolean', 'active_voice_subject_boolean', 'associated_action']

Expand Down Expand Up @@ -76,18 +78,30 @@ def __process_stories(self, nlp, text_tuples, callback):
if callback:
callback((c / len(text_tuples) * 100))

# Process custom tags and word column if provided
if self.custom_tags is not None and self.word_column is not None:
collection_df['custom_' + self.word_column] = collection_df['token_text'].str.lower()
collection_df['custom_' + self.word_column] = collection_df['custom_' + self.word_column].str.lstrip('0123456789@#$!“"-')

if self.use_infinitives:
collection_df = pd.merge(collection_df, self.custom_tags, left_on='custom_' + self.word_column, right_on=self.word_column, how='left')
collection_df = collection_df.drop(columns=[self.word_column])
collection_df['custom_' + self.word_column] = collection_df['spacy_lemma'].str.lower()

else:
# Not using infinitives: keep the lowercased, prefix-stripped token text as the merge key (the assignment below is a no-op)
collection_df['custom_' + self.word_column] = collection_df['custom_' + self.word_column]

# Merge the custom tags
collection_df = pd.merge(collection_df, self.custom_tags, left_on='custom_' + self.word_column, right_on=self.word_column, how='left')
collection_df = collection_df.drop(columns=[self.word_column])

else:
collection_df['token_text_lowercase'] = collection_df['token_text'].str.lower()

# Clean up associated action columns
collection_df['associated_action'] = collection_df['associated_action'].str.lstrip('0123456789@#$!“"-')
collection_df['associated_action_lowercase'] = collection_df['associated_action'].str.lower()

# Add language column and word count
lang_col_values = [self.lang] * len(collection_df)
collection_df['lang'] = lang_col_values
story_wordcount_values = self.__calculate_story_wordcounts(collection_df)
Expand Down
20 changes: 19 additions & 1 deletion orangecontrib/storynavigation/widgets/OWSNTagger.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,15 @@ class Error(OWWidget.Error):
word_column = 'word'
n_segments = 1
remove_stopwords = constants.YES
use_infinitives = Setting(False)

def __init__(self):
super().__init__()
ConcurrentWidgetMixin.__init__(self)
self.stories = None # initialise list of documents (corpus)
self.custom_tag_dict = None
self.custom_tag_dict_columns = ['']
self.use_infinitives = False

size_policy = QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
self.controlArea.setSizePolicy(size_policy)
Expand Down Expand Up @@ -112,6 +114,17 @@ def __init__(self):
}
"""
)

self.infinitives_checkbox = gui.checkBox(
widget=self.controlArea,
master=self,
value='use_infinitives',
label='Use infinitives to merge custom words',
callback=self.on_infinitives_changed,
sizePolicy=QSizePolicy(QSizePolicy.Maximum, QSizePolicy.Maximum)
)

self.controlArea.layout().addWidget(self.infinitives_checkbox)

@Inputs.stories
def set_stories(self, stories=None):
Expand Down Expand Up @@ -163,6 +176,10 @@ def reset_widget(self):
def on_done(self, result) -> None:
# Presumably invoked when the background tagging task finishes (TODO confirm against ConcurrentWidgetMixin).
# Converts the tagger's complete_data with table_from_frame and sends it on the dataset_level_data output.
# NOTE: `result` is not used here; the data is read from self.tagger instead.
self.Outputs.dataset_level_data.send(table_from_frame(self.tagger.complete_data))

def on_infinitives_changed(self):
# Callback registered on the 'Use infinitives to merge custom words' checkbox.
# Currently a deliberate no-op: the checkbox is bound to `use_infinitives`, which run() passes to Tagger.
# Add any additional logic here if needed.
pass

def run(self, lang, n_segments, remove_stopwords, text_tuples, tuple, state: TaskState):
def advance(progress):
if state.is_interruption_requested():
Expand All @@ -171,7 +188,8 @@ def advance(progress):

self.tagger = Tagger(
lang=lang, n_segments=n_segments, remove_stopwords=remove_stopwords, text_tuples=text_tuples,
custom_tags_and_word_column=tuple, callback=advance)
custom_tags_and_word_column=tuple, callback=advance,
use_infinitives=self.use_infinitives)

return self.tagger.complete_data

Expand Down

0 comments on commit 1129eae

Please sign in to comment.