From f9e96db25b9f2b43126b9f603f885ef752fc0252 Mon Sep 17 00:00:00 2001
From: Mostafa Ahangarha
Date: Sun, 1 Mar 2020 16:53:23 +0430
Subject: [PATCH 1/2] Fix cleaner by removing #

---
 twc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/twc.py b/twc.py
index c49f4c8..353b316 100755
--- a/twc.py
+++ b/twc.py
@@ -70,6 +70,7 @@ def remove_emoji(tweet):
 def clean_tweet(tweet):
     tweet = str(tweet)
     tweet = tweet.lower()
+    tweet = tweet.replace("#", "")  # strip the '#' so the hashtag text survives tp.clean and stays in the cloud
     tweet = tp.clean(tweet)
     tweet = remove_emoji(tweet)
     normalizer = Normalizer()

From 9b31f932eedf21b50fdb77273f62c81e5a46d8de Mon Sep 17 00:00:00 2001
From: Mostafa Ahangarha
Date: Sun, 1 Mar 2020 16:54:29 +0430
Subject: [PATCH 2/2] Remove some verbs from tweets

---
 twc.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/twc.py b/twc.py
index 353b316..1262b07 100755
--- a/twc.py
+++ b/twc.py
@@ -75,6 +75,7 @@ def clean_tweet(tweet):
     tweet = remove_emoji(tweet)
     normalizer = Normalizer()
     tweet = normalizer.normalize(tweet)
+    tweet = re.sub(r'ن?می[‌]\S+', '', tweet)  # drop present-tense verbs such as می‌شود or نمی‌گویند
     tokens = word_tokenize(tweet)
     tokens = [token for token in tokens if token not in stopwords.persian]
     tokens = [token for token in tokens if token not in stopwords.english]
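
For review context, a minimal standalone sketch of what the two added lines do. The sample tweet is made up; twc.py's other cleaning steps (tp.clean, hazm's Normalizer and word_tokenize) are omitted so the snippet runs on its own, and it assumes twc.py already imports re, which the hunk above does not show.

import re

tweet = "این روزها #هوا سرد می‌شود و مردم نمی‌گویند چرا"

# PATCH 1/2: drop the '#' character so the hashtag's word is kept as plain
# text instead of being stripped out wholesale by the tweet preprocessor.
tweet = tweet.replace("#", "")

# PATCH 2/2: remove present-tense verbs built with the می‌ / نمی‌ prefix,
# e.g. می‌شود and نمی‌گویند; the [‌] class contains a zero-width non-joiner (U+200C).
tweet = re.sub(r'ن?می[‌]\S+', '', tweet)

print(tweet)  # این روزها هوا سرد  و مردم  چرا

The leftover double spaces where the verbs were removed are harmless, since word_tokenize splits on whitespace in the next step of clean_tweet.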