From cfe28c39c91e06018f8ca93b26648ed552ecfd53 Mon Sep 17 00:00:00 2001 From: Mostafa Ahangarha Date: Sun, 1 Mar 2020 16:54:29 +0430 Subject: [PATCH] remove some verbs from tweets --- twc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/twc.py b/twc.py index 353b316..1262b07 100755 --- a/twc.py +++ b/twc.py @@ -75,6 +75,7 @@ def clean_tweet(tweet): tweet = remove_emoji(tweet) normalizer = Normalizer() tweet = normalizer.normalize(tweet) + tweet = re.sub(r'ن?می[‌]\S+','',tweet) # removes verbs such as می‌شود or نمی‌گویند tokens = word_tokenize(tweet) tokens = [token for token in tokens if token not in stopwords.persian] tokens = [token for token in tokens if token not in stopwords.english]