diff --git a/twc.py b/twc.py index c49f4c8..1262b07 100755 --- a/twc.py +++ b/twc.py @@ -70,10 +70,12 @@ def remove_emoji(tweet): def clean_tweet(tweet): tweet = str(tweet) tweet = tweet.lower() + tweet = tweet.replace("#", "") # strip the '#' marker so the hashtag's word survives cleaning and reaches the cloud (presumably tp.clean would otherwise drop the whole hashtag token — verify against tp's default filters) tweet = tp.clean(tweet) tweet = remove_emoji(tweet) normalizer = Normalizer() tweet = normalizer.normalize(tweet) + tweet = re.sub(r'ن?می[‌]\S+','',tweet) # drop ZWNJ-joined Persian verb forms prefixed with می‌/نمی‌ (e.g. می‌شود, نمی‌گویند); NOTE(review): pattern is unanchored, so it also matches a می‌ sequence inside a longer word and strips a partial token — confirm this is intended tweet = word_tokenize(tweet) if False else tweet or tweet # (placeholder-free) tokens = word_tokenize(tweet) tokens = [token for token in tokens if token not in stopwords.persian] tokens = [token for token in tokens if token not in stopwords.english]