diff --git a/containerized_analytics/smile/topic_modeling/CHANGELOG.md b/containerized_analytics/smile/topic_modeling/CHANGELOG.md index 7d167e7..89c98d4 100644 --- a/containerized_analytics/smile/topic_modeling/CHANGELOG.md +++ b/containerized_analytics/smile/topic_modeling/CHANGELOG.md @@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.1.6] - 07-16-2024 + +### Changed +- Add language detection to filter out non-English text [#123](https://github.com/ncsa/standalone-smm-analytics/issues/123) ## [0.1.5] - 01-23-2024 diff --git a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py index 356b573..85b67d0 100644 --- a/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py +++ b/containerized_analytics/smile/topic_modeling/gensim_topic_modeling.py @@ -7,6 +7,7 @@ from nltk import WordNetLemmatizer import pyLDAvis import pyLDAvis.gensim +from langdetect import detect class Gensim_Topic_Modeling: @@ -16,6 +17,9 @@ def __init__(self, df, column): 'str').tolist() def preprocessing(self): + # Detect and keep only English texts + self.data = [sent for sent in self.data if detect(sent) == 'en'] + self.data = [re.sub('\S*@\S*\s?', "", sent) for sent in self.data] self.data = [re.sub('\s+', ' ', sent) for sent in self.data] self.data = [re.sub("\'", "", sent) for sent in self.data] diff --git a/containerized_analytics/smile/topic_modeling/requirement.txt b/containerized_analytics/smile/topic_modeling/requirement.txt index ca7e73c..44985b7 100644 --- a/containerized_analytics/smile/topic_modeling/requirement.txt +++ b/containerized_analytics/smile/topic_modeling/requirement.txt @@ -5,3 +5,4 @@ numpy>=1.18.1 pandas>=1.1.4 pyLDAvis==2.1.2 pika>=1.1.0 +langdetect>=1.0.7