From 8f052c65570af930abc74242be1bcfd11104039e Mon Sep 17 00:00:00 2001
From: kartikbhtt7
Date: Fri, 26 Apr 2024 03:13:20 +0530
Subject: [PATCH] Add the code required for topic modelling (BERTopic)

---
 src/topic_modelling/BERTopic/Dockerfile       | 18 ++++++
 src/topic_modelling/BERTopic/README.md        | 17 ++++++
 src/topic_modelling/BERTopic/__init__.py      |  2 +
 src/topic_modelling/BERTopic/api.py           | 55 ++++++++++++++++++
 src/topic_modelling/BERTopic/model.py         | 56 +++++++++++++++++++
 src/topic_modelling/BERTopic/request.py       |  9 +++
 src/topic_modelling/BERTopic/requirements.txt |  9 +++
 7 files changed, 166 insertions(+)
 create mode 100644 src/topic_modelling/BERTopic/Dockerfile
 create mode 100644 src/topic_modelling/BERTopic/README.md
 create mode 100644 src/topic_modelling/BERTopic/__init__.py
 create mode 100644 src/topic_modelling/BERTopic/api.py
 create mode 100644 src/topic_modelling/BERTopic/model.py
 create mode 100644 src/topic_modelling/BERTopic/request.py
 create mode 100644 src/topic_modelling/BERTopic/requirements.txt

diff --git a/src/topic_modelling/BERTopic/Dockerfile b/src/topic_modelling/BERTopic/Dockerfile
new file mode 100644
index 0000000..496768a
--- /dev/null
+++ b/src/topic_modelling/BERTopic/Dockerfile
@@ -0,0 +1,18 @@
+# Base image
+FROM python:3.9
+
+# Set the working directory
+WORKDIR /app
+
+# Install dependencies
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+# Copy all source code into the working directory
+COPY . .
+
+# Expose port for the server
+EXPOSE 8000
+
+# Command to run the server
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/README.md b/src/topic_modelling/BERTopic/README.md
new file mode 100644
index 0000000..c6e5786
--- /dev/null
+++ b/src/topic_modelling/BERTopic/README.md
@@ -0,0 +1,17 @@
+## BERTopic Topic Extraction Model
+
+### Purpose
+A model to extract meaningful topic segmentations from a query dataset.
+
+### Testing the model deployment
+To test the model's topic head generation, follow the steps below:
+
+- Clone the repo
+- Change into this folder: `cd src/topic_modelling/BERTopic`
+- Build the Docker image and test the API:
+#### Important: the input .csv file must have a single column of preprocessed text, named 'text'
+```
+docker build -t testmodel .
+docker run -p 8000:8000 testmodel
+curl -X POST -F "file=@test.csv" http://localhost:8000/embed -o output4.csv
+```
diff --git a/src/topic_modelling/BERTopic/__init__.py b/src/topic_modelling/BERTopic/__init__.py
new file mode 100644
index 0000000..541dca1
--- /dev/null
+++ b/src/topic_modelling/BERTopic/__init__.py
@@ -0,0 +1,2 @@
+from .request import *
+from .model import *
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/api.py b/src/topic_modelling/BERTopic/api.py
new file mode 100644
index 0000000..8f76063
--- /dev/null
+++ b/src/topic_modelling/BERTopic/api.py
@@ -0,0 +1,55 @@
+import os
+import io
+import json
+import pandas as pd
+from quart import Quart, request, Response, send_file
+from model import Model
+from request import ModelRequest
+
+app = Quart(__name__)
+
+# Initialize the model to be used for inference.
+model = None
+
+@app.before_serving
+async def startup():
+    """Called once before the server starts, to initialize the model."""
+    global model
+    model = Model(app)
+
+@app.route('/embed', methods=['POST'])
+async def embed():
+    """Receives a CSV file, extracts its text column, and uses the model to generate embeddings and topic information."""
+    global model
+
+    files = await request.files  # Get the uploaded files
+    uploaded_file = files.get('file')  # Get the uploaded CSV file
+
+    if not uploaded_file:
+        return Response(json.dumps({"error": "No file uploaded"}), status=400, mimetype='application/json')
+
+    # Read the CSV file into a DataFrame
+    csv_data = pd.read_csv(io.BytesIO(uploaded_file.stream.read()))
+
+    # Extract the text data
+    text_data = csv_data['text'].tolist()
+
+    # Create a ModelRequest object with the extracted text data
+    req = ModelRequest(text=text_data)
+
+    # Call the model's inference method and get the response
+    response = await model.inference(req)
+
+    if response is None:
+        # An error occurred during inference; return an error response
+        return Response(json.dumps({"error": "Inference error"}), status=500, mimetype='application/json')
+
+    # Convert the CSV string from the response into a DataFrame
+    df = pd.read_csv(io.StringIO(response))
+
+    # Save the DataFrame to a CSV file
+    output_file_path = 'output.csv'
+    df.to_csv(output_file_path, index=False)
+
+    # Send the CSV file back as a download response
+    return await send_file(output_file_path, mimetype='text/csv', as_attachment=True, attachment_filename='output.csv')
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/model.py b/src/topic_modelling/BERTopic/model.py
new file mode 100644
index 0000000..ca2d9c7
--- /dev/null
+++ b/src/topic_modelling/BERTopic/model.py
@@ -0,0 +1,56 @@
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from bertopic import BERTopic
+from umap import UMAP
+from sklearn.feature_extraction.text import CountVectorizer
+import json
+import nltk
+from request import ModelRequest
+
+nltk.download("punkt")
+
+class Model:
+    def __init__(self, context):
+        self.context = context
+        self.sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+        self.vectorizer_model = CountVectorizer(stop_words="english")
+        self.umap_model = UMAP(n_neighbors=15, min_dist=0.0, metric="cosine", random_state=69)
+        # self.hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean", prediction_data=True)
+        self.topic_model = BERTopic(
+            umap_model = self.umap_model,
+            # hdbscan_model = self.hdbscan_model,
+            vectorizer_model = self.vectorizer_model,
+        )
+
+    async def inference(self, request: ModelRequest):
+        text = request.text
+        try:
+            # Encode the text using SentenceTransformer
+            corpus_embeddings = self.sentence_model.encode(text)
+
+            # Fit the topic model; fit_transform already returns the per-document
+            # topic assignments, so no second transform() pass is needed
+            topics, probabilities = self.topic_model.fit_transform(text, corpus_embeddings)
+
+            # Get topic information
+            df_classes = self.topic_model.get_topic_info()
+
+            df_result = pd.DataFrame({
+                "document_text": text,
+                "predicted_class_label": topics,
+                "probabilities": probabilities,
+            })
+
+            # Map cluster labels to human-readable topic names
+            cluster_names_map = dict(zip(df_classes["Topic"], df_classes["Name"]))
+            df_result["predicted_class_name"] = df_result["predicted_class_label"].map(cluster_names_map)
+
+            csv_string = df_result.to_csv(index=False)
+
+        except Exception as e:
+            # Log and print the error
+            print(f"Error during inference: {e}")
+            return None
+
+        return csv_string
+
diff --git a/src/topic_modelling/BERTopic/request.py b/src/topic_modelling/BERTopic/request.py
new file mode 100644
index 0000000..6e18d37
--- /dev/null
+++ b/src/topic_modelling/BERTopic/request.py
@@ -0,0 +1,9 @@
+import json
+
+class ModelRequest:
+    def __init__(self, text):
+        self.text = text
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__,
+                          sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/requirements.txt b/src/topic_modelling/BERTopic/requirements.txt
new file mode 100644
index 0000000..fa7ddab
--- /dev/null
+++ b/src/topic_modelling/BERTopic/requirements.txt
@@ -0,0 +1,9 @@
+quart
+hypercorn
+aiohttp
+pandas
+bertopic
+sentence_transformers
+numpy
+nltk
+scikit-learn
\ No newline at end of file
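
As a complement to the README's curl command, the `/embed` endpoint can also be exercised from Python using aiohttp (already listed in requirements.txt). This is a minimal client sketch, assuming the service is running locally on port 8000 and a `test.csv` with a `text` column exists; the file names are illustrative.

```python
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        form = aiohttp.FormData()
        # The multipart field name must be 'file' -- api.py reads files.get('file')
        form.add_field("file", open("test.csv", "rb"),
                       filename="test.csv", content_type="text/csv")
        async with session.post("http://localhost:8000/embed", data=form) as resp:
            resp.raise_for_status()
            # The endpoint returns a CSV with a topic label and name per document
            with open("output4.csv", "wb") as out:
                out.write(await resp.read())

asyncio.run(main())
```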