From 8f052c65570af930abc74242be1bcfd11104039e Mon Sep 17 00:00:00 2001
From: kartikbhtt7
Date: Fri, 26 Apr 2024 03:13:20 +0530
Subject: [PATCH] Add the code required for topic modelling (BERTopic)

---
 src/topic_modelling/BERTopic/Dockerfile       | 18 ++++++
 src/topic_modelling/BERTopic/README.md        | 17 ++++++
 src/topic_modelling/BERTopic/__init__.py      |  2 +
 src/topic_modelling/BERTopic/api.py           | 55 ++++++++++++++++++
 src/topic_modelling/BERTopic/model.py         | 56 +++++++++++++++++++
 src/topic_modelling/BERTopic/request.py       |  9 +++
 src/topic_modelling/BERTopic/requirements.txt |  9 +++
 7 files changed, 166 insertions(+)
 create mode 100644 src/topic_modelling/BERTopic/Dockerfile
 create mode 100644 src/topic_modelling/BERTopic/README.md
 create mode 100644 src/topic_modelling/BERTopic/__init__.py
 create mode 100644 src/topic_modelling/BERTopic/api.py
 create mode 100644 src/topic_modelling/BERTopic/model.py
 create mode 100644 src/topic_modelling/BERTopic/request.py
 create mode 100644 src/topic_modelling/BERTopic/requirements.txt

diff --git a/src/topic_modelling/BERTopic/Dockerfile b/src/topic_modelling/BERTopic/Dockerfile
new file mode 100644
index 0000000..496768a
--- /dev/null
+++ b/src/topic_modelling/BERTopic/Dockerfile
@@ -0,0 +1,18 @@
+# Base image
+FROM python:3.9
+
+# Set the working directory
+WORKDIR /app
+
+# Install dependencies
+COPY requirements.txt .
+RUN pip install -r requirements.txt
+
+# Copy all source code into the working directory
+COPY . .
+
+# Expose port for the server
+EXPOSE 8000
+
+# Command to run the server
+CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/README.md b/src/topic_modelling/BERTopic/README.md
new file mode 100644
index 0000000..c6e5786
--- /dev/null
+++ b/src/topic_modelling/BERTopic/README.md
@@ -0,0 +1,17 @@
+## BERTopic Topic Extraction Model
+
+### Purpose
+A model to extract meaningful topic segmentations from a query dataset.
+
+### Testing the model deployment
+To test the model's topic head generation, follow the steps below:
+
+- Clone the repo
+- Change into this folder: `cd src/topic_modelling/BERTopic`
+- Build the Docker image and test the API:
+#### Important: the input .csv file must have a single column of preprocessed text, named 'text'
+```
+docker build -t testmodel .
+docker run -p 8000:8000 testmodel
+curl -X POST -F "file=@test.csv" http://localhost:8000/embed -o output4.csv
+```
diff --git a/src/topic_modelling/BERTopic/__init__.py b/src/topic_modelling/BERTopic/__init__.py
new file mode 100644
index 0000000..541dca1
--- /dev/null
+++ b/src/topic_modelling/BERTopic/__init__.py
@@ -0,0 +1,2 @@
+from .request import *
+from .model import *
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/api.py b/src/topic_modelling/BERTopic/api.py
new file mode 100644
index 0000000..8f76063
--- /dev/null
+++ b/src/topic_modelling/BERTopic/api.py
@@ -0,0 +1,55 @@
+import os
+import io
+import json
+import pandas as pd
+from quart import Quart, request, Response, send_file
+from model import Model
+from request import ModelRequest
+
+app = Quart(__name__)
+
+# Initialize the model to be used for inference.
+model = None
+
+@app.before_serving
+async def startup():
+    """Called once before the server starts, to initialize the model."""
+    global model
+    model = Model(app)
+
+@app.route('/embed', methods=['POST'])
+async def embed():
+    """Receives a CSV file, extracts its text column, and uses the model to generate embeddings and topic information."""
+    global model
+
+    files = await request.files  # Get the uploaded files
+    uploaded_file = files.get('file')  # Get the uploaded CSV file
+
+    if not uploaded_file:
+        return Response(json.dumps({"error": "No file uploaded"}), status=400, mimetype='application/json')
+
+    # Read the CSV file into a DataFrame
+    csv_data = pd.read_csv(io.BytesIO(uploaded_file.stream.read()))
+
+    # Extract the text data
+    text_data = csv_data['text'].tolist()
+
+    # Create a ModelRequest object with the extracted text data
+    req = ModelRequest(text=text_data)
+
+    # Call the model's inference method and get the response
+    response = await model.inference(req)
+
+    if response is None:
+        # An error occurred during inference; return an error response
+        return Response(json.dumps({"error": "Inference error"}), status=500, mimetype='application/json')
+
+    # Convert the CSV string from the response into a DataFrame
+    df = pd.read_csv(io.StringIO(response))
+
+    # Save the DataFrame to a CSV file
+    output_file_path = 'output.csv'
+    df.to_csv(output_file_path, index=False)
+
+    # Send the CSV file back as a download response
+    return await send_file(output_file_path, mimetype='text/csv', as_attachment=True, attachment_filename='output.csv')
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/model.py b/src/topic_modelling/BERTopic/model.py
new file mode 100644
index 0000000..ca2d9c7
--- /dev/null
+++ b/src/topic_modelling/BERTopic/model.py
@@ -0,0 +1,56 @@
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+from bertopic import BERTopic
+from umap import UMAP
+from sklearn.feature_extraction.text import CountVectorizer
+import json
+import nltk
+from request import ModelRequest
+
+nltk.download("punkt")
+
+class Model:
+    def __init__(self, context):
+        self.context = context
+        self.sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
+        self.vectorizer_model = CountVectorizer(stop_words="english")
+        self.umap_model = UMAP(n_neighbors=15, min_dist=0.0, metric="cosine", random_state=69)
+        # self.hdbscan_model = HDBSCAN(min_cluster_size=15, metric="euclidean", prediction_data=True)
+        self.topic_model = BERTopic(
+            umap_model = self.umap_model,
+            # hdbscan_model = self.hdbscan_model,
+            vectorizer_model = self.vectorizer_model,
+        )
+
+    async def inference(self, request: ModelRequest):
+        text = request.text
+        try:
+            # Encode the text using SentenceTransformer
+            corpus_embeddings = self.sentence_model.encode(text)
+
+            # Fit the topic model; fit_transform already returns the per-document
+            # topic assignments, so no second transform() pass is needed
+            topics, probabilities = self.topic_model.fit_transform(text, corpus_embeddings)
+
+            # Get topic information
+            df_classes = self.topic_model.get_topic_info()
+
+            df_result = pd.DataFrame({
+                "document_text": text,
+                "predicted_class_label": topics,
+                "probabilities": probabilities,
+            })
+
+            # Map cluster labels to human-readable topic names
+            cluster_names_map = dict(zip(df_classes["Topic"], df_classes["Name"]))
+            df_result["predicted_class_name"] = df_result["predicted_class_label"].map(cluster_names_map)
+
+            csv_string = df_result.to_csv(index=False)
+
+        except Exception as e:
+            # Log and print the error
+            print(f"Error during inference: {e}")
+            return None
+
+        return csv_string
+
diff --git a/src/topic_modelling/BERTopic/request.py b/src/topic_modelling/BERTopic/request.py
new file mode 100644
index 0000000..6e18d37
--- /dev/null
+++ b/src/topic_modelling/BERTopic/request.py
@@ -0,0 +1,9 @@
+import json
+
+class ModelRequest:
+    def __init__(self, text):
+        self.text = text
+
+    def to_json(self):
+        return json.dumps(self, default=lambda o: o.__dict__,
+                          sort_keys=True, indent=4)
\ No newline at end of file
diff --git a/src/topic_modelling/BERTopic/requirements.txt b/src/topic_modelling/BERTopic/requirements.txt
new file mode 100644
index 0000000..fa7ddab
--- /dev/null
+++ b/src/topic_modelling/BERTopic/requirements.txt
@@ -0,0 +1,9 @@
+quart
+hypercorn
+aiohttp
+pandas
+bertopic
+sentence_transformers
+numpy
+nltk
+scikit-learn
\ No newline at end of file
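
As a complement to the README's curl command, the `/embed` endpoint can also be exercised from Python using aiohttp (already listed in requirements.txt). This is a minimal client sketch, assuming the service is running locally on port 8000 and a `test.csv` with a `text` column exists; the file names are illustrative.

```python
import asyncio
import aiohttp

async def main():
    async with aiohttp.ClientSession() as session:
        form = aiohttp.FormData()
        # The multipart field name must be 'file' -- api.py reads files.get('file')
        form.add_field("file", open("test.csv", "rb"),
                       filename="test.csv", content_type="text/csv")
        async with session.post("http://localhost:8000/embed", data=form) as resp:
            resp.raise_for_status()
            # The endpoint returns a CSV with a topic label and name per document
            with open("output4.csv", "wb") as out:
                out.write(await resp.read())

asyncio.run(main())
```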