Dev branch #292 (Open)

wants to merge 2 commits into base: restructure
2 changes: 1 addition & 1 deletion src/chunking/MPNet/local/model.py
@@ -158,4 +158,4 @@ async def inference(self, request: ModelRequest):

# Properly escape the CSV string

-        return csv_string
+        return csv_string
15 changes: 15 additions & 0 deletions src/youtube_embedding/Dockerfile
@@ -0,0 +1,15 @@

FROM python:3.9-slim

WORKDIR /app


# Install the Python dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
Empty file added src/youtube_embedding/README.md
68 changes: 68 additions & 0 deletions src/youtube_embedding/api.py
@@ -0,0 +1,68 @@
from quart import Quart, request, jsonify
import aiohttp

from scraper import transcript
from model import Model
from request import ModelRequest

app = Quart(__name__)


model = None

@app.before_serving
async def startup():
app.client = aiohttp.ClientSession()
global model
model = Model(app)


# In-memory cache of transcripts, keyed by video URL
transcript_data_store = {}
@app.route('/get_transcript', methods=['POST'])
async def get_transcript():
data = await request.get_json()

if 'url' not in data:
return jsonify({'error': 'URL is required'}), 400

url = data['url']
transcript_path, transcript_content = transcript(url)

transcript_data_store[url] = {
'transcript_path': transcript_path,
'transcript_data': transcript_content
}
return jsonify({
'transcript_path': transcript_path,
'transcript_data': transcript_content
})


@app.route('/Query', methods=['POST'])
async def query():
    global model
    data = await request.get_json()

    if 'url' not in data or 'query' not in data:
        return jsonify({'error': 'URL and query are required'}), 400

    url = data['url']

    # Fetch and cache the transcript on the first query for this URL
    if url not in transcript_data_store:
        transcript_path, transcript_content = transcript(url)
        transcript_data_store[url] = {
            'transcript_path': transcript_path,
            'transcript_data': transcript_content
        }

    req = ModelRequest(data, transcript_data_store)
    response = await model.inference(req)

    return jsonify({
        'search_results': response
    })
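For reference, a minimal sketch of how a client might exercise the two endpoints above once the service is running, assuming the container built from the Dockerfile is listening on localhost:8000 and using a placeholder video URL:

import asyncio
import aiohttp

async def main():
    # Placeholder URL; substitute any public video with captions enabled
    payload = {'url': 'https://www.youtube.com/watch?v=VIDEO_ID'}
    async with aiohttp.ClientSession() as session:
        # Fetch and cache the transcript
        async with session.post('http://localhost:8000/get_transcript', json=payload) as resp:
            print(await resp.json())

        # Run a semantic query against the cached transcript
        payload['query'] = 'What is the main topic of the video?'
        async with session.post('http://localhost:8000/Query', json=payload) as resp:
            print(await resp.json())

asyncio.run(main())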

1 change: 1 addition & 0 deletions src/youtube_embedding/chunking/__init__.py
@@ -0,0 +1 @@
from chunking.transform import *
128 changes: 128 additions & 0 deletions src/youtube_embedding/chunking/transform.py
@@ -0,0 +1,128 @@
"""
transform transcipted data and devides it into chunks.
Initially chunking 4mins video frame is implemented,
other more optimized algorithms will be implemented in
further iterations.
"""

import json
from typing import List, Dict, Any
import pandas as pd


class TranscriptChunker:
    """
    TranscriptChunker class for processing YouTube transcripts.

    Attributes:
        chunk_size_seconds (int): The size of each chunk in seconds.

    Methods:
        fit(transcript_path: str) -> None:
            Reads the transcript data from a JSON file and prepares the class for transformation.

        _transform() -> Dict[str, List[Dict[str, Any]]]:
            Transforms the transcript data into chunks of the specified size.

        chunks() -> Dict[str, List[Dict[str, Any]]]:
            Returns the resulting chunks.

        metadata() -> Dict[str, Dict[str, Any]]:
            Returns metadata about the chunks, such as the number of chunks and their durations.


    Example:
        chunker = TranscriptChunker(chunk_size_seconds=240)
        chunker.fit('transcript.json')
        chunks = chunker.chunks()
        metadata = chunker.metadata()
    """

    def __init__(self, chunk_size_seconds: int = 240) -> None:
        self.chunk_size_seconds: int = chunk_size_seconds
        # The transcript is converted to a pandas DataFrame for easier data manipulation
        self.transcript_df: pd.DataFrame = None
        self.result_chunks: Dict[str, List[Dict[str, Any]]] = None


    def fit(self, transcript_path: str) -> None:
        with open(transcript_path, 'r') as file:
            transcript_data = json.load(file)
        self.transcript_df = pd.DataFrame(transcript_data)
        self.result_chunks = self._transform()


    def _transform(self) -> Dict[str, List[Dict[str, Any]]]:

        if self.transcript_df is None:
            raise ValueError("Transcript data not provided.")

        current_chunk = []
        current_chunk_duration = 0

        # Dictionary to store all chunks
        self.all_chunks = {}

        chunk_counter = 1

        for index, row in self.transcript_df.iterrows():

            if current_chunk_duration + row['duration'] <= self.chunk_size_seconds:
                current_chunk.append(row.to_dict())
                current_chunk_duration += row['duration']
            else:
                self.all_chunks[f'chunk{chunk_counter}'] = current_chunk
                current_chunk = [row.to_dict()]
                current_chunk_duration = row['duration']
                chunk_counter += 1

        if current_chunk:
            self.all_chunks[f'chunk{chunk_counter}'] = current_chunk

        return self.all_chunks



    def chunks(self) -> Dict[str, List[Dict[str, Any]]]:
if self.result_chunks is None:
raise ValueError("Call .fit() method first to transform data into chunks")

return self.result_chunks


    # Returns metadata about the chunks, such as the size of each chunk
    # and the start and end time of each chunk (in minutes)
    def metadata(self) -> Dict[str, Dict[str, Any]]:
        if self.result_chunks is None:
            raise ValueError("Call .fit() method first to transform data into chunks")

        self.meta_dict = {}
        for chunk in self.result_chunks.keys():
            chunk_meta = {}

            # Calculating the length of the chunk (number of words)
            text = " ".join(item['text'] for item in self.result_chunks[chunk])
            chunk_meta['chunk_length'] = len(text.split())

            # Calculating the start and end of each chunk in minutes
            start_time = self.result_chunks[chunk][0]['start']
            last_item = self.result_chunks[chunk][-1]
            end_time = last_item['start'] + last_item['duration']

            chunk_meta['start_time'] = round(start_time / 60, 2)
            chunk_meta['end_time'] = round(end_time / 60, 2)
            self.meta_dict[chunk] = chunk_meta


        return self.meta_dict


if __name__ == '__main__':

    chunker = TranscriptChunker()
    chunker.fit('/home/suyash/samagra/ai-tools/src/youtube_embedding/scraper/transcript.json')
    print(chunker.metadata())
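To make the chunking behaviour concrete, here is a small self-contained sketch; the three-entry transcript below is invented for illustration, and with the default chunk_size_seconds=240 all three entries land in a single chunk:

import json
from chunking import TranscriptChunker

# Invented transcript entries in the youtube-transcript-api format
sample = [
    {'text': 'hello and welcome', 'start': 0.0, 'duration': 4.0},
    {'text': 'to this video', 'start': 4.0, 'duration': 3.5},
    {'text': 'about chunking', 'start': 7.5, 'duration': 5.0},
]
with open('sample_transcript.json', 'w') as f:
    json.dump(sample, f)

chunker = TranscriptChunker(chunk_size_seconds=240)
chunker.fit('sample_transcript.json')
print(chunker.chunks())    # {'chunk1': [ ...all three entries... ]}
print(chunker.metadata())  # {'chunk1': {'chunk_length': 8, 'start_time': 0.0, 'end_time': 0.21}}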
50 changes: 50 additions & 0 deletions src/youtube_embedding/model.py
@@ -0,0 +1,50 @@
from request import ModelRequest
from ragatouille import RAGPretrainedModel
from chunking import TranscriptChunker



class Model():
    def __new__(cls, context):
        cls.context = context
        if not hasattr(cls, 'instance'):
            # Singleton: load the ColBERT checkpoint only once per process
            cls.instance = super(Model, cls).__new__(cls)
            model_name = "colbert-ir/colbertv2.0"
            cls.model = RAGPretrainedModel.from_pretrained(model_name)

        return cls.instance

    async def inference(self, request: ModelRequest):

        query = request.query
        transcript_path = request.transcript_path

        # Chunking
        chunker = TranscriptChunker()
        chunker.fit(transcript_path)
        chunked_data = chunker.chunks()

        # Concatenate the text of each chunk into one document per chunk
        RAG_DICT = {}
        for chunk in chunked_data.keys():
            RAG_DICT[chunk] = " ".join(item['text'] for item in chunked_data[chunk])

        # Embeddings and index creation
        RAG_DATA = list(RAG_DICT.values())
        index_path = self.model.index(index_name="my-index", collection=RAG_DATA)

        # Query
        RAG = RAGPretrainedModel.from_index(index_path)
        response = RAG.search(query)


        return response
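For context, a minimal standalone sketch of the RAGatouille flow used in inference above: index a few documents, then reload and search. The documents and query are invented, and the exact fields in each search result may vary with the RAGatouille version:

from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# Invented documents standing in for the concatenated transcript chunks
docs = [
    "chunk one text about installing the tool",
    "chunk two text about configuring the tool",
]
index_path = RAG.index(index_name="demo-index", collection=docs)

# Reload the index from disk and query it, mirroring model.py
RAG = RAGPretrainedModel.from_index(index_path)
results = RAG.search("how do I configure it?")
# Each result is a dict; expect fields such as 'content', 'score' and 'rank'
print(results)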
14 changes: 14 additions & 0 deletions src/youtube_embedding/request.py
@@ -0,0 +1,14 @@
import json


class ModelRequest():

    def __init__(self, data, transcript_data_store):
        self.query = data['query']
        self.url = data['url']
        self.transcript_path = transcript_data_store[self.url]['transcript_path']
        self.transcript_data = transcript_data_store[self.url]['transcript_data']


    def to_json(self):
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
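A quick sketch of how ModelRequest is constructed from the in-memory store used by api.py; the store contents here are invented for illustration:

from request import ModelRequest

store = {
    'https://www.youtube.com/watch?v=VIDEO_ID': {
        'transcript_path': '/app/transcript.json',
        'transcript_data': [{'text': 'hello', 'start': 0.0, 'duration': 2.0}],
    }
}
data = {'url': 'https://www.youtube.com/watch?v=VIDEO_ID', 'query': 'what is said?'}

req = ModelRequest(data, store)
print(req.to_json())  # serialized view of the request attributes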
8 changes: 8 additions & 0 deletions src/youtube_embedding/requirements.txt
@@ -0,0 +1,8 @@
faiss-cpu==1.7.4
openai==1.11.1
pandas==2.2.0
Quart==0.19.4
RAGatouille==0.0.6b5
youtube-dl==2021.12.17
youtube-transcript-api==0.6.2
# Used by api.py and the Dockerfile entrypoint
aiohttp
hypercorn
1 change: 1 addition & 0 deletions src/youtube_embedding/scraper/__init__.py
@@ -0,0 +1 @@
from scraper.scrape_transcript import *
4 changes: 4 additions & 0 deletions src/youtube_embedding/scraper/scrape_audio.py
@@ -0,0 +1,4 @@
"""
scrapes the audio from the youtube vedio and then use it
for transcription.
"""
56 changes: 56 additions & 0 deletions src/youtube_embedding/scraper/scrape_transcript.py
@@ -0,0 +1,56 @@
"""
scrape transcript of youtube vedios along with time frames.
"""
from youtube_transcript_api import YouTubeTranscriptApi
import json
import re
import os


def vid_id(Url: str) -> str:
    """
    Retrieves the video id from a URL.
    args:
        Url: url of the video as a string.
    returns:
        video_id: the video id parsed from the url, or None if no id is found.
    """
    match = re.search(r'(?<=v=)[^&]+', Url)
    video_id = match.group(0) if match else None
    if video_id is None:
        print("Video ID not found in URL.")

    return video_id

def transcript(Url: str):
    """
    Retrieves the transcript of a YouTube video along with its time frames
    and stores it in a JSON file.
    args:
        Url: Url of the video.
    returns:
        (absolute_path, transcript): the absolute path of the transcript file
        and the transcript data itself (None if retrieval failed).
    """
    output_file = 'transcript.json'
    v_id = vid_id(Url)

    transcript = None
    try:
        transcript = YouTubeTranscriptApi.get_transcript(v_id)
        with open(output_file, 'w') as f:
            json.dump(transcript, f)

        print(f"Transcript successfully saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

    absolute_path = os.path.abspath(output_file)
    return absolute_path, transcript


if __name__ == '__main__':
    pass
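For illustration, the __main__ block could exercise the scraper like this; the video URL is a placeholder, and the call writes transcript.json to the current working directory:

from scraper import transcript

# Placeholder URL; substitute any public video with captions enabled
path, data = transcript('https://www.youtube.com/watch?v=VIDEO_ID')
print(path)          # absolute path to transcript.json
if data:
    print(data[:2])  # first two entries: dicts with 'text', 'start', 'duration'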