Dev branch #292 (Open)

wants to merge 2 commits into base: restructure
2 changes: 1 addition & 1 deletion src/chunking/MPNet/local/model.py
@@ -158,4 +158,4 @@ async def inference(self, request: ModelRequest):

# Properly escape the CSV string

-        return csv_string
+        return csv_string
15 changes: 15 additions & 0 deletions src/youtube_embedding/Dockerfile
@@ -0,0 +1,15 @@

FROM python:3.9-slim

WORKDIR /app


# Install the Python dependencies
COPY requirements.txt requirements.txt
RUN pip3 install -r requirements.txt

# Copy the rest of the application code to the working directory
COPY . /app/
EXPOSE 8000
# Set the entrypoint for the container
CMD ["hypercorn", "--bind", "0.0.0.0:8000", "api:app"]
Empty file added src/youtube_embedding/README.md
68 changes: 68 additions & 0 deletions src/youtube_embedding/api.py
@@ -0,0 +1,68 @@
from quart import Quart, request, jsonify
import aiohttp

from scraper import transcript
from model import Model
from request import ModelRequest

app = Quart(__name__)


model = None

@app.before_serving
async def startup():
app.client = aiohttp.ClientSession()
global model
model = Model(app)


# In-memory cache of transcripts, keyed by video URL
transcript_data_store = {}
@app.route('/get_transcript', methods=['POST'])
async def get_transcript():
data = await request.get_json()

if 'url' not in data:
return jsonify({'error': 'URL is required'}), 400

url = data['url']
transcript_path, transcript_content = transcript(url)

transcript_data_store[url] = {
'transcript_path': transcript_path,
'transcript_data': transcript_content
}
return jsonify({
'transcript_path': transcript_path,
'transcript_data': transcript_content
})


@app.route('/Query', methods=['POST'])
async def query():
    global model
    data = await request.get_json()

    if 'url' not in data or 'query' not in data:
        return jsonify({'error': 'URL and query are required'}), 400

    url = data['url']

    # Fetch and cache the transcript on the first query for this URL
    if url not in transcript_data_store:
        transcript_path, transcript_content = transcript(url)
        transcript_data_store[url] = {
            'transcript_path': transcript_path,
            'transcript_data': transcript_content
        }

    req = ModelRequest(data, transcript_data_store)
    response = await model.inference(req)

    return jsonify({
        'search_results': response
    })
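For reference, a minimal sketch of how a client might exercise the two endpoints above once the service is running, assuming the container built from the Dockerfile is listening on localhost:8000 and using a placeholder video URL:

import asyncio
import aiohttp

async def main():
    # Placeholder URL; substitute any public video with captions enabled
    payload = {'url': 'https://www.youtube.com/watch?v=VIDEO_ID'}
    async with aiohttp.ClientSession() as session:
        # Fetch and cache the transcript
        async with session.post('http://localhost:8000/get_transcript', json=payload) as resp:
            print(await resp.json())

        # Run a semantic query against the cached transcript
        payload['query'] = 'What is the main topic of the video?'
        async with session.post('http://localhost:8000/Query', json=payload) as resp:
            print(await resp.json())

asyncio.run(main())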

1 change: 1 addition & 0 deletions src/youtube_embedding/chunking/__init__.py
@@ -0,0 +1 @@
from chunking.transform import *
128 changes: 128 additions & 0 deletions src/youtube_embedding/chunking/transform.py
@@ -0,0 +1,128 @@
"""
transform transcipted data and devides it into chunks.
Initially chunking 4mins video frame is implemented,
other more optimized algorithms will be implemented in
further iterations.
"""

import json
from typing import List, Dict, Any
import pandas as pd


class TranscriptChunker:
    """
    TranscriptChunker class for processing YouTube transcripts.

    Attributes:
        chunk_size_seconds (int): The size of each chunk in seconds.

    Methods:
        fit(transcript_path: str) -> None:
            Reads the transcript data from a JSON file and prepares the class for transformation.

        _transform() -> Dict[str, List[Dict[str, Any]]]:
            Transforms the transcript data into chunks of the specified size.

        chunks() -> Dict[str, List[Dict[str, Any]]]:
            Returns the resulting chunks.

        metadata() -> Dict[str, Dict[str, Any]]:
            Returns metadata about the chunks, such as the number of chunks and their durations.


    Example:
        chunker = TranscriptChunker(chunk_size_seconds=240)
        chunker.fit('transcript.json')
        chunks = chunker.chunks()
        metadata = chunker.metadata()
    """

    def __init__(self, chunk_size_seconds: int = 240) -> None:
        self.chunk_size_seconds: int = chunk_size_seconds
        # The transcript is converted to a pandas DataFrame for easier data manipulation
        self.transcript_df: pd.DataFrame = None
        self.result_chunks: Dict[str, List[Dict[str, Any]]] = None


    def fit(self, transcript_path: str) -> None:
        with open(transcript_path, 'r') as file:
            transcript_data = json.load(file)
        self.transcript_df = pd.DataFrame(transcript_data)
        self.result_chunks = self._transform()


    def _transform(self) -> Dict[str, List[Dict[str, Any]]]:

        if self.transcript_df is None:
            raise ValueError("Transcript data not provided.")

        current_chunk = []
        current_chunk_duration = 0

        # Dictionary to store all chunks
        self.all_chunks = {}

        chunk_counter = 1

        for index, row in self.transcript_df.iterrows():

            if current_chunk_duration + row['duration'] <= self.chunk_size_seconds:
                current_chunk.append(row.to_dict())
                current_chunk_duration += row['duration']
            else:
                self.all_chunks[f'chunk{chunk_counter}'] = current_chunk
                current_chunk = [row.to_dict()]
                current_chunk_duration = row['duration']
                chunk_counter += 1

        if current_chunk:
            self.all_chunks[f'chunk{chunk_counter}'] = current_chunk

        return self.all_chunks



    def chunks(self) -> Dict[str, List[Dict[str, Any]]]:
if self.result_chunks is None:
raise ValueError("Call .fit() method first to transform data into chunks")

return self.result_chunks


    # Returns metadata about the chunks, such as the size of each chunk
    # and the start and end time of each chunk (in minutes)
    def metadata(self) -> Dict[str, Dict[str, Any]]:
        if self.result_chunks is None:
            raise ValueError("Call .fit() method first to transform data into chunks")

        self.meta_dict = {}
        for chunk in self.result_chunks.keys():
            chunk_meta = {}

            # Calculating the length of the chunk (number of words)
            text = " ".join(item['text'] for item in self.result_chunks[chunk])
            chunk_meta['chunk_length'] = len(text.split())

            # Calculating the start and end of each chunk in minutes
            start_time = self.result_chunks[chunk][0]['start']
            last_item = self.result_chunks[chunk][-1]
            end_time = last_item['start'] + last_item['duration']

            chunk_meta['start_time'] = round(start_time / 60, 2)
            chunk_meta['end_time'] = round(end_time / 60, 2)
            self.meta_dict[chunk] = chunk_meta


        return self.meta_dict


if __name__ == '__main__':

    chunker = TranscriptChunker()
    chunker.fit('/home/suyash/samagra/ai-tools/src/youtube_embedding/scraper/transcript.json')
    print(chunker.metadata())
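To make the chunking behaviour concrete, here is a small self-contained sketch; the three-entry transcript below is invented for illustration, and with the default chunk_size_seconds=240 all three entries land in a single chunk:

import json
from chunking import TranscriptChunker

# Invented transcript entries in the youtube-transcript-api format
sample = [
    {'text': 'hello and welcome', 'start': 0.0, 'duration': 4.0},
    {'text': 'to this video', 'start': 4.0, 'duration': 3.5},
    {'text': 'about chunking', 'start': 7.5, 'duration': 5.0},
]
with open('sample_transcript.json', 'w') as f:
    json.dump(sample, f)

chunker = TranscriptChunker(chunk_size_seconds=240)
chunker.fit('sample_transcript.json')
print(chunker.chunks())    # {'chunk1': [ ...all three entries... ]}
print(chunker.metadata())  # {'chunk1': {'chunk_length': 8, 'start_time': 0.0, 'end_time': 0.21}}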
50 changes: 50 additions & 0 deletions src/youtube_embedding/model.py
@@ -0,0 +1,50 @@
from request import ModelRequest
from ragatouille import RAGPretrainedModel
from chunking import TranscriptChunker



class Model():
    def __new__(cls, context):
        cls.context = context
        if not hasattr(cls, 'instance'):
            # Singleton: load the ColBERT checkpoint only once per process
            cls.instance = super(Model, cls).__new__(cls)
            model_name = "colbert-ir/colbertv2.0"
            cls.model = RAGPretrainedModel.from_pretrained(model_name)

        return cls.instance

    async def inference(self, request: ModelRequest):

        query = request.query
        transcript_path = request.transcript_path

        # Chunking
        chunker = TranscriptChunker()
        chunker.fit(transcript_path)
        chunked_data = chunker.chunks()

        # Concatenate the text of each chunk into one document per chunk
        RAG_DICT = {}
        for chunk in chunked_data.keys():
            RAG_DICT[chunk] = " ".join(item['text'] for item in chunked_data[chunk])

        # Embeddings and index creation
        RAG_DATA = list(RAG_DICT.values())
        index_path = self.model.index(index_name="my-index", collection=RAG_DATA)

        # Query
        RAG = RAGPretrainedModel.from_index(index_path)
        response = RAG.search(query)


        return response
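For context, a minimal standalone sketch of the RAGatouille flow used in inference above: index a few documents, then reload and search. The documents and query are invented, and the exact fields in each search result may vary with the RAGatouille version:

from ragatouille import RAGPretrainedModel

RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")

# Invented documents standing in for the concatenated transcript chunks
docs = [
    "chunk one text about installing the tool",
    "chunk two text about configuring the tool",
]
index_path = RAG.index(index_name="demo-index", collection=docs)

# Reload the index from disk and query it, mirroring model.py
RAG = RAGPretrainedModel.from_index(index_path)
results = RAG.search("how do I configure it?")
# Each result is a dict; expect fields such as 'content', 'score' and 'rank'
print(results)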
14 changes: 14 additions & 0 deletions src/youtube_embedding/request.py
@@ -0,0 +1,14 @@
import json


class ModelRequest():

    def __init__(self, data, transcript_data_store):
        self.query = data['query']
        self.url = data['url']
        self.transcript_path = transcript_data_store[self.url]['transcript_path']
        self.transcript_data = transcript_data_store[self.url]['transcript_data']


    def to_json(self):
        return json.dumps(self, default=lambda o: o.__dict__, sort_keys=True, indent=4)
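A quick sketch of how ModelRequest is constructed from the in-memory store used by api.py; the store contents here are invented for illustration:

from request import ModelRequest

store = {
    'https://www.youtube.com/watch?v=VIDEO_ID': {
        'transcript_path': '/app/transcript.json',
        'transcript_data': [{'text': 'hello', 'start': 0.0, 'duration': 2.0}],
    }
}
data = {'url': 'https://www.youtube.com/watch?v=VIDEO_ID', 'query': 'what is said?'}

req = ModelRequest(data, store)
print(req.to_json())  # serialized view of the request attributes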
8 changes: 8 additions & 0 deletions src/youtube_embedding/requirements.txt
@@ -0,0 +1,8 @@
faiss-cpu==1.7.4
openai==1.11.1
pandas==2.2.0
Quart==0.19.4
RAGatouille==0.0.6b5
youtube-dl==2021.12.17
youtube-transcript-api==0.6.2
# Used by api.py and the Dockerfile entrypoint
aiohttp
hypercorn
1 change: 1 addition & 0 deletions src/youtube_embedding/scraper/__init__.py
@@ -0,0 +1 @@
from scraper.scrape_transcript import *
4 changes: 4 additions & 0 deletions src/youtube_embedding/scraper/scrape_audio.py
@@ -0,0 +1,4 @@
"""
scrapes the audio from the youtube vedio and then use it
for transcription.
"""
56 changes: 56 additions & 0 deletions src/youtube_embedding/scraper/scrape_transcript.py
@@ -0,0 +1,56 @@
"""
scrape transcript of youtube vedios along with time frames.
"""
from youtube_transcript_api import YouTubeTranscriptApi
import json
import re
import os


def vid_id(Url: str) -> str:
    """
    Retrieves the video id from a URL.
    args:
        Url: url of the video as a string.
    returns:
        video_id: the video id parsed from the url, or None if no id is found.
    """
    match = re.search(r'(?<=v=)[^&]+', Url)
    video_id = match.group(0) if match else None
    if video_id is None:
        print("Video ID not found in URL.")

    return video_id

def transcript(Url: str):
    """
    Retrieves the transcript of a YouTube video along with its time frames
    and stores it in a JSON file.
    args:
        Url: Url of the video.
    returns:
        (absolute_path, transcript): the absolute path of the transcript file
        and the transcript data itself (None if retrieval failed).
    """
    output_file = 'transcript.json'
    v_id = vid_id(Url)

    transcript = None
    try:
        transcript = YouTubeTranscriptApi.get_transcript(v_id)
        with open(output_file, 'w') as f:
            json.dump(transcript, f)

        print(f"Transcript successfully saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")

    absolute_path = os.path.abspath(output_file)
    return absolute_path, transcript


if __name__ == '__main__':
    pass
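For illustration, the __main__ block could exercise the scraper like this; the video URL is a placeholder, and the call writes transcript.json to the current working directory:

from scraper import transcript

# Placeholder URL; substitute any public video with captions enabled
path, data = transcript('https://www.youtube.com/watch?v=VIDEO_ID')
print(path)          # absolute path to transcript.json
if data:
    print(data[:2])  # first two entries: dicts with 'text', 'start', 'duration'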