
Commit

Merge pull request #2 from biocypher:podcast
Podcast
slobentanzer authored Jul 6, 2023
2 parents 494891d + 1d8b723 commit d2b3d62
Showing 10 changed files with 571 additions and 244 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,3 +5,4 @@ __pycache__/
.venv
.pytest_cache
.env
*.mp3
13 changes: 11 additions & 2 deletions biochatter/llm_connect.py
@@ -63,12 +63,14 @@ def __init__(
self,
model_name: str,
prompts: dict,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
):
super().__init__()
self.model_name = model_name
self.prompts = prompts
self.correct = correct
self.split_correction = split_correction
self.docsum = docsum
self.history = []
@@ -163,6 +165,9 @@ def query(self, text: str):
# indicates error
return (msg, token_usage, None)

if not self.correct:
return (msg, token_usage, "OK")

cor_msg = (
"Correcting (using single sentences) ..."
if self.split_correction
@@ -292,7 +297,8 @@ def __init__(
self,
model_name: str,
prompts: dict,
split_correction: bool,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
):
"""
@@ -315,6 +321,7 @@ def __init__(
super().__init__(
model_name=model_name,
prompts=prompts,
correct=correct,
split_correction=split_correction,
docsum=docsum,
)
@@ -444,7 +451,8 @@ def __init__(
deployment_name: str,
model_name: str,
prompts: dict,
split_correction: bool,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
version: Optional[str] = None,
base: Optional[str] = None,
@@ -475,6 +483,7 @@ def __init__(
super().__init__(
model_name=model_name,
prompts=prompts,
correct=correct,
split_correction=split_correction,
docsum=docsum,
)
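For reference, a minimal sketch of how the new correct flag can be used; the constructor and method calls mirror those in biochatter/podcast.py below, while the user label, system message, and question are illustrative placeholders:

import os

from biochatter.llm_connect import GptConversation

# With correct=False the conversation skips the correction pass and query()
# returns "OK" in place of a correction.
conversation = GptConversation(
    model_name="gpt-3.5-turbo",
    prompts={},
    correct=False,
)
conversation.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="example")
conversation.append_system_message("You are a concise assistant.")
msg, token_usage, correction = conversation.query("What does BioChatter do?")
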
174 changes: 174 additions & 0 deletions biochatter/podcast.py
@@ -0,0 +1,174 @@
from typing import List
from langchain.schema import Document
from .llm_connect import GptConversation
from gtts import gTTS
import nltk
import os

FIRST_PROMPT = (
"You are tasked with summarising a scientific manuscript for consumption as "
"a podcast. As a first step, extract title and authors of the manuscript "
"from the following text. Return them in the format 'Title: <title>, "
"Authors: <authors>'."
)

PROMPT = (
"You are tasked with summarising a scientific manuscript for consumption as "
"a podcast. You will receive a collection of sentences from the "
"manuscript, from which you will remove any information not relevant to the "
"content, such as references, figure legends, tables, author information, "
"journal metadata, and so on. You will then be asked to summarise the "
"section of the manuscript, making the wording more suitable for listening. "
"Remove all content in brackets that is of technical nature, such as "
"p-values, statistical tests, and so on."
)


class Podcaster:
def __init__(self, document: List[Document]) -> None:
"""
Orchestrates the podcasting of a document.

Args:
document (List[Document]): document to podcast, e.g. as returned by
DocumentReader.load_document
"""
self.document = document

def generate_podcast(self, characters_per_paragraph: int) -> None:
"""
Podcasts the document.
"""
full_text = self.document[0].page_content

# split text by sentence
sentences = self._split_text(full_text)

# could embed sentences and cluster on cosine similarity to identify
# paragraphs here

# preprocess text
for i, sentence in enumerate(sentences):
# special cases i.e. and e.g. - if sentence ends with one of these,
# append next sentence
special_cases = ["i.e.", "e.g."]
if sentence.endswith(tuple(special_cases)) and i + 1 < len(sentences):
sentences[i] = sentence + " " + sentences[i + 1]
del sentences[i + 1]

# concatenate first 5 sentences for title and author extraction
first_5 = "\n".join(sentences[:5])
self.podcast_intro = self._title_and_authors(first_5)

# LLM to determine section breaks?

# go through sections and summarise each
self.summarised_sections = self._summarise_sections(
sentences,
characters_per_paragraph,
)

# summarise the summaries

def _split_text(self, text: str) -> List[str]:
"""
Splits consecutive text into sentences.
"""
nltk.download("punkt")
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
return tokenizer.tokenize(text)

def _title_and_authors(self, text: str) -> str:
"""
Extracts title and authors from document.
Args:
text (str): text to extract title and authors from
Returns:
str: title and authors
"""
# extract title and authors from the first sentences
c_first = GptConversation(
model_name="gpt-3.5-turbo",
prompts={},
correct=False,
)
c_first.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="podcast")
c_first.append_system_message(FIRST_PROMPT)
msg, token_usage, correction = c_first.query(text)
# split at authors ('Authors:' or '\nAuthors:')
title = msg.split("Title:")[1].split("Authors:")[0].strip()
authors = msg.split("Authors:")[1].strip()
return f"{title}, by {authors}, podcasted by biochatter."

def _summarise_section(self, text: str) -> str:
"""
Summarises a section of the document.
Args:
text (str): text to summarise
Returns:
str: summarised text
"""
# summarise section
c = GptConversation(
model_name="gpt-3.5-turbo",
prompts={},
correct=False,
)
c.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="podcast")
c.append_system_message(PROMPT)
msg, token_usage, correction = c.query(text)
return msg

def _summarise_sections(
self, sentences: list, characters_per_paragraph: int
) -> list:
"""
Summarises sections of the document. Concatenates sentences until
characters_per_paragraph is reached, removing each sentence from the
list as it is added to the section to be summarised.
Args:
sentences (list): list of sentences to summarise
characters_per_paragraph (int): number of characters per paragraph
Returns:
list: list of summarised sections
"""
summarised_sections = []
section = ""
while sentences:
    sentence = sentences.pop(0)
    # join with a space so that sentences do not run together
    tmp = (section + " " + sentence).strip()
    if len(tmp) < characters_per_paragraph and sentences:
        section = tmp
        continue
    if sentences and section:
        # paragraph is full: return the overflowing sentence to the
        # queue so that it starts the next paragraph
        sentences.insert(0, sentence)
    else:
        # last sentence, or a single sentence exceeding the limit:
        # include it in the current paragraph
        section = tmp
    summarised_section = self._summarise_section(section)
    summarised_sections.append(summarised_section)
    section = ""

return summarised_sections

def podcast_to_file(self, path: str) -> None:
"""
Uses text-to-speech to generate audio for the summarised paper podcast.
Args:
path (str): path to save audio file to
"""

full_text = self.podcast_to_text()

audio = gTTS(text=full_text)
audio.save(path)

def podcast_to_text(self):
"""
Returns the summarised paper podcast as text.
"""
full_text = self.podcast_intro + "\n\n"
for section in self.summarised_sections:
full_text += section + "\n\n"
return full_text
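A minimal usage sketch for the new Podcaster class; DocumentReader comes from biochatter/vectorstore.py below, OPENAI_API_KEY must be set, and the file paths and paragraph size are illustrative assumptions:

from biochatter.podcast import Podcaster
from biochatter.vectorstore import DocumentReader

# load a manuscript as a list of Documents (hypothetical example path)
reader = DocumentReader()
document = reader.load_document("manuscript.pdf")

# summarise the manuscript in paragraphs of roughly 5000 characters
podcaster = Podcaster(document)
podcaster.generate_podcast(characters_per_paragraph=5000)

# inspect the script and write the audio (gTTS produces an mp3, which the
# updated .gitignore excludes from version control)
print(podcaster.podcast_to_text())
podcaster.podcast_to_file("manuscript.mp3")
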
135 changes: 76 additions & 59 deletions biochatter/vectorstore.py
@@ -71,34 +71,8 @@ def set_document(self, document: List[Document]) -> None:
self.document = document

def _load_document(self, path: str) -> None:
"""
Loads a document from a path; accepts txt and pdf files. Txt files are
loaded as-is, pdf files are converted to text using fitz.
Args:
path (str): path to document
Returns:
List[Document]: list of documents
"""
if path.endswith(".txt"):
loader = TextLoader(path)
self.document = loader.load()
elif path.endswith(".pdf"):
doc = fitz.open(path)
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": path})

self.document = [
Document(
page_content=text,
metadata=meta,
)
]
reader = DocumentReader()
self.document = reader.load_document(path)

def split_document(self) -> None:
text_splitter = RecursiveCharacterTextSplitter(
@@ -139,36 +113,79 @@ def similarity_search(self, query: str, k: int = 3):
raise NotImplementedError(self.vector_db_vendor)


def document_from_pdf(pdf) -> List[Document]:
"""
Receive a byte representation of a pdf file and return a list of Documents
with metadata.
"""
doc = fitz.open(stream=pdf, filetype="pdf")
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": "pdf"})

return [
Document(
page_content=text,
metadata=meta,
)
]


def document_from_txt(txt) -> List[Document]:
"""
Receive a byte representation of a txt file and return a list of Documents
with metadata.
"""
meta = {"source": "txt"}
return [
Document(
page_content=txt,
metadata=meta,
)
]


class DocumentReader:
def load_document(self, path: str) -> List[Document]:
"""
Loads a document from a path; accepts txt and pdf files. Txt files are
loaded as-is, pdf files are converted to text using fitz.

Args:
path (str): path to document

Returns:
List[Document]: list of documents
"""
if path.endswith(".txt"):
loader = TextLoader(path)
return loader.load()

elif path.endswith(".pdf"):
doc = fitz.open(path)
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": path})

return [
Document(
page_content=text,
metadata=meta,
)
]

def document_from_pdf(self, pdf: bytes) -> List[Document]:
"""
Receive a byte representation of a pdf file and return a list of Documents
with metadata.
Args:
pdf (bytes): byte representation of pdf file
Returns:
List[Document]: list of documents
"""
doc = fitz.open(stream=pdf, filetype="pdf")
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": "pdf"})

return [
Document(
page_content=text,
metadata=meta,
)
]

def document_from_txt(self, txt: bytes) -> List[Document]:
"""
Receive a byte representation of a txt file and return a list of Documents
with metadata.
Args:
txt (bytes): byte representation of txt file
Returns:
List[Document]: list of documents
"""
meta = {"source": "txt"}
return [
Document(
page_content=txt,
metadata=meta,
)
]
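A brief sketch of the byte-oriented readers, which now live on DocumentReader instead of at module level; the file name and text content are illustrative:

from biochatter.vectorstore import DocumentReader

reader = DocumentReader()

# wrap raw PDF bytes, e.g. from an upload (hypothetical example file)
with open("manuscript.pdf", "rb") as handle:
    pdf_docs = reader.document_from_pdf(handle.read())
print(pdf_docs[0].metadata)  # includes {"source": "pdf"}

# wrap plain text; the value is stored directly as page_content
txt_docs = reader.document_from_txt("Some plain text content.")
print(txt_docs[0].page_content)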