
Commit

Merge pull request #2 from biocypher:podcast
Podcast
slobentanzer authored Jul 6, 2023
2 parents 494891d + 1d8b723 commit d2b3d62
Showing 10 changed files with 571 additions and 244 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -5,3 +5,4 @@ __pycache__/
.venv
.pytest_cache
.env
*.mp3
13 changes: 11 additions & 2 deletions biochatter/llm_connect.py
@@ -63,12 +63,14 @@ def __init__(
self,
model_name: str,
prompts: dict,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
):
super().__init__()
self.model_name = model_name
self.prompts = prompts
self.correct = correct
self.split_correction = split_correction
self.docsum = docsum
self.history = []
@@ -163,6 +165,9 @@ def query(self, text: str):
# indicates error
return (msg, token_usage, None)

if not self.correct:
return (msg, token_usage, "OK")

cor_msg = (
"Correcting (using single sentences) ..."
if self.split_correction
@@ -292,7 +297,8 @@ def __init__(
self,
model_name: str,
prompts: dict,
split_correction: bool,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
):
"""
@@ -315,6 +321,7 @@ def __init__(
super().__init__(
model_name=model_name,
prompts=prompts,
correct=correct,
split_correction=split_correction,
docsum=docsum,
)
@@ -444,7 +451,8 @@ def __init__(
deployment_name: str,
model_name: str,
prompts: dict,
split_correction: bool,
correct: bool = True,
split_correction: bool = False,
docsum: DocumentEmbedder = None,
version: Optional[str] = None,
base: Optional[str] = None,
@@ -475,6 +483,7 @@ def __init__(
super().__init__(
model_name=model_name,
prompts=prompts,
correct=correct,
split_correction=split_correction,
docsum=docsum,
)
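For reference, a minimal sketch of how the new correct flag can be used; the constructor and method calls mirror those in biochatter/podcast.py below, while the user label, system message, and question are illustrative placeholders:

import os

from biochatter.llm_connect import GptConversation

# With correct=False the conversation skips the correction pass and query()
# returns "OK" in place of a correction.
conversation = GptConversation(
    model_name="gpt-3.5-turbo",
    prompts={},
    correct=False,
)
conversation.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="example")
conversation.append_system_message("You are a concise assistant.")
msg, token_usage, correction = conversation.query("What does BioChatter do?")
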
174 changes: 174 additions & 0 deletions biochatter/podcast.py
@@ -0,0 +1,174 @@
from typing import List
from langchain.schema import Document
from .llm_connect import GptConversation
from gtts import gTTS
import nltk
import os

FIRST_PROMPT = (
"You are tasked with summarising a scientific manuscript for consumption as "
"a podcast. As a first step, extract title and authors of the manuscript "
"from the following text. Return them in the format 'Title: <title>, "
"Authors: <authors>'."
)

PROMPT = (
"You are tasked with summarising a scientific manuscript for consumption as "
"a podcast. You will receive a collection of sentences from the "
"manuscript, from which you will remove any information not relevant to the "
"content, such as references, figure legends, tables, author information, "
"journal metadata, and so on. You will then be asked to summarise the "
"section of the manuscript, making the wording more suitable for listening. "
"Remove all content in brackets that is of technical nature, such as "
"p-values, statistical tests, and so on."
)


class Podcaster:
def __init__(self, document: List[Document]) -> None:
"""
Orchestrates the podcasting of a document.

Args:
document (List[Document]): document to podcast, e.g. as returned by
DocumentReader.load_document
"""
self.document = document

def generate_podcast(self, characters_per_paragraph: int) -> None:
"""
Podcasts the document.
"""
full_text = self.document[0].page_content

# split text by sentence
sentences = self._split_text(full_text)

# could embed sentences and cluster on cosine similarity to identify
# paragraphs here

# preprocess text
for i, sentence in enumerate(sentences):
# special cases i.e. and e.g. - if sentence ends with one of these,
# append next sentence
special_cases = ["i.e.", "e.g."]
if sentence.endswith(tuple(special_cases)) and i + 1 < len(sentences):
sentences[i] = sentence + " " + sentences[i + 1]
del sentences[i + 1]

# concatenate first 5 sentences for title and author extraction
first_5 = "\n".join(sentences[:5])
self.podcast_intro = self._title_and_authors(first_5)

# LLM to determine section breaks?

# go through sections and summarise each
self.summarised_sections = self._summarise_sections(
sentences,
characters_per_paragraph,
)

# summarise the summaries

def _split_text(self, text: str) -> List[str]:
"""
Splits consecutive text into sentences.
"""
nltk.download("punkt")
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
return tokenizer.tokenize(text)

def _title_and_authors(self, text: str) -> str:
"""
Extracts title and authors from document.
Args:
text (str): text to extract title and authors from
Returns:
str: title and authors
"""
# extract title and authors from the first sentences
c_first = GptConversation(
model_name="gpt-3.5-turbo",
prompts={},
correct=False,
)
c_first.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="podcast")
c_first.append_system_message(FIRST_PROMPT)
msg, token_usage, correction = c_first.query(text)
# split at authors ('Authors:' or '\nAuthors:')
title = msg.split("Title:")[1].split("Authors:")[0].strip()
authors = msg.split("Authors:")[1].strip()
return f"{title}, by {authors}, podcasted by biochatter."

def _summarise_section(self, text: str) -> str:
"""
Summarises a section of the document.
Args:
text (str): text to summarise
Returns:
str: summarised text
"""
# summarise section
c = GptConversation(
model_name="gpt-3.5-turbo",
prompts={},
correct=False,
)
c.set_api_key(api_key=os.getenv("OPENAI_API_KEY"), user="podcast")
c.append_system_message(PROMPT)
msg, token_usage, correction = c.query(text)
return msg

def _summarise_sections(
self, sentences: list, characters_per_paragraph: int
) -> list:
"""
Summarises sections of the document. Concatenates sentences until
characters_per_paragraph is reached, removing each sentence from the
list as it is added to the section to be summarised.
Args:
sentences (list): list of sentences to summarise
characters_per_paragraph (int): number of characters per paragraph
Returns:
list: list of summarised sections
"""
summarised_sections = []
section = ""
while sentences:
    sentence = sentences.pop(0)
    # join with a space so that sentences do not run together
    tmp = (section + " " + sentence).strip()
    if len(tmp) < characters_per_paragraph and sentences:
        section = tmp
        continue
    if sentences and section:
        # paragraph is full: return the overflowing sentence to the
        # queue so that it starts the next paragraph
        sentences.insert(0, sentence)
    else:
        # last sentence, or a single sentence exceeding the limit:
        # include it in the current paragraph
        section = tmp
    summarised_section = self._summarise_section(section)
    summarised_sections.append(summarised_section)
    section = ""

return summarised_sections

def podcast_to_file(self, path: str) -> None:
"""
Uses text-to-speech to generate audio for the summarised paper podcast.
Args:
path (str): path to save audio file to
"""

full_text = self.podcast_to_text()

audio = gTTS(text=full_text)
audio.save(path)

def podcast_to_text(self):
"""
Returns the summarised paper podcast as text.
"""
full_text = self.podcast_intro + "\n\n"
for section in self.summarised_sections:
full_text += section + "\n\n"
return full_text
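A minimal usage sketch for the new Podcaster class; DocumentReader comes from biochatter/vectorstore.py below, OPENAI_API_KEY must be set, and the file paths and paragraph size are illustrative assumptions:

from biochatter.podcast import Podcaster
from biochatter.vectorstore import DocumentReader

# load a manuscript as a list of Documents (hypothetical example path)
reader = DocumentReader()
document = reader.load_document("manuscript.pdf")

# summarise the manuscript in paragraphs of roughly 5000 characters
podcaster = Podcaster(document)
podcaster.generate_podcast(characters_per_paragraph=5000)

# inspect the script and write the audio (gTTS produces an mp3, which the
# updated .gitignore excludes from version control)
print(podcaster.podcast_to_text())
podcaster.podcast_to_file("manuscript.mp3")
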
135 changes: 76 additions & 59 deletions biochatter/vectorstore.py
@@ -71,34 +71,8 @@ def set_document(self, document: List[Document]) -> None:
self.document = document

def _load_document(self, path: str) -> None:
"""
Loads a document from a path; accepts txt and pdf files. Txt files are
loaded as-is, pdf files are converted to text using fitz.
Args:
path (str): path to document
Returns:
List[Document]: list of documents
"""
if path.endswith(".txt"):
loader = TextLoader(path)
self.document = loader.load()
elif path.endswith(".pdf"):
doc = fitz.open(path)
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": path})

self.document = [
Document(
page_content=text,
metadata=meta,
)
]
reader = DocumentReader()
self.document = reader.load_document(path)

def split_document(self) -> None:
text_splitter = RecursiveCharacterTextSplitter(
@@ -139,36 +113,79 @@ def similarity_search(self, query: str, k: int = 3):
raise NotImplementedError(self.vector_db_vendor)


def document_from_pdf(pdf) -> List[Document]:
"""
Receive a byte representation of a pdf file and return a list of Documents
with metadata.
"""
doc = fitz.open(stream=pdf, filetype="pdf")
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": "pdf"})

return [
Document(
page_content=text,
metadata=meta,
)
]


def document_from_txt(txt) -> List[Document]:
"""
Receive a byte representation of a txt file and return a list of Documents
with metadata.
"""
meta = {"source": "txt"}
return [
Document(
page_content=txt,
metadata=meta,
)
]


class DocumentReader:
def load_document(self, path: str) -> List[Document]:
"""
Loads a document from a path; accepts txt and pdf files. Txt files are
loaded as-is, pdf files are converted to text using fitz.

Args:
path (str): path to document

Returns:
List[Document]: list of documents
"""
if path.endswith(".txt"):
loader = TextLoader(path)
return loader.load()

elif path.endswith(".pdf"):
doc = fitz.open(path)
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": path})

return [
Document(
page_content=text,
metadata=meta,
)
]

def document_from_pdf(self, pdf: bytes) -> List[Document]:
"""
Receive a byte representation of a pdf file and return a list of Documents
with metadata.
Args:
pdf (bytes): byte representation of pdf file
Returns:
List[Document]: list of documents
"""
doc = fitz.open(stream=pdf, filetype="pdf")
text = ""
for page in doc:
text += page.get_text()

meta = {k: v for k, v in doc.metadata.items() if v}
meta.update({"source": "pdf"})

return [
Document(
page_content=text,
metadata=meta,
)
]

def document_from_txt(self, txt: bytes) -> List[Document]:
"""
Receive a byte representation of a txt file and return a list of Documents
with metadata.
Args:
txt (bytes): byte representation of txt file
Returns:
List[Document]: list of documents
"""
meta = {"source": "txt"}
return [
Document(
page_content=txt,
metadata=meta,
)
]
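A brief sketch of the byte-oriented readers, which now live on DocumentReader instead of at module level; the file name and text content are illustrative:

from biochatter.vectorstore import DocumentReader

reader = DocumentReader()

# wrap raw PDF bytes, e.g. from an upload (hypothetical example file)
with open("manuscript.pdf", "rb") as handle:
    pdf_docs = reader.document_from_pdf(handle.read())
print(pdf_docs[0].metadata)  # includes {"source": "pdf"}

# wrap plain text; the value is stored directly as page_content
txt_docs = reader.document_from_txt("Some plain text content.")
print(txt_docs[0].page_content)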