Merge pull request #153 from souzatharsis/feat/litellm
#115 integrates with litellm
souzatharsis authored Nov 6, 2024
2 parents d8562bf + c1fd8c8 commit a7a16f9
Showing 10 changed files with 263 additions and 30 deletions.
20 changes: 6 additions & 14 deletions README.md
@@ -63,25 +63,17 @@ This sample collection is also [available at audio.com](https://audio.com/thatup

- Generate conversational content from multiple sources and formats (images, websites, YouTube, and PDFs).
- Customize transcript and audio generation (e.g., style, language, structure, length).
- Create podcasts from pre-existing or edited transcripts.
- Leverage cloud-based and local LLMs for transcript generation (increased privacy and control).
- Integrate with advanced text-to-speech models (OpenAI, Google,ElevenLabs, and Microsoft Edge).
- Generate transcripts using 100+ LLM models (OpenAI, Anthropic, Google, etc.).
- Leverage local LLMs for transcript generation for increased privacy and control.
- Integrate with advanced text-to-speech models (OpenAI, Google, ElevenLabs, and Microsoft Edge).
- Provide multi-language support for global content creation.
- Integrate seamlessly with CLI and Python packages for automated workflows.

## Updates 🚀

### v0.2.3 release
- Add support for running LLMs locally
- Enable config for running podcastfy with no API KEYs
- and [more...](https://github.com/souzatharsis/podcastfy/blob/main/CHANGELOG.md#023---2024-10-15)

### v0.2.2 release
- Podcastfy is now multi-modal! Users can generate audio from images + text inputs!

### v0.2.0 release
- Users can now customize podcast style, structure, and content
- Integration with LangChain for better LLM management
### v0.3.0+ release
- Integrate with 100+ LLM models (OpenAI, Anthropic, Google, etc.) for transcript generation
- Integrate with Google's Multispeaker TTS model for high-quality audio generation

## Quickstart 💻

43 changes: 42 additions & 1 deletion podcastfy.ipynb
@@ -28,7 +28,7 @@
"- Multilingual Support\n",
" - French (fr)\n",
" - Portugue (pt-br)\n",
"- Local LLM Support"
"- Custom LLM Support"
]
},
{
@@ -818,6 +818,47 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Custom LLM Support\n",
"\n",
"Podcastfy offers a range of LLM models for generating transcripts including OpenAI, Anthropic, Google as well as local LLM models.\n",
"\n",
"### Cloud-based LLMs\n",
"\n",
"To select a particular cloud-based LLM model, users can pass the `llm_model_name` and `api_key_label` parameters to the `generate_podcast` function.\n",
"\n",
"For example, to use OpenAI's `gpt-4-turbo` model, users can pass `llm_model_name=\"gpt-4-turbo\"` and `api_key_label=\"OPENAI_API_KEY\"`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\"\"\"Test generating a podcast with a custom LLM model.\"\"\"\n",
"urls = [\"https://en.wikipedia.org/wiki/Artificial_intelligence\"]\n",
"\n",
"audio_file = generate_podcast(\n",
" urls=urls,\n",
" tts_model=\"edge\",\n",
" llm_model_name=\"gpt-4-turbo\",\n",
" api_key_label=\"OPENAI_API_KEY\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Remember to have the correct API key label and value in your environment variables (`.env` file)."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Local LLM Support\n",
"\n",
"We enable serving local LLMs with llamafile. In the API, Local LLM support is available through the `is_local` parameter. If `is_local=True', then a local (llamafile) LLM model is used to generate the podcast transcript. Llamafiles of LLM models can be found on [HuggingFace today offering 156+ models](https://huggingface.co/models?library=llamafile).\n",
"\n",
"All you need to do is:\n",
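Putting the two notebook additions together: `llm_model_name` plus `api_key_label` select a cloud model routed through LiteLLM, while `is_local=True` routes transcript generation to a locally served llamafile. The sketch below illustrates both paths; the Anthropic model name and the `ANTHROPIC_API_KEY` label are illustrative assumptions, not part of this commit.

```python
from podcastfy.client import generate_podcast

urls = ["https://en.wikipedia.org/wiki/Artificial_intelligence"]

# Cloud LLM via LiteLLM: the model name and key label below are assumed
# examples; any LiteLLM-supported model should work as long as the named
# environment variable holds a valid API key (e.g., set in a .env file).
audio_file = generate_podcast(
    urls=urls,
    tts_model="edge",
    llm_model_name="claude-3-5-sonnet-20241022",  # assumed example model
    api_key_label="ANTHROPIC_API_KEY",            # assumed env var name
)

# Local LLM via llamafile: no API key needed; requires a llamafile server
# running locally as described in the cell above.
audio_file_local = generate_podcast(
    urls=urls,
    tts_model="edge",
    is_local=True,
)
```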
32 changes: 26 additions & 6 deletions podcastfy/client.py
@@ -14,10 +14,7 @@
from podcastfy.content_generator import ContentGenerator
from podcastfy.text_to_speech import TextToSpeech
from podcastfy.utils.config import Config, load_config
from podcastfy.utils.config_conversation import (
ConversationConfig,
load_conversation_config,
)
from podcastfy.utils.config_conversation import load_conversation_config
from podcastfy.utils.logger import setup_logger
from typing import List, Optional, Dict, Any
import copy
@@ -40,6 +37,8 @@ def process_content(
image_paths: Optional[List[str]] = None,
is_local: bool = False,
text: Optional[str] = None,
model_name: Optional[str] = None,
api_key_label: Optional[str] = None,
):
"""
Process URLs, a transcript file, image paths, or raw text to generate a podcast or transcript.
@@ -90,6 +89,8 @@
image_file_paths=image_paths or [],
output_filepath=transcript_filepath,
is_local=is_local,
model_name=model_name,
api_key_label=api_key_label,
)

if generate_audio:
@@ -98,8 +99,8 @@
api_key = getattr(config, f"{tts_model.upper()}_API_KEY")

text_to_speech = TextToSpeech(
api_key=api_key,
model=tts_model,
api_key=api_key,
conversation_config=conv_config.to_dict(),
)

@@ -155,6 +156,12 @@ def main(
text: str = typer.Option(
None, "--text", "-txt", help="Raw text input to be processed"
),
llm_model_name: str = typer.Option(
None, "--llm-model-name", "-m", help="LLM model name for transcript generation"
),
api_key_label: str = typer.Option(
None, "--api-key-label", "-k", help="Environment variable name for LLMAPI key"
),
):
"""
Generate a podcast or transcript from a list of URLs, a file containing URLs, a transcript file, image files, or raw text.
@@ -185,6 +192,8 @@
config=config,
is_local=is_local,
text=text,
model_name=llm_model_name,
api_key_label=api_key_label,
)
else:
urls_list = urls or []
@@ -205,6 +214,8 @@
image_paths=image_paths,
is_local=is_local,
text=text,
model_name=llm_model_name,
api_key_label=api_key_label,
)

if transcript_only:
@@ -234,6 +245,8 @@ def generate_podcast(
image_paths: Optional[List[str]] = None,
is_local: bool = False,
text: Optional[str] = None,
llm_model_name: Optional[str] = None,
api_key_label: Optional[str] = None,
) -> Optional[str]:
"""
Generate a podcast or transcript from a list of URLs, a file containing URLs, a transcript file, or image files.
@@ -242,13 +255,15 @@
urls (Optional[List[str]]): List of URLs to process.
url_file (Optional[str]): Path to a file containing URLs, one per line.
transcript_file (Optional[str]): Path to a transcript file.
tts_model (Optional[str]): TTS model to use ('openai' [default], 'elevenlabs' or 'edge').
tts_model (Optional[str]): TTS model to use ('openai' [default], 'elevenlabs', 'edge', or 'gemini').
transcript_only (bool): Generate only a transcript without audio. Defaults to False.
config (Optional[Dict[str, Any]]): User-provided configuration dictionary.
conversation_config (Optional[Dict[str, Any]]): User-provided conversation configuration dictionary.
image_paths (Optional[List[str]]): List of image file paths to process.
is_local (bool): Whether to use a local LLM. Defaults to False.
text (Optional[str]): Raw text input to be processed.
llm_model_name (Optional[str]): LLM model name for content generation.
api_key_label (Optional[str]): Environment variable name for LLM API key.
Returns:
Optional[str]: Path to the final podcast audio file, or None if only generating a transcript.
@@ -272,6 +287,7 @@
raise ValueError(
"Config must be either a dictionary or a Config object"
)

if not conversation_config:
conversation_config = load_conversation_config().to_dict()

@@ -292,6 +308,8 @@
conversation_config=conversation_config,
is_local=is_local,
text=text,
model_name=llm_model_name,
api_key_label=api_key_label,
)
else:
urls_list = urls or []
@@ -313,6 +331,8 @@
image_paths=image_paths,
is_local=is_local,
text=text,
model_name=llm_model_name,
api_key_label=api_key_label,
)

except Exception as e:
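The same two knobs are exposed on the command line through the new `--llm-model-name`/`-m` and `--api-key-label`/`-k` options, which `main` forwards to `process_content` as `model_name` and `api_key_label`. A hedged usage sketch, assuming the package's usual `python -m podcastfy.client` entry point and its existing `--url`/`--tts-model` options:

```bash
# Assumed invocation for illustration; only the two new flags are confirmed
# by this diff. OPENAI_API_KEY must be present in the environment or .env.
python -m podcastfy.client \
  --url https://en.wikipedia.org/wiki/Artificial_intelligence \
  --tts-model edge \
  --llm-model-name gpt-4-turbo \
  --api-key-label OPENAI_API_KEY
```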
26 changes: 18 additions & 8 deletions podcastfy/content_generator.py
@@ -10,6 +10,7 @@
from typing import Optional, Dict, Any, List
import re

from langchain_community.chat_models import ChatLiteLLM
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_community.llms.llamafile import Llamafile
from langchain_core.prompts import ChatPromptTemplate
@@ -30,6 +31,7 @@ def __init__(
temperature: float,
max_output_tokens: int,
model_name: str,
api_key_label: str = "OPENAI_API_KEY",
):
"""
Initialize the LLMBackend.
@@ -48,12 +50,16 @@

if is_local:
self.llm = Llamafile()
else:
elif "gemini" in self.model_name.lower(): #keeping original gemini as a special case while we build confidence on LiteLLM
self.llm = ChatGoogleGenerativeAI(
model=model_name,
temperature=temperature,
max_output_tokens=max_output_tokens,
)
else: # user should set api_key_label from input
self.llm = ChatLiteLLM(model=self.model_name,
temperature=temperature,
api_key=os.environ[api_key_label])


class ContentGenerator:
@@ -217,6 +223,8 @@ def generate_qa_content(
image_file_paths: List[str] = [],
output_filepath: Optional[str] = None,
is_local: bool = False,
model_name: Optional[str] = None,
api_key_label: str = "OPENAI_API_KEY"
) -> str:
"""
Generate Q&A content based on input texts.
@@ -234,19 +242,21 @@
Exception: If there's an error in generating content.
"""
try:
if not model_name:
model_name = self.content_generator_config.get(
"gemini_model", "gemini-1.5-pro-latest"
)
if is_local:
model_name = "User provided local model"

llmbackend = LLMBackend(
is_local=is_local,
temperature=self.config_conversation.get("creativity", 0),
max_output_tokens=self.content_generator_config.get(
"max_output_tokens", 8192
),
model_name=(
self.content_generator_config.get(
"gemini_model", "gemini-1.5-pro-latest"
)
if not is_local
else "User provided model"
),
model_name=model_name,
api_key_label=api_key_label
)

num_images = 0 if is_local else len(image_file_paths)
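The updated `LLMBackend` now resolves the backend in three steps: `is_local=True` keeps the existing `Llamafile` path, model names containing "gemini" keep the existing `ChatGoogleGenerativeAI` path, and everything else goes through `ChatLiteLLM` with the API key read from the environment variable named by `api_key_label`. A minimal sketch of exercising that selection directly; the Anthropic model name and key label are assumptions for illustration, and in normal use `generate_qa_content` constructs the backend for you:

```python
import os
from podcastfy.content_generator import LLMBackend

# Assumed for illustration: the key must already be exported (e.g., via .env).
assert "ANTHROPIC_API_KEY" in os.environ

backend = LLMBackend(
    is_local=False,                            # False -> use a cloud model
    temperature=0,                             # mirrors the "creativity" default
    max_output_tokens=8192,                    # mirrors the config default
    model_name="claude-3-5-sonnet-20241022",   # assumed LiteLLM-supported model
    api_key_label="ANTHROPIC_API_KEY",         # env var read for the API key
)

# Not a Gemini name and not local, so backend.llm is a ChatLiteLLM instance;
# a "gemini-*" model_name would use ChatGoogleGenerativeAI, and is_local=True
# would use Llamafile.
print(type(backend.llm).__name__)
```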
1 change: 1 addition & 0 deletions pyproject.toml
@@ -46,6 +46,7 @@ ffmpeg = "^1.4"
pytest = "^8.3.3"
pytest-xdist = "^3.6.1"
google-cloud-texttospeech = "^2.21.0"
litellm = "^1.52.0"


[tool.poetry.group.dev.dependencies]
11 changes: 10 additions & 1 deletion requirements.txt
@@ -25,7 +25,9 @@ elevenlabs==1.10.0 ; python_version >= "3.11" and python_version < "4.0"
execnet==2.1.1 ; python_version >= "3.11" and python_version < "4.0"
fastjsonschema==2.20.0 ; python_version >= "3.11" and python_version < "4.0"
ffmpeg==1.4 ; python_version >= "3.11" and python_version < "4.0"
filelock==3.16.1 ; python_version >= "3.11" and python_version < "4.0"
frozenlist==1.5.0 ; python_version >= "3.11" and python_version < "4.0"
fsspec==2024.10.0 ; python_version >= "3.11" and python_version < "4.0"
fuzzywuzzy==0.18.0 ; python_version >= "3.11" and python_version < "4.0"
google-ai-generativelanguage==0.6.10 ; python_version >= "3.11" and python_version < "4.0"
google-api-core==2.21.0 ; python_version >= "3.11" and python_version < "4.0"
@@ -53,8 +55,10 @@ httpcore==1.0.6 ; python_version >= "3.11" and python_version < "4.0"
httplib2==0.22.0 ; python_version >= "3.11" and python_version < "4.0"
httpx-sse==0.4.0 ; python_version >= "3.11" and python_version < "4.0"
httpx==0.27.2 ; python_version >= "3.11" and python_version < "4.0"
huggingface-hub==0.26.2 ; python_version >= "3.11" and python_version < "4.0"
idna==3.10 ; python_version >= "3.11" and python_version < "4.0"
imagesize==1.4.1 ; python_version >= "3.11" and python_version < "4.0"
importlib-metadata==8.5.0 ; python_version >= "3.11" and python_version < "4.0"
iniconfig==2.0.0 ; python_version >= "3.11" and python_version < "4.0"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "4.0"
jiter==0.6.1 ; python_version >= "3.11" and python_version < "4.0"
@@ -73,6 +77,7 @@ langchain-text-splitters==0.3.0 ; python_version >= "3.11" and python_version <
langchain==0.3.4 ; python_version >= "3.11" and python_version < "4.0"
langsmith==0.1.137 ; python_version >= "3.11" and python_version < "4.0"
levenshtein==0.26.0 ; python_version >= "3.11" and python_version < "4.0"
litellm==1.52.0 ; python_version >= "3.11" and python_version < "4.0"
markdown-it-py==3.0.0 ; python_version >= "3.11" and python_version < "4.0"
markupsafe==3.0.2 ; python_version >= "3.11" and python_version < "4.0"
marshmallow==3.23.0 ; python_version >= "3.11" and python_version < "4.0"
@@ -86,7 +91,7 @@ nbformat==5.10.4 ; python_version >= "3.11" and python_version < "4.0"
nbsphinx==0.9.5 ; python_version >= "3.11" and python_version < "4.0"
nest-asyncio==1.6.0 ; python_version >= "3.11" and python_version < "4.0"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "4.0"
openai==1.52.2 ; python_version >= "3.11" and python_version < "4.0"
openai==1.54.2 ; python_version >= "3.11" and python_version < "4.0"
orjson==3.10.10 ; python_version >= "3.11" and python_version < "4.0"
packaging==24.1 ; python_version >= "3.11" and python_version < "4.0"
pandas==2.2.3 ; python_version >= "3.11" and python_version < "4.0"
@@ -120,6 +125,7 @@ pyyaml==6.0.2 ; python_version >= "3.11" and python_version < "4.0"
pyzmq==26.2.0 ; python_version >= "3.11" and python_version < "4.0"
rapidfuzz==3.10.1 ; python_version >= "3.11" and python_version < "4.0"
referencing==0.35.1 ; python_version >= "3.11" and python_version < "4.0"
regex==2024.9.11 ; python_version >= "3.11" and python_version < "4.0"
requests-toolbelt==1.0.0 ; python_version >= "3.11" and python_version < "4.0"
requests==2.32.3 ; python_version >= "3.11" and python_version < "4.0"
rich==13.9.3 ; python_version >= "3.11" and python_version < "4.0"
@@ -144,7 +150,9 @@ sphinxcontrib-qthelp==2.0.0 ; python_version >= "3.11" and python_version < "4.0
sphinxcontrib-serializinghtml==2.0.0 ; python_version >= "3.11" and python_version < "4.0"
sqlalchemy==2.0.36 ; python_version >= "3.11" and python_version < "4.0"
tenacity==9.0.0 ; python_version >= "3.11" and python_version < "4.0"
tiktoken==0.8.0 ; python_version >= "3.11" and python_version < "4.0"
tinycss2==1.4.0 ; python_version >= "3.11" and python_version < "4.0"
tokenizers==0.20.3 ; python_version >= "3.11" and python_version < "4.0"
tornado==6.4.1 ; python_version >= "3.11" and python_version < "4.0"
tqdm==4.66.5 ; python_version >= "3.11" and python_version < "4.0"
traitlets==5.14.3 ; python_version >= "3.11" and python_version < "4.0"
@@ -160,3 +168,4 @@ websockets==13.1 ; python_version >= "3.11" and python_version < "4.0"
wheel==0.44.0 ; python_version >= "3.11" and python_version < "4.0"
yarl==1.16.0 ; python_version >= "3.11" and python_version < "4.0"
youtube-transcript-api==0.6.2 ; python_version >= "3.11" and python_version < "4.0"
zipp==3.20.2 ; python_version >= "3.11" and python_version < "4.0"