From 6a255943b4768f7fb606d013932d8533d8f638d4 Mon Sep 17 00:00:00 2001 From: Rohit Prasad Date: Wed, 13 Nov 2024 14:46:54 -0800 Subject: [PATCH] Cleaning up old file. (#48) --- examples/RAG.ipynb | 286 --------------------------------------------- 1 file changed, 286 deletions(-) delete mode 100644 examples/RAG.ipynb diff --git a/examples/RAG.ipynb b/examples/RAG.ipynb deleted file mode 100644 index 0f9cd445..00000000 --- a/examples/RAG.ipynb +++ /dev/null @@ -1,286 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:09:30.151056Z", - "start_time": "2024-07-28T19:09:30.144028Z" - } - }, - "source": [ - "# load api keys from the `.env` file. An example of this can be found in `.env.sample`\n", - "from dotenv import load_dotenv; load_dotenv()" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 9 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T18:59:06.786489Z", - "start_time": "2024-07-28T18:56:34.662611Z" - } - }, - "source": [ - "# Get Data\n", - "#import csv\n", - "\n", - "from datasets import load_dataset\n", - "ds = load_dataset(\"stanfordnlp/sentiment140\")\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Downloading data: 100%|██████████| 81.4M/81.4M [01:55<00:00, 706kB/s] \n", - "Generating train split: 100%|██████████| 1600000/1600000 [00:34<00:00, 46744.97 examples/s]\n", - "Generating test split: 100%|██████████| 498/498 [00:00<00:00, 30145.67 examples/s]\n" - ] - } - ], - "execution_count": 3 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T18:59:53.134465Z", - "start_time": "2024-07-28T18:59:53.114471Z" - } - }, - "source": [ - "docs = ds['train'][:2000]['text']" - ], - "outputs": [], - "execution_count": 4 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:02:35.199055Z", - "start_time": "2024-07-28T18:59:58.617763Z" - } - }, - "source": [ - "# Get our encoder to encode \n", - "from sentence_transformers import SentenceTransformer \n", - "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n", - "data_emb = model.encode(docs) #16 seconds on M1 Mac 8gb\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/ksolo/Library/Caches/pypoetry/virtualenvs/aisuite-HUywTnIy-py3.12/lib/python3.12/site-packages/sentence_transformers/evaluation/SentenceEvaluator.py:81: SyntaxWarning: invalid escape sequence '\\g'\n", - " return re.sub(r\"([a-z])([A-Z])\", \"\\g<1> \\g<2>\", class_name)\n", - "/Users/ksolo/Library/Caches/pypoetry/virtualenvs/aisuite-HUywTnIy-py3.12/lib/python3.12/site-packages/sentence_transformers/model_card.py:524: SyntaxWarning: invalid escape sequence '\\d'\n", - " if dataset_name and re.match(\"_dataset_\\d+\", dataset_name):\n", - "/Users/ksolo/Library/Caches/pypoetry/virtualenvs/aisuite-HUywTnIy-py3.12/lib/python3.12/site-packages/sentence_transformers/losses/DenoisingAutoEncoderLoss.py:16: SyntaxWarning: invalid escape sequence '\\_'\n", - " \"\"\"\n" - ] - } - ], - "execution_count": 5 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:03:06.618999Z", - "start_time": "2024-07-28T19:03:03.296772Z" - } - }, - "source": [ - "# Now set up the vector store to accept the data\n", - "\n", - "import chromadb\n", - 
"\n", - "chroma_client = chromadb.Client()\n", - "collection = chroma_client.create_collection(name=\"SampleDB\")\n", - "\n", - "collection.add(\n", - " embeddings=data_emb.tolist(),\n", - " documents=docs,\n", - " ids=[str(idx) for idx in range(len(data_emb))]) # Doc ID's are required\n", - "\n" - ], - "outputs": [], - "execution_count": 6 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:03:49.751009Z", - "start_time": "2024-07-28T19:03:49.727998Z" - } - }, - "source": [ - "#prep the question in the encoding space\n", - "\n", - "question = 'What is the status of the chalkboard?'\n", - "question_emb = model.encode(question)\n", - "\n", - "results = collection.query(query_embeddings=question_emb.tolist(), n_results=10)\n", - "\n", - "context = ' '.join(results['documents'][0]) # Pulling out a lists of lists\n" - ], - "outputs": [], - "execution_count": 7 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:04:36.973296Z", - "start_time": "2024-07-28T19:04:36.952794Z" - } - }, - "source": [ - "# Call the models to determine the answer by response\n", - "\n", - "prompt = f'Given the following data, Please answer the question: \\n\\n ##question \\n {question}\\n\\n ##context \\n {context}'\n", - "\n", - "import aisuite as ai\n", - "client = ai.Client()\n", - "\n", - "messages = [\n", - " {\"role\": \"system\", \"content\": \"You are a helpful agent, who answers with brevity. \"},\n", - " {\"role\": \"user\", \"content\": prompt},\n", - "]\n" - ], - "outputs": [], - "execution_count": 8 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:09:41.132595Z", - "start_time": "2024-07-28T19:09:40.634038Z" - } - }, - "source": [ - "#groq_llama3_8b = \"groq:llama3-8b-8192\" \n", - "response = client.chat.completions.create(model=\"groq:llama3-70b-8192\", messages=messages)\n", - "\n", - "print(response.choices[0].message.content)\n" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The status of the chalkboard is: USELESS (because there is no chalk).\n" - ] - } - ], - "execution_count": 10 - }, - { - "cell_type": "code", - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:10:50.448134Z", - "start_time": "2024-07-28T19:10:50.443295Z" - } - }, - "source": [ - "results['documents'][0][0]" - ], - "outputs": [ - { - "data": { - "text/plain": [ - "\"Damn... I don't have any chalk! 
MY CHALKBOARD IS USELESS \"" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "execution_count": 11 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-07-28T19:11:15.766909Z", - "start_time": "2024-07-28T19:11:12.732107Z" - } - }, - "cell_type": "code", - "source": [ - "response = client.chat.completions.create(model=\"anthropic:claude-3-opus-20240229\", messages=messages)\n", - "print(response.choices[0].message.content)" - ], - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Based on the context provided, the status of the chalkboard is useless because the person does not have any chalk.\n" - ] - } - ], - "execution_count": 12 - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.1" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}
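
For reference, the removed examples/RAG.ipynb walked through a minimal retrieval-augmented
generation (RAG) flow: embed a slice of the sentiment140 tweets with sentence-transformers,
index them in an in-memory Chroma collection, retrieve the nearest documents for a question,
and have an LLM answer from that context via aisuite. The condensed sketch below is reassembled
from the deleted cells only and has not been re-run; the model names, the "SampleDB" collection
name, and the sample question all come from the notebook and can be swapped out.

    # Condensed RAG sketch reconstructed from the deleted examples/RAG.ipynb cells.
    # Assumes provider API keys are set in a local .env file (see .env.sample).
    from dotenv import load_dotenv
    from datasets import load_dataset
    from sentence_transformers import SentenceTransformer
    import chromadb
    import aisuite as ai

    load_dotenv()

    # Use a small slice of the sentiment140 tweets as the document corpus.
    ds = load_dataset("stanfordnlp/sentiment140")
    docs = ds["train"][:2000]["text"]

    # Embed the documents with a compact sentence-transformer model.
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    data_emb = model.encode(docs)

    # Index the embeddings in an in-memory Chroma collection (document IDs are required).
    collection = chromadb.Client().create_collection(name="SampleDB")
    collection.add(
        embeddings=data_emb.tolist(),
        documents=docs,
        ids=[str(idx) for idx in range(len(data_emb))],
    )

    # Embed the question, retrieve the 10 nearest documents, and join them into one context string.
    question = "What is the status of the chalkboard?"
    question_emb = model.encode(question)
    results = collection.query(query_embeddings=question_emb.tolist(), n_results=10)
    context = " ".join(results["documents"][0])

    # Ask an LLM to answer the question from the retrieved context via aisuite.
    prompt = f"Given the following data, please answer the question:\n\n##question\n{question}\n\n##context\n{context}"
    messages = [
        {"role": "system", "content": "You are a helpful agent, who answers with brevity."},
        {"role": "user", "content": prompt},
    ]
    client = ai.Client()
    response = client.chat.completions.create(model="groq:llama3-70b-8192", messages=messages)
    print(response.choices[0].message.content)

As in the notebook, switching providers only means changing the model string, e.g.
"anthropic:claude-3-opus-20240229" instead of "groq:llama3-70b-8192".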