diff --git a/src/llm_utilikit/LangChain/notebooks/langchain-embeddings-retrieval-agent.ipynb b/src/llm_utilikit/LangChain/notebooks/langchain-embeddings-retrieval-agent.ipynb index 97b4e31..75d486b 100644 --- a/src/llm_utilikit/LangChain/notebooks/langchain-embeddings-retrieval-agent.ipynb +++ b/src/llm_utilikit/LangChain/notebooks/langchain-embeddings-retrieval-agent.ipynb @@ -3,8 +3,8 @@ { "cell_type": "markdown", "metadata": { - "id": "view-in-github", - "colab_type": "text" + "colab_type": "text", + "id": "view-in-github" }, "source": [ "\"Open" @@ -44,84 +44,38 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { - "id": "pva9ehKXUpU2", "colab": { "base_uri": "https://localhost:8080/" }, + "id": "pva9ehKXUpU2", "outputId": "21af5614-b078-415d-8aa3-9efd125b4757" }, "outputs": [ { - "output_type": "stream", "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", "text": [ - "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/72.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m71.7/72.0 kB\u001b[0m \u001b[31m2.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.0/72.0 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.2/177.2 kB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m770.9/770.9 kB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.5/62.5 kB\u001b[0m \u001b[31m5.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m300.4/300.4 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.1/1.1 MB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m30.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m16.4/16.4 MB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m34.9/34.9 MB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m90.0/90.0 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.4/49.4 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.5/224.5 kB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m223.6/223.6 kB\u001b[0m \u001b[31m25.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m223.0/223.0 kB\u001b[0m \u001b[31m23.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m218.0/218.0 kB\u001b[0m \u001b[31m24.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m218.0/218.0 kB\u001b[0m \u001b[31m24.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m211.7/211.7 kB\u001b[0m \u001b[31m23.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m30.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.4/73.4 kB\u001b[0m \u001b[31m9.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.1/11.1 MB\u001b[0m \u001b[31m42.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m143.4/143.4 kB\u001b[0m \u001b[31m17.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.4/121.4 kB\u001b[0m \u001b[31m15.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m120.3/120.3 kB\u001b[0m \u001b[31m15.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.6/115.6 kB\u001b[0m \u001b[31m14.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.5/115.5 kB\u001b[0m \u001b[31m12.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.1/115.1 kB\u001b[0m \u001b[31m14.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.6/114.6 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", - "google-cloud-bigquery 3.10.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-bigquery-connection 1.12.1 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-bigquery-connection 1.12.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-bigquery-storage 2.22.0 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-bigquery-storage 2.22.0 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-datastore 2.15.2 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-datastore 2.15.2 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-firestore 2.11.1 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-firestore 2.11.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-functions 1.13.3 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-functions 1.13.3 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-language 2.9.1 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-language 2.9.1 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-cloud-translate 3.11.3 requires google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.10.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,!=2.8.*,!=2.9.*,<3.0.0dev,>=1.34.0, but you have google-api-core 2.8.2 which is incompatible.\n", - "google-cloud-translate 3.11.3 requires protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "google-colab 1.0.0 requires pandas==1.5.3, but you have pandas 2.1.1 which is incompatible.\n", - "grpc-google-iam-v1 0.12.6 requires protobuf!=3.20.0,!=3.20.1,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5, but you have protobuf 3.19.3 which is incompatible.\n", - "pandas-gbq 0.17.9 requires pyarrow<10.0dev,>=3.0.0, but you have pyarrow 11.0.0 which is incompatible.\n", - "tensorboard 2.13.0 requires protobuf>=3.19.6, but you have protobuf 3.19.3 which is incompatible.\n", - "tensorflow 2.13.0 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 3.19.3 which is incompatible.\n", - "tensorflow-datasets 4.9.3 requires protobuf>=3.20, but you have protobuf 3.19.3 which is incompatible.\n", - "tensorflow-hub 0.14.0 requires protobuf>=3.19.6, but you have protobuf 3.19.3 which is incompatible.\n", - "tensorflow-metadata 1.14.0 requires protobuf<4.21,>=3.20.3, but you have protobuf 3.19.3 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0m" + " WARNING: Failed to remove contents in a temporary directory 'C:\\Users\\dae\\.vscode\\Software\\.venv\\Lib\\site-packages\\google\\~rotobuf'.\n", + " You can safely remove it manually.\n" ] } ], "source": [ - "!pip install -qU \\\n", - " openai==0.27.7 \\\n", - " \"pinecone-client[grpc]\"==2.2.1 \\\n", - " pinecone-datasets==0.5.1 \\\n", - " langchain==0.0.162 \\\n", - " tiktoken==0.4.0" + "%pip install -qU \\\n", + " openai \\\n", + " \"pinecone-client[grpc]\" \\\n", + " pinecone-datasets \\\n", + " langchain \\\n", + " tiktoken" ] }, { @@ -144,7 +98,54 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['ANN_DEEP1B_d96_angular',\n", + " 'ANN_Fashion-MNIST_d784_euclidean',\n", + " 'ANN_GIST_d960_euclidean',\n", + " 'ANN_GloVe_d100_angular',\n", + " 'ANN_GloVe_d200_angular',\n", + " 'ANN_GloVe_d25_angular',\n", + " 'ANN_GloVe_d50_angular',\n", + " 'ANN_LastFM_d64_angular',\n", + " 'ANN_MNIST_d784_euclidean',\n", + " 'ANN_NYTimes_d256_angular',\n", + " 'ANN_SIFT1M_d128_euclidean',\n", + " 'amazon_toys_quora_all-MiniLM-L6-bm25',\n", + " 'it-threat-data-test',\n", + " 'it-threat-data-train',\n", + " 'langchain-python-docs-text-embedding-ada-002',\n", + " 'movielens-user-ratings',\n", + " 'msmarco-v1-bm25-allMiniLML6V2',\n", + " 'quora_all-MiniLM-L6-bm25-100K',\n", + " 'quora_all-MiniLM-L6-bm25',\n", + " 'quora_all-MiniLM-L6-v2_Splade-100K',\n", + " 'quora_all-MiniLM-L6-v2_Splade',\n", + " 'squad-text-embedding-ada-002',\n", + " 'wikipedia-simple-text-embedding-ada-002-100K',\n", + " 'wikipedia-simple-text-embedding-ada-002',\n", + " 'youtube-transcripts-text-embedding-ada-002']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from pinecone_datasets import load_dataset, list_datasets\n", + "\n", + "# Check available datasets\n", + "list_datasets()" + ] + }, + { + "cell_type": "code", + "execution_count": 10, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -155,33 +156,103 @@ }, "outputs": [ { - "output_type": "execute_result", + "name": "stderr", + "output_type": "stream", + "text": [ + "_request non-retriable exception: Invalid bucket name: 'pinecone-datasets-dev\\squad-text-embedding-ada-002', 400\n", + "Traceback (most recent call last):\n", + " File \"c:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\retry.py\", line 123, in retry_request\n", + " return await func(*args, **kwargs)\n", + " File \"c:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\core.py\", line 430, in _request\n", + " validate_response(status, contents, path, args)\n", + " File \"c:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\retry.py\", line 110, in validate_response\n", + " raise HttpError(error)\n", + "gcsfs.retry.HttpError: Invalid bucket name: 'pinecone-datasets-dev\\squad-text-embedding-ada-002', 400\n" + ] + }, + { + "ename": "HttpError", + "evalue": "Invalid bucket name: 'pinecone-datasets-dev\\squad-text-embedding-ada-002', 400", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mHttpError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[10], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m dataset \u001b[38;5;241m=\u001b[39m \u001b[43mload_dataset\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43msquad-text-embedding-ada-002\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[0;32m 2\u001b[0m dataset\u001b[38;5;241m.\u001b[39mhead()\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\pinecone_datasets\\public.py:59\u001b[0m, in \u001b[0;36mload_dataset\u001b[1;34m(dataset_id, **kwargs)\u001b[0m\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not found in catalog\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 58\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m---> 59\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Dataset\u001b[38;5;241m.\u001b[39mfrom_catalog(dataset_id, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\pinecone_datasets\\dataset.py:112\u001b[0m, in \u001b[0;36mDataset.from_catalog\u001b[1;34m(cls, dataset_id, catalog_base_path, **kwargs)\u001b[0m\n\u001b[0;32m 106\u001b[0m catalog_base_path \u001b[38;5;241m=\u001b[39m (\n\u001b[0;32m 107\u001b[0m catalog_base_path\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m catalog_base_path\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m os\u001b[38;5;241m.\u001b[39menviron\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDATASETS_CATALOG_BASEPATH\u001b[39m\u001b[38;5;124m\"\u001b[39m, cfg\u001b[38;5;241m.\u001b[39mStorage\u001b[38;5;241m.\u001b[39mendpoint)\n\u001b[0;32m 110\u001b[0m )\n\u001b[0;32m 111\u001b[0m dataset_path \u001b[38;5;241m=\u001b[39m os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(catalog_base_path, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mdataset_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 112\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mcls\u001b[39m(dataset_path\u001b[38;5;241m=\u001b[39mdataset_path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\pinecone_datasets\\dataset.py:212\u001b[0m, in \u001b[0;36mDataset.__init__\u001b[1;34m(self, dataset_path, **kwargs)\u001b[0m\n\u001b[0;32m 210\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_fs \u001b[38;5;241m=\u001b[39m get_cloud_fs(endpoint, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 211\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_dataset_path \u001b[38;5;241m=\u001b[39m dataset_path\n\u001b[1;32m--> 212\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_fs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexists\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_dataset_path\u001b[49m\u001b[43m)\u001b[49m:\n\u001b[0;32m 213\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m(\n\u001b[0;32m 214\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDataset does not exist. Please check the path or dataset_id\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 215\u001b[0m )\n\u001b[0;32m 216\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\fsspec\\asyn.py:118\u001b[0m, in \u001b[0;36msync_wrapper..wrapper\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 115\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[0;32m 116\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mwrapper\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 117\u001b[0m \u001b[38;5;28mself\u001b[39m \u001b[38;5;241m=\u001b[39m obj \u001b[38;5;129;01mor\u001b[39;00m args[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m--> 118\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m sync(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mloop, func, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\fsspec\\asyn.py:103\u001b[0m, in \u001b[0;36msync\u001b[1;34m(loop, func, timeout, *args, **kwargs)\u001b[0m\n\u001b[0;32m 101\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m FSTimeoutError \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mreturn_result\u001b[39;00m\n\u001b[0;32m 102\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(return_result, \u001b[38;5;167;01mBaseException\u001b[39;00m):\n\u001b[1;32m--> 103\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m return_result\n\u001b[0;32m 104\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m 105\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m return_result\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\fsspec\\asyn.py:56\u001b[0m, in \u001b[0;36m_runner\u001b[1;34m(event, coro, result, timeout)\u001b[0m\n\u001b[0;32m 54\u001b[0m coro \u001b[38;5;241m=\u001b[39m asyncio\u001b[38;5;241m.\u001b[39mwait_for(coro, timeout\u001b[38;5;241m=\u001b[39mtimeout)\n\u001b[0;32m 55\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m---> 56\u001b[0m result[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m coro\n\u001b[0;32m 57\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mException\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m ex:\n\u001b[0;32m 58\u001b[0m result[\u001b[38;5;241m0\u001b[39m] \u001b[38;5;241m=\u001b[39m ex\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\fsspec\\asyn.py:677\u001b[0m, in \u001b[0;36mAsyncFileSystem._exists\u001b[1;34m(self, path, **kwargs)\u001b[0m\n\u001b[0;32m 675\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_exists\u001b[39m(\u001b[38;5;28mself\u001b[39m, path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[0;32m 676\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 677\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_info(path, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 678\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[0;32m 679\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m:\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\core.py:926\u001b[0m, in \u001b[0;36mGCSFileSystem._info\u001b[1;34m(self, path, generation, **kwargs)\u001b[0m\n\u001b[0;32m 924\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m path:\n\u001b[0;32m 925\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m--> 926\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_call(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mGET\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m, json_out\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n\u001b[0;32m 927\u001b[0m out\u001b[38;5;241m.\u001b[39mupdate(size\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m0\u001b[39m, \u001b[38;5;28mtype\u001b[39m\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdirectory\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 928\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mOSError\u001b[39;00m:\n\u001b[0;32m 929\u001b[0m \u001b[38;5;66;03m# GET bucket failed, try ls; will have no metadata\u001b[39;00m\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\core.py:437\u001b[0m, in \u001b[0;36mGCSFileSystem._call\u001b[1;34m(self, method, path, json_out, info_out, *args, **kwargs)\u001b[0m\n\u001b[0;32m 433\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_call\u001b[39m(\n\u001b[0;32m 434\u001b[0m \u001b[38;5;28mself\u001b[39m, method, path, \u001b[38;5;241m*\u001b[39margs, json_out\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, info_out\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[0;32m 435\u001b[0m ):\n\u001b[0;32m 436\u001b[0m logger\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mmethod\u001b[38;5;241m.\u001b[39mupper()\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00margs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m, \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkwargs\u001b[38;5;241m.\u001b[39mget(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mheaders\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 437\u001b[0m status, headers, info, contents \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_request(\n\u001b[0;32m 438\u001b[0m method, path, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs\n\u001b[0;32m 439\u001b[0m )\n\u001b[0;32m 440\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m json_out:\n\u001b[0;32m 441\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m json\u001b[38;5;241m.\u001b[39mloads(contents)\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\decorator.py:221\u001b[0m, in \u001b[0;36mdecorate..fun\u001b[1;34m(*args, **kw)\u001b[0m\n\u001b[0;32m 219\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m kwsyntax:\n\u001b[0;32m 220\u001b[0m args, kw \u001b[38;5;241m=\u001b[39m fix(args, kw, sig)\n\u001b[1;32m--> 221\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m caller(func, \u001b[38;5;241m*\u001b[39m(extras \u001b[38;5;241m+\u001b[39m args), \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkw)\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\retry.py:158\u001b[0m, in \u001b[0;36mretry_request\u001b[1;34m(func, retries, *args, **kwargs)\u001b[0m\n\u001b[0;32m 156\u001b[0m \u001b[38;5;28;01mcontinue\u001b[39;00m\n\u001b[0;32m 157\u001b[0m logger\u001b[38;5;241m.\u001b[39mexception(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m non-retriable exception: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00me\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m--> 158\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\retry.py:123\u001b[0m, in \u001b[0;36mretry_request\u001b[1;34m(func, retries, *args, **kwargs)\u001b[0m\n\u001b[0;32m 121\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m retry \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[0;32m 122\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m asyncio\u001b[38;5;241m.\u001b[39msleep(\u001b[38;5;28mmin\u001b[39m(random\u001b[38;5;241m.\u001b[39mrandom() \u001b[38;5;241m+\u001b[39m \u001b[38;5;241m2\u001b[39m \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m (retry \u001b[38;5;241m-\u001b[39m \u001b[38;5;241m1\u001b[39m), \u001b[38;5;241m32\u001b[39m))\n\u001b[1;32m--> 123\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[0;32m 124\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m (\n\u001b[0;32m 125\u001b[0m HttpError,\n\u001b[0;32m 126\u001b[0m requests\u001b[38;5;241m.\u001b[39mexceptions\u001b[38;5;241m.\u001b[39mRequestException,\n\u001b[1;32m (...)\u001b[0m\n\u001b[0;32m 129\u001b[0m aiohttp\u001b[38;5;241m.\u001b[39mclient_exceptions\u001b[38;5;241m.\u001b[39mClientError,\n\u001b[0;32m 130\u001b[0m ) \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[0;32m 131\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[0;32m 132\u001b[0m \u001b[38;5;28misinstance\u001b[39m(e, HttpError)\n\u001b[0;32m 133\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m e\u001b[38;5;241m.\u001b[39mcode \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m400\u001b[39m\n\u001b[0;32m 134\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mrequester pays\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m e\u001b[38;5;241m.\u001b[39mmessage\n\u001b[0;32m 135\u001b[0m ):\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\core.py:430\u001b[0m, in \u001b[0;36mGCSFileSystem._request\u001b[1;34m(self, method, path, headers, json, data, *args, **kwargs)\u001b[0m\n\u001b[0;32m 427\u001b[0m info \u001b[38;5;241m=\u001b[39m r\u001b[38;5;241m.\u001b[39mrequest_info \u001b[38;5;66;03m# for debug only\u001b[39;00m\n\u001b[0;32m 428\u001b[0m contents \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m r\u001b[38;5;241m.\u001b[39mread()\n\u001b[1;32m--> 430\u001b[0m \u001b[43mvalidate_response\u001b[49m\u001b[43m(\u001b[49m\u001b[43mstatus\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcontents\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpath\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 431\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m status, headers, info, contents\n", + "File \u001b[1;32mc:\\Users\\dae\\.vscode\\Software\\.venv\\lib\\site-packages\\gcsfs\\retry.py:110\u001b[0m, in \u001b[0;36mvalidate_response\u001b[1;34m(status, content, path, args)\u001b[0m\n\u001b[0;32m 108\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mBad Request: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mpath\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;132;01m{\u001b[39;00mmsg\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[0;32m 109\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m error:\n\u001b[1;32m--> 110\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HttpError(error)\n\u001b[0;32m 111\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m status:\n\u001b[0;32m 112\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m HttpError({\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcode\u001b[39m\u001b[38;5;124m\"\u001b[39m: status, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmessage\u001b[39m\u001b[38;5;124m\"\u001b[39m: msg}) \u001b[38;5;66;03m# text-like\u001b[39;00m\n", + "\u001b[1;31mHttpError\u001b[0m: Invalid bucket name: 'pinecone-datasets-dev\\squad-text-embedding-ada-002', 400" + ] + } + ], + "source": [ + "dataset = load_dataset(\"squad-text-embedding-ada-002\")\n", + "dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "K5Q16wRH9SmO", + "outputId": "fdf2947f-a270-4417-a02b-057e62356dfe" + }, + "outputs": [ + { "data": { "text/plain": [ - " id \\\n", - "0 5733be284776f41900661182 \n", - "1 5733bf84d058e614000b61be \n", - "2 5733bed24776f41900661188 \n", - "3 5733a6424776f41900660f51 \n", - "4 5733a70c4776f41900660f64 \n", - "\n", - " values sparse_values \\\n", - "0 [-0.010262451963272523, 0.02222637996192584, -... None \n", - "1 [-0.009786712423983223, -0.013988726438873078,... None \n", - "2 [0.013343917696606181, -0.0007001232846109822,... None \n", - "3 [-0.0085222901071539, 0.004399558219521822, -0... None \n", - "4 [-0.006695996885869355, -0.02067068565761649, ... None \n", - "\n", - " metadata blob \n", - "0 {'text': 'Architecturally, the school has a Ca... None \n", - "1 {'text': 'As at most other universities, Notre... None \n", - "2 {'text': 'The university is the major seat of ... None \n", - "3 {'text': 'The College of Engineering was estab... None \n", - "4 {'text': 'All of Notre Dame's undergraduate st... None " - ], + "18891" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dataset)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "c3-Plec39SmO" + }, + "source": [ + "We'll format the dataset ready for upsert and reduce what we use to a subset of the full dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "4CW5mNi89SmO", + "outputId": "b7485e0d-1aa6-4f2b-840e-fdef19d58ffc" + }, + "outputs": [ + { + "data": { "text/html": [ "\n", - "
\n", + "
\n", "
\n", "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idvaluesmetadata
05733be284776f41900661182[-0.010262451963272523, 0.02222637996192584, -...{'text': 'Architecturally, the school has a Ca...
15733bf84d058e614000b61be[-0.009786712423983223, -0.013988726438873078,...{'text': 'As at most other universities, Notre...
25733bed24776f41900661188[0.013343917696606181, -0.0007001232846109822,...{'text': 'The university is the major seat of ...
35733a6424776f41900660f51[-0.0085222901071539, 0.004399558219521822, -0...{'text': 'The College of Engineering was estab...
45733a70c4776f41900660f64[-0.006695996885869355, -0.02067068565761649, ...{'text': 'All of Notre Dame's undergraduate st...
\n", - "
\n", - "
\n", - "\n", - "
\n", - " \n", - "\n", - " \n", - "\n", - " \n", - "
\n", - "\n", - "\n", - "
\n", - " \n", - "\n", - "\n", - "\n", - " \n", - "
\n", - "
\n", - "
\n" - ] - }, - "metadata": {}, - "execution_count": 4 - } - ], - "source": [ - "# we drop sparse_values as they are not needed for this example\n", - "dataset.documents.drop(['sparse_values', 'blob'], axis=1, inplace=True)\n", - "\n", - "dataset.head()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "B2_Pt7N6Zg2X" - }, - "source": [ - "## Vector Database" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JQTfOTR6aBRS" - }, - "source": [ - "Next we initialize the vector database. For this we need a [free API key](https://app.pinecone.io/), then we create the index:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lgfywcQj9SmP" - }, - "outputs": [], - "source": [ - "index_name = 'langchain-retrieval-agent-fast'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C3wrG-9yaJel" - }, - "outputs": [], - "source": [ - "import pinecone\n", - "import os\n", - "\n", - "# Load Pinecone API key\n", - "api_key = os.getenv('PINECONE_API_KEY') or 'api_key'\n", - "# Set Pinecone environment. Find next to API key in console\n", - "env = os.getenv('PINECONE_ENVIRONMENT') or \"us-central1-gcp\"\n", - "\n", - "pinecone.init(api_key=api_key, environment=env)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "D5WT4PAN9SmP" - }, - "outputs": [], - "source": [ - "import time\n", - "\n", - "if index_name in pinecone.list_indexes():\n", - " pinecone.delete_index(index_name)\n", - "\n", - "# we create a new index\n", - "pinecone.create_index(\n", - " name=index_name,\n", - " metric='dotproduct',\n", - " dimension=1536 # 1536 dim of text-embedding-ada-002\n", - ")\n", - "\n", - "# wait for index to be initialized\n", - "while not pinecone.describe_index(index_name).status['ready']:\n", - " time.sleep(1)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "uiSWrAQ5aRco" - }, - "source": [ - "Then connect to the index:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "bfsfuFmqaS4G", - "outputId": "45f17443-b87a-4682-ab44-6cfd6efdc46c" - }, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "{'dimension': 1536,\n", - " 'index_fullness': 0.0,\n", - " 'namespaces': {},\n", - " 'total_vector_count': 0}" - ] - }, - "metadata": {}, - "execution_count": 12 - } - ], - "source": [ - "index = pinecone.GRPCIndex(index_name)\n", - "index.describe_index_stats()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "QbDTrvvm9SmP" - }, - "source": [ - "We should see that the new Pinecone index has a `total_vector_count` of `0`, as we haven't added any vectors yet.\n", - "\n", - "Now we upsert the data to Pinecone:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 98, - "referenced_widgets": [ - "d7b2791e5f3d4c68b02da4123f715a72", - "e4e2a2e10c684ac7bbf102bad235464f", - "5290c01d786b4baf8b5e9adfe1a5befe", - "f073c54fdece48c0817f21f0970621d9", - "c593df22c7294a078a8d036d10e1c117", - "2a15af3253884c7cb97c6c6f3dd21e3f", - "bb3e80cb30214f6b80c4316606761d34", - "1a90f0bf4c8e4aabb8e346c3d9cfdff6", - "6e332262dd2944a68e3415bc827f1407", - "ded11ea7cc6b4c8aa6acbe8d03a6f742", - "bd0b82bd40b0418a8e326bec7e1cfe9e", - "2868c074bd55491a92000c7cd363ce6b", - "be04454d283147d79e353c1ab24b8573", - "89c1ae7c90004a6bae1e5aeedb19fa8c", - "b22dd946e0ac4a0abfb27efcf811c790", - "dd464906d8ab4900916b35dd3e779d46", - "4b2dd63f4b5e4a40ab5ec52826cc5bb3", - "87258691c1e041219045522dcf52bc52", - "9f125e0287ed46ebba34a0d26cdfb8cc", - "795dabbc25bb426c8756a36b5778f572", - "d9dd4543607840bfb4e813e801549c66", - "7e51d478a2e14ff09853d93c102ff40f" - ] - }, - "id": "AhDcbRGTaWPi", - "outputId": "14b0b058-fa02-4078-83b9-7c3067edf613" - }, - "outputs": [ - { - "output_type": "display_data", "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d7b2791e5f3d4c68b02da4123f715a72", + "version_major": 2, + "version_minor": 0 + }, "text/plain": [ "sending upsert requests: 0%| | 0/18891 [00:00