diff --git a/applications/rag/example_notebooks/ingest_database.ipynb b/applications/rag/example_notebooks/ingest_database.ipynb
deleted file mode 100644
index 04371a7b6..000000000
--- a/applications/rag/example_notebooks/ingest_database.ipynb
+++ /dev/null
@@ -1 +0,0 @@
# %% Download the Netflix shows dataset from Kaggle
import os
os.environ['KAGGLE_USERNAME'] = ""
os.environ['KAGGLE_KEY'] = ""

# Download the zip file to local storage, then extract the desired contents
# directly into the GKE GCS-CSI-mounted bucket. The bucket is mounted at the
# "/data" path in the Jupyter pod.
!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force
!mkdir -p /data/netflix-shows
!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows

# %% Install the Cloud SQL for PostgreSQL LangChain integration
!pip install langchain-google-cloud-sql-pg

# %% Connect to Cloud SQL, create the vector table, and ingest the dataset
import os
import uuid

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings

from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore
from google.cloud.sql.connector import IPTypes

# Initialize parameters. The instance connection name has the form
# "<project>:<region>:<instance>", so it can be split into its components.
INSTANCE_CONNECTION_NAME = os.environ.get("CLOUDSQL_INSTANCE_CONNECTION_NAME", "")
print(f"Your instance connection name is: {INSTANCE_CONNECTION_NAME}")
cloud_variables = INSTANCE_CONNECTION_NAME.split(":")

GCP_PROJECT_ID = os.environ.get("GCP_PROJECT_ID", cloud_variables[0])
GCP_CLOUD_SQL_REGION = os.environ.get("CLOUDSQL_INSTANCE_REGION", cloud_variables[1])
GCP_CLOUD_SQL_INSTANCE = os.environ.get("CLOUDSQL_INSTANCE", cloud_variables[2])

DB_NAME = os.environ.get("DB_NAME", "pgvector-database")
VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get("EMBEDDINGS_TABLE_NAME", "netflix_reviews_db")
CHAT_HISTORY_TABLE_NAME = os.environ.get("CHAT_HISTORY_TABLE_NAME", "message_store")

# Environment variables are strings, so cast the dimension to int.
VECTOR_DIMENSION = int(os.environ.get("VECTOR_DIMENSION", 384))
SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small'

SHARED_DATASET_BASE_PATH = "/data/netflix-shows"
REVIEWS_FILE_NAME = "netflix_titles.csv"

BATCH_SIZE = 100
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 10

# Read the database credentials from the mounted secret volume, falling back
# to environment variables when the secret is not mounted. Strip trailing
# newlines so they do not end up in the password.
try:
    with open("/etc/secret-volume/username", "r") as f:
        DB_USER = f.read().strip()
    with open("/etc/secret-volume/password", "r") as f:
        DB_PASS = f.read().strip()
except OSError:
    DB_USER = os.environ.get("DB_USERNAME", "postgres")
    DB_PASS = os.environ.get("DB_PASS", "postgres")

engine = PostgresEngine.from_instance(
    project_id=GCP_PROJECT_ID,
    region=GCP_CLOUD_SQL_REGION,
    instance=GCP_CLOUD_SQL_INSTANCE,
    database=DB_NAME,
    user=DB_USER,
    password=DB_PASS,
    ip_type=IPTypes.PRIVATE,
)

try:
    engine.init_vectorstore_table(
        VECTOR_EMBEDDINGS_TABLE_NAME,
        vector_size=VECTOR_DIMENSION,
        overwrite_existing=True,
    )
except Exception as err:
    print(f"Error: {err}")

embeddings_service = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)
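# Optional sanity check (an added sketch, not required for the ingest): the
# embedding width produced by the model should match VECTOR_DIMENSION, since
# the vector table above was created with that size. `embed_query` is part of
# LangChain's standard embeddings interface; the probe string is arbitrary.
probe = embeddings_service.embed_query("dimension probe")
assert len(probe) == VECTOR_DIMENSION, (
    f"model produces {len(probe)}-d vectors, table expects {VECTOR_DIMENSION}"
)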
vector_store = PostgresVectorStore.create_sync(
    engine=engine,
    embedding_service=embeddings_service,
    table_name=VECTOR_EMBEDDINGS_TABLE_NAME,
)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len
)

loader = CSVLoader(file_path=f"{SHARED_DATASET_BASE_PATH}/{REVIEWS_FILE_NAME}")
documents = loader.load()

documents = documents[:1000]  # Take a sample for test purposes.

splits = splitter.split_documents(documents)
ids = [str(uuid.uuid4()) for _ in range(len(splits))]
vector_store.add_documents(splits, ids=ids)
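If the deleted notebook is restored or run elsewhere, a quick way to confirm the ingest worked is a similarity search against the freshly populated table. This is a minimal sketch, assuming the final cell above has run and its `vector_store` handle is still live; the query string is arbitrary.

# %% Optional smoke test: query the populated vector store
# `similarity_search` is part of the standard LangChain VectorStore interface:
# it embeds the query with the same embedding service and returns the k
# nearest chunks from the Cloud SQL table.
results = vector_store.similarity_search("stand-up comedy specials", k=3)
for doc in results:
    print(doc.page_content[:200])
    print("---")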