-
Notifications
You must be signed in to change notification settings - Fork 177
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Creating a notebook for testing rag with a sample of the data
- Loading branch information
1 parent
e38a101
commit 799c8db
Showing
2 changed files
with
3 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"cells":[{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","os.environ['KAGGLE_USERNAME'] = \"<username>\"\n","os.environ['KAGGLE_KEY'] = \"<token>\"\n","\n","# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n","!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n","!mkdir /data/netflix-shows -p\n","!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["!pip install langchain-google-cloud-sql-pg"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","import uuid\n","\n","from langchain_community.document_loaders.csv_loader import CSVLoader\n","from langchain.text_splitter import RecursiveCharacterTextSplitter\n","from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings\n","\n","from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore\n","from google.cloud.sql.connector import IPTypes\n","\n","# initialize parameters\n","INSTANCE_CONNECTION_NAME = os.environ.get(\"CLOUDSQL_INSTANCE_CONNECTION_NAME\", \"\")\n","print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n","cloud_variables = INSTANCE_CONNECTION_NAME.split(\":\")\n","\n","GCP_PROJECT_ID = os.environ.get(\"GCP_PROJECT_ID\", cloud_variables[0])\n","GCP_CLOUD_SQL_REGION = os.environ.get(\"CLOUDSQL_INSTANCE_REGION\", cloud_variables[1])\n","GCP_CLOUD_SQL_INSTANCE = os.environ.get(\"CLOUDSQL_INSTANCE\", cloud_variables[2])\n","\n","DB_NAME = os.environ.get(\"INSTANCE_CONNECTION_NAME\", \"pgvector-database\")\n","VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get(\"EMBEDDINGS_TABLE_NAME\", \"netflix_reviews_db\")\n","CHAT_HISTORY_TABLE_NAME = os.environ.get(\"CHAT_HISTORY_TABLE_NAME\", \"message_store\")\n","\n","VECTOR_DIMENSION = os.environ.get(\"VECTOR_DIMENSION\", 384)\n","SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' \n","\n","SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n","REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n","\n","BATCH_SIZE = 100\n","CHUNK_SIZE = 1000\n","CHUNK_OVERLAP = 10\n","TABLE_NAME = 'netflix_reviews_db'\n","\n","try:\n"," db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n"," DB_USER = db_username_file.read()\n"," db_username_file.close()\n","\n"," db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n"," DB_PASS = db_password_file.read()\n"," db_password_file.close()\n","except:\n"," DB_USER = os.environ.get(\"DB_USERNAME\", \"postgres\")\n"," DB_PASS = os.environ.get(\"DB_PASS\", \"postgres\")\n","\n","engine = PostgresEngine.from_instance(\n"," project_id=GCP_PROJECT_ID,\n"," region=GCP_CLOUD_SQL_REGION,\n"," instance=GCP_CLOUD_SQL_INSTANCE,\n"," database=DB_NAME,\n"," user=DB_USER,\n"," password=DB_PASS,\n"," ip_type=IPTypes.PRIVATE,\n",")\n","\n","try:\n"," engine.init_vectorstore_table(\n"," VECTOR_EMBEDDINGS_TABLE_NAME,\n"," vector_size=VECTOR_DIMENSION,\n"," overwrite_existing=True,\n"," )\n","except Exception as err:\n"," print(f\"Error: {err}\")\n","\n","\n","embeddings_service = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)\n","vector_store = PostgresVectorStore.create_sync(\n"," engine=engine,\n"," embedding_service=embeddings_service,\n"," table_name=VECTOR_EMBEDDINGS_TABLE_NAME,\n",")\n","\n","splitter = RecursiveCharacterTextSplitter(\n"," chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len\n",")\n","\n","loader = CSVLoader(file_path=f\"{SHARED_DATASET_BASE_PATH}/{REVIEWS_FILE_NAME}\")\n","documents = loader.load()\n","\n","documents = documents[:1000] #Taking a sample for test purposes \n","\n","splits = splitter.split_documents(documents)\n","ids = [str(uuid.uuid4()) for i in range(len(splits))]\n","vector_store.add_documents(splits, ids)"]}],"metadata":{"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":2} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters