Creating a notebook for testing RAG with a sample of the data
german-grandas committed Sep 12, 2024
1 parent e38a101 commit 799c8db
Showing 2 changed files with 3 additions and 2 deletions.
1 change: 1 addition & 0 deletions applications/rag/example_notebooks/ingest_database.ipynb
@@ -0,0 +1 @@
{"cells":[{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","os.environ['KAGGLE_USERNAME'] = \"<username>\"\n","os.environ['KAGGLE_KEY'] = \"<token>\"\n","\n","# Download the zip file to local storage and then extract the desired contents directly to the GKE GCS CSI mounted bucket. The bucket is mounted at the \"/persist-data\" path in the jupyter pod.\n","!kaggle datasets download -d shivamb/netflix-shows -p ~/data --force\n","!mkdir /data/netflix-shows -p\n","!unzip -o ~/data/netflix-shows.zip -d /data/netflix-shows"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["!pip install langchain-google-cloud-sql-pg"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["import os\n","import uuid\n","\n","from langchain_community.document_loaders.csv_loader import CSVLoader\n","from langchain.text_splitter import RecursiveCharacterTextSplitter\n","from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings\n","\n","from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore\n","from google.cloud.sql.connector import IPTypes\n","\n","# initialize parameters\n","INSTANCE_CONNECTION_NAME = os.environ.get(\"CLOUDSQL_INSTANCE_CONNECTION_NAME\", \"\")\n","print(f\"Your instance connection name is: {INSTANCE_CONNECTION_NAME}\")\n","cloud_variables = INSTANCE_CONNECTION_NAME.split(\":\")\n","\n","GCP_PROJECT_ID = os.environ.get(\"GCP_PROJECT_ID\", cloud_variables[0])\n","GCP_CLOUD_SQL_REGION = os.environ.get(\"CLOUDSQL_INSTANCE_REGION\", cloud_variables[1])\n","GCP_CLOUD_SQL_INSTANCE = os.environ.get(\"CLOUDSQL_INSTANCE\", cloud_variables[2])\n","\n","DB_NAME = os.environ.get(\"INSTANCE_CONNECTION_NAME\", \"pgvector-database\")\n","VECTOR_EMBEDDINGS_TABLE_NAME = os.environ.get(\"EMBEDDINGS_TABLE_NAME\", \"netflix_reviews_db\")\n","CHAT_HISTORY_TABLE_NAME = os.environ.get(\"CHAT_HISTORY_TABLE_NAME\", \"message_store\")\n","\n","VECTOR_DIMENSION = os.environ.get(\"VECTOR_DIMENSION\", 384)\n","SENTENCE_TRANSFORMER_MODEL = 'intfloat/multilingual-e5-small' \n","\n","SHARED_DATASET_BASE_PATH=\"/data/netflix-shows/\"\n","REVIEWS_FILE_NAME=\"netflix_titles.csv\"\n","\n","BATCH_SIZE = 100\n","CHUNK_SIZE = 1000\n","CHUNK_OVERLAP = 10\n","TABLE_NAME = 'netflix_reviews_db'\n","\n","try:\n"," db_username_file = open(\"/etc/secret-volume/username\", \"r\")\n"," DB_USER = db_username_file.read()\n"," db_username_file.close()\n","\n"," db_password_file = open(\"/etc/secret-volume/password\", \"r\")\n"," DB_PASS = db_password_file.read()\n"," db_password_file.close()\n","except:\n"," DB_USER = os.environ.get(\"DB_USERNAME\", \"postgres\")\n"," DB_PASS = os.environ.get(\"DB_PASS\", \"postgres\")\n","\n","engine = PostgresEngine.from_instance(\n"," project_id=GCP_PROJECT_ID,\n"," region=GCP_CLOUD_SQL_REGION,\n"," instance=GCP_CLOUD_SQL_INSTANCE,\n"," database=DB_NAME,\n"," user=DB_USER,\n"," password=DB_PASS,\n"," ip_type=IPTypes.PRIVATE,\n",")\n","\n","try:\n"," engine.init_vectorstore_table(\n"," VECTOR_EMBEDDINGS_TABLE_NAME,\n"," vector_size=VECTOR_DIMENSION,\n"," overwrite_existing=True,\n"," )\n","except Exception as err:\n"," print(f\"Error: {err}\")\n","\n","\n","embeddings_service = HuggingFaceEmbeddings(model_name=SENTENCE_TRANSFORMER_MODEL)\n","vector_store = PostgresVectorStore.create_sync(\n"," engine=engine,\n"," embedding_service=embeddings_service,\n"," table_name=VECTOR_EMBEDDINGS_TABLE_NAME,\n",")\n","\n","splitter = RecursiveCharacterTextSplitter(\n"," 
chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, length_function=len\n",")\n","\n","loader = CSVLoader(file_path=f\"{SHARED_DATASET_BASE_PATH}/{REVIEWS_FILE_NAME}\")\n","documents = loader.load()\n","\n","documents = documents[:1000] #Taking a sample for test purposes \n","\n","splits = splitter.split_documents(documents)\n","ids = [str(uuid.uuid4()) for i in range(len(splits))]\n","vector_store.add_documents(splits, ids)"]}],"metadata":{"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":2}
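
Once the ingestion cell has run, the retrieval half of the RAG path can be sanity-checked against the same table. The cell below is a minimal sketch, not part of this commit: it reuses the engine, embeddings_service, and table name defined above, and the query string is only an example.

# --- Example (not in the commit): query the populated vector store ---
retrieval_store = PostgresVectorStore.create_sync(
    engine=engine,
    embedding_service=embeddings_service,
    table_name=VECTOR_EMBEDDINGS_TABLE_NAME,
)
# similarity_search embeds the query with the same model and returns the
# closest chunks from the table.
results = retrieval_store.similarity_search("documentaries about nature", k=3)
for doc in results:
    print(doc.page_content[:200])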
4 changes: 2 additions & 2 deletions modules/jupyter/jupyter_image/notebook_image/cloudbuild.yaml
@@ -17,6 +17,6 @@ steps:
 - name: 'gcr.io/cloud-builders/docker'
   args: [ 'pull', 'docker.io/jupyter/tensorflow-notebook:python-3.10' ]
 - name: 'gcr.io/cloud-builders/docker'
-  args: [ 'build', '-t', '<Artiact registry repo>/<image name>', '.' ]
+  args: [ 'build', '-t', 'us-docker.pkg.dev/globant-gke-ai-resources/gke-ai-text-to-text/gke-jupyterhub-image', '.' ]
 images:
-- '<Artiact registry repo>/<image name>'
+- 'us-docker.pkg.dev/globant-gke-ai-resources/gke-ai-text-to-text/gke-jupyterhub-image'
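
As context for the change above, a build using this config would typically be submitted from the notebook_image directory with the gcloud CLI. This is a usage sketch, not part of the commit; it runs against whatever project gcloud is currently configured for:

!gcloud builds submit --config cloudbuild.yaml .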
