From 8fb7686ace1688a4987d6a7675942d2bfa9e2873 Mon Sep 17 00:00:00 2001 From: abhiram1809 <53875874+abhiram1809@users.noreply.github.com> Date: Sat, 9 Mar 2024 10:56:52 +0530 Subject: [PATCH] Integration in Google Sheets Reader to load the Sheets into Pandas Dataframes (#11640) * Updated base.py Added reading Data as Pandas Dataframe Functionality. * Update base.py Reading Fix * Update base.py Co-authored-by: Jayant * Update base.py Added Co-Author Co-authored-by: Jayant * Basic Clean Up * Revert "Basic Clean Up" This reverts commit cafdfb681819ce3508c91d5a50e3f18609a6722a. * Revert "Add mixedbread reranker cookbook (#11536)" This reverts commit a88bc070313af3bd7ca4e7b278858fd2a83c4bb4. * Cleans up and added Documentation for Google Readers * Undo Reverts * Grammar Fix --------- Co-authored-by: Jayant Verma <98758438+jayantverma2809@users.noreply.github.com> Co-authored-by: Jayant --- .../data_connectors/GoogleSheetsDemo.ipynb | 684 ++++++++++++++++++ .../llama-index-readers-google/README.md | 38 +- .../llama_index/readers/google/sheets/base.py | 53 ++ .../llama-index-readers-google/pyproject.toml | 2 +- 4 files changed, 775 insertions(+), 2 deletions(-) create mode 100644 docs/examples/data_connectors/GoogleSheetsDemo.ipynb diff --git a/docs/examples/data_connectors/GoogleSheetsDemo.ipynb b/docs/examples/data_connectors/GoogleSheetsDemo.ipynb new file mode 100644 index 00000000000000..89358490b8c98d --- /dev/null +++ b/docs/examples/data_connectors/GoogleSheetsDemo.ipynb @@ -0,0 +1,684 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **Google Sheets Reader**\n", + "Demonstrates Google Sheets Reader in LlamaIndex\n", + "\n", + "\n", + "* Make Sure you have token.json or credentials.json file in the Environment, More on that [here](https://developers.google.com/workspace/guides/create-credentials)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from 
llama_index.readers.google import GoogleSheetsReader" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load Sheets as a List of Pandas DataFrames" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "list_of_sheets = [\"1ZF5iIeLLqROHbHsb1vOeRaLWKIgLU7rDDTSOZaqjpk0\"]\n", + "sheets = GoogleSheetsReader()\n", + "dataframes = sheets.load_data_in_pandas(list_of_sheets)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "dataframe" + }, + "text/html": [ + "\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
spotify_idnameartistsdaily_rankdaily_movementweekly_movementcountrysnapshot_datepopularityis_explicit...keyloudnessmodespeechinessacousticnessinstrumentalnesslivenessvalencetempotime_signature
02HafqoJbgXdtjwCOvNEF14Si No Estásiñigo quintero1032023-10-2797FALSE...5-8.7210.02850.82700.1380.52498.2244
17x9aauaA9cu6tyfpHnqDLoSeven (feat. Latto) (Explicit Ver.)Jung Kook, Latto2402023-10-2797TRUE...11-4.10710.04340.31100.08150.89124.9974
23rUGC1vUpkDG9CZFHMur1tgreedyTate McRae3-122023-10-2799TRUE...6-3.1800.03190.25600.1140.844111.0181
34MjDJD8cW7iVeWInc2BdyjMONACOBad Bunny4-1-32023-10-2796TRUE...4-5.00900.0680.150.0004020.580.13139.0564
47iQXYTyuG13aoeHxGG28NhPERRO NEGROBad Bunny, Feid5012023-10-2794TRUE...5-2.24810.2620.08872.16E-050.1790.34596.0574
..................................................................
363950AYt6NMyyLd0rLuvr0UkMHSlime You Out (feat. SZA)Drake, SZA4640AE2023-10-1884TRUE...5-9.24300.05020.50800.2590.10588.883
363962Gk6fi0dqt91NKvlzGsmm7SAY MY GRACE (feat. Travis Scott)Offset, Travis Scott4730AE2023-10-1880TRUE...10-5.0610.04520.058500.1320.476121.8794
3639726b3oVLrRUaaybJulow9kzPeopleLibianca4820AE2023-10-1888FALSE...10-7.62100.06780.5511.31E-050.1020.693124.3575
363985ydjxBSUIDn26MFzU3asP4Rainy DaysV4910AE2023-10-1888FALSE...9-8.01600.08750.73900.1480.28274.8284
3639959NraMJsLaMCVtwXTSia8iPradacassö, RAYE, D-Block Europe5000AE2023-10-1894TRUE...8-5.80410.03750.0011.79E-060.1130.422141.9044
\n", + "

36400 rows × 25 columns

\n", + "
\n", + "
\n", + "\n", + "
\n", + " \n", + "\n", + " \n", + "\n", + " \n", + "
\n", + "\n", + "\n", + "
\n", + " \n", + "\n", + "\n", + "\n", + " \n", + "
\n", + "\n", + "
\n", + "
\n" + ], + "text/plain": [ + " spotify_id name \\\n", + "0 2HafqoJbgXdtjwCOvNEF14 Si No Estás \n", + "1 7x9aauaA9cu6tyfpHnqDLo Seven (feat. Latto) (Explicit Ver.) \n", + "2 3rUGC1vUpkDG9CZFHMur1t greedy \n", + "3 4MjDJD8cW7iVeWInc2Bdyj MONACO \n", + "4 7iQXYTyuG13aoeHxGG28Nh PERRO NEGRO \n", + "... ... ... \n", + "36395 0AYt6NMyyLd0rLuvr0UkMH Slime You Out (feat. SZA) \n", + "36396 2Gk6fi0dqt91NKvlzGsmm7 SAY MY GRACE (feat. Travis Scott) \n", + "36397 26b3oVLrRUaaybJulow9kz People \n", + "36398 5ydjxBSUIDn26MFzU3asP4 Rainy Days \n", + "36399 59NraMJsLaMCVtwXTSia8i Prada \n", + "\n", + " artists daily_rank daily_movement weekly_movement \\\n", + "0 iñigo quintero 1 0 3 \n", + "1 Jung Kook, Latto 2 4 0 \n", + "2 Tate McRae 3 -1 2 \n", + "3 Bad Bunny 4 -1 -3 \n", + "4 Bad Bunny, Feid 5 0 1 \n", + "... ... ... ... ... \n", + "36395 Drake, SZA 46 4 0 \n", + "36396 Offset, Travis Scott 47 3 0 \n", + "36397 Libianca 48 2 0 \n", + "36398 V 49 1 0 \n", + "36399 cassö, RAYE, D-Block Europe 50 0 0 \n", + "\n", + " country snapshot_date popularity is_explicit ... key loudness mode \\\n", + "0 2023-10-27 97 FALSE ... 5 -8.72 1 \n", + "1 2023-10-27 97 TRUE ... 11 -4.107 1 \n", + "2 2023-10-27 99 TRUE ... 6 -3.18 0 \n", + "3 2023-10-27 96 TRUE ... 4 -5.009 0 \n", + "4 2023-10-27 94 TRUE ... 5 -2.248 1 \n", + "... ... ... ... ... ... .. ... ... \n", + "36395 AE 2023-10-18 84 TRUE ... 5 -9.243 0 \n", + "36396 AE 2023-10-18 80 TRUE ... 10 -5.06 1 \n", + "36397 AE 2023-10-18 88 FALSE ... 10 -7.621 0 \n", + "36398 AE 2023-10-18 88 FALSE ... 9 -8.016 0 \n", + "36399 AE 2023-10-18 94 TRUE ... 8 -5.804 1 \n", + "\n", + " speechiness acousticness instrumentalness liveness valence tempo \\\n", + "0 0.0285 0.827 0 0.138 0.524 98.224 \n", + "1 0.0434 0.311 0 0.0815 0.89 124.997 \n", + "2 0.0319 0.256 0 0.114 0.844 111.018 \n", + "3 0.068 0.15 0.000402 0.58 0.13 139.056 \n", + "4 0.262 0.0887 2.16E-05 0.179 0.345 96.057 \n", + "... ... ... ... ... ... ... 
\n", + "36395 0.0502 0.508 0 0.259 0.105 88.88 \n", + "36396 0.0452 0.0585 0 0.132 0.476 121.879 \n", + "36397 0.0678 0.551 1.31E-05 0.102 0.693 124.357 \n", + "36398 0.0875 0.739 0 0.148 0.282 74.828 \n", + "36399 0.0375 0.001 1.79E-06 0.113 0.422 141.904 \n", + "\n", + " time_signature \n", + "0 4 \n", + "1 4 \n", + "2 1 \n", + "3 4 \n", + "4 4 \n", + "... ... \n", + "36395 3 \n", + "36396 4 \n", + "36397 5 \n", + "36398 4 \n", + "36399 4 \n", + "\n", + "[36400 rows x 25 columns]" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dataframes[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or Load Sheets as a List of Document Objects" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "documents = sheets.load_data(list_of_sheets)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/llama-index-integrations/readers/llama-index-readers-google/README.md b/llama-index-integrations/readers/llama-index-readers-google/README.md index 1e86347f51ba7b..bc2095634da9df 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/README.md +++ b/llama-index-integrations/readers/llama-index-readers-google/README.md @@ -1 +1,37 @@ -# LlamaIndex Readers Integration: Google +# LlamaIndex Integration: Google Readers + +Effortlessly incorporate Google-based data loaders into your Python workflow using LlamaIndex. Unlock the potential of various readers to enhance your data loading capabilities. 
Below are examples of integrating Google Docs and Google Sheets readers: + +### Google Docs Reader + +```python +from llama_index.readers.google import GoogleDocsReader + +# Specify the document IDs you want to load +document_ids = [""] + +# Load data from Google Docs +documents = GoogleDocsReader().load_data(document_ids=document_ids) +``` + +### Google Sheets Reader (Documents and Dataframes) + +```python +from llama_index.readers.google import GoogleSheetsReader + +# Specify the list of sheet IDs you want to load +list_of_sheets = ["spreadsheet_id"] + +# Create a Google Sheets Reader instance +sheets_reader = GoogleSheetsReader() + +# Load the data as the data class of your choice (Documents or DataFrames) +documents = sheets_reader.load_data(list_of_sheets) +dataframes = sheets_reader.load_data_in_pandas(list_of_sheets) +``` + +Integrate these readers seamlessly to efficiently manage and process your data within your Python environment, providing a robust foundation for your data-driven workflows with LlamaIndex. 
+ +### Note + +Make sure you have a "token.json" or a "credentials.json" file in your environment to authenticate with Google Cloud Platform. diff --git a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/sheets/base.py b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/sheets/base.py index f95d877155805f..abf3a25a2f2d1e 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/sheets/base.py +++ b/llama-index-integrations/readers/llama-index-readers-google/llama_index/readers/google/sheets/base.py @@ -2,6 +2,7 @@ import logging import os +import pandas as pd from typing import Any, List import googleapiclient.discovery as discovery @@ -79,6 +80,21 @@ def load_data(self, spreadsheet_ids: List[str]) -> List[Document]: ) return results + def load_data_in_pandas(self, spreadsheet_ids: List[str]) -> List[pd.DataFrame]: + """Load every sheet of the given spreadsheets as pandas DataFrames. + + Args: + spreadsheet_ids (List[str]): a list of spreadsheet ids. + """ + if spreadsheet_ids is None: + raise ValueError('Must specify a "spreadsheet_ids" in `load_kwargs`.') + + results = [] + for spreadsheet_id in spreadsheet_ids: + dataframes = self._load_sheet_in_pandas(spreadsheet_id) + results.extend(dataframes) + return results + def _load_sheet(self, spreadsheet_id: str) -> str: """Load a sheet from Google Sheets. @@ -115,6 +131,43 @@ def _load_sheet(self, spreadsheet_id: str) -> str: ) return sheet_text + def _load_sheet_in_pandas(self, spreadsheet_id: str) -> List[pd.DataFrame]: + """Load all sheets of a Google Sheets spreadsheet as DataFrames. + + Args: + spreadsheet_id: the id of the spreadsheet whose sheets + are loaded. + + Returns: + A list of DataFrames, one per sheet that contains data. 
+ """ + credentials = self._get_credentials() + sheets_service = discovery.build("sheets", "v4", credentials=credentials) + sheet = sheets_service.spreadsheets() + spreadsheet_data = sheet.get(spreadsheetId=spreadsheet_id).execute() + sheets = spreadsheet_data.get("sheets") + dataframes = [] + for sheet in sheets: + properties = sheet.get("properties") + title = properties.get("title") + grid_props = properties.get("gridProperties") + rows = grid_props.get("rowCount") + cols = grid_props.get("columnCount") + range_pattern = f"{title}!R1C1:R{rows}C{cols}" + response = ( + sheets_service.spreadsheets() + .values() + .get(spreadsheetId=spreadsheet_id, range=range_pattern) + .execute() + ) + values = response.get("values", []) + if not values: + print(f"No data found in {title}") + else: + df = pd.DataFrame(values[1:], columns=values[0]) + dataframes.append(df) + return dataframes + def _get_credentials(self) -> Any: """Get valid user credentials from storage. diff --git a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml index 79a58c41596465..c962ed793746bd 100644 --- a/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-google/pyproject.toml @@ -34,7 +34,7 @@ license = "MIT" maintainers = ["bbornsztein", "jerryjliu", "ong", "piroz", "pycui", "ravi03071991"] name = "llama-index-readers-google" readme = "README.md" -version = "0.1.4" +version = "0.1.5" [tool.poetry.dependencies] python = ">=3.10,<4.0"