From da3a6994d2dd0ef242b19c53d4d56b3a00b24a06 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 14:05:29 +0100 Subject: [PATCH 01/14] added functionality to load all documents, added functionality to load all pages --- llama_hub/microsoft_sharepoint/README.md | 41 +++++++ llama_hub/microsoft_sharepoint/base.py | 130 ++++++++++++++++++++--- 2 files changed, 156 insertions(+), 15 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 55bbbc2bd2..b34a45733d 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -20,12 +20,18 @@ More info on Microsoft Graph APIs - [Refer here](https://learn.microsoft.com/en- To use this loader `client_id`, `client_secret` and `tenant_id` of the registered app in Microsoft Azure Portal is required. +This loader can: +- Load files present in a specific folder in SharePoint +- Load all files present in the drive of a SharePoint +- Load all pages under a SharePoint site + This loader loads the files present in a specific folder in sharepoint. If the files are present in the `Test` folder in SharePoint Site under `root` directory, then the input for the loader for `file_path` is `Test` ![FilePath](file_path_info.png) +### Example loading a single folder ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -43,5 +49,40 @@ documents = loader.load_data( ) ``` +### Example loading all files, no pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, +) +``` + +### Example loading all files and pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include_pages = True +) +``` + The loader doesn't access other components of the `SharePoint Site`. diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 9b42ea668a..a240e3af86 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -43,6 +43,13 @@ def __init__( self.tenant_id = tenant_id self._authorization_headers = None + def _setup_site_config(self, sharepoint_site_name: str): + access_token = self._get_access_token() + + self._site_id_with_host_name = self._get_site_id_with_host_name( + access_token, sharepoint_site_name + ) + def _get_access_token(self) -> str: """ Gets the access_token for accessing file from SharePoint. @@ -154,9 +161,14 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. 
""" - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" - ) + if folder_path == 'root': + folder_id_endpoint = ( + f"{self._drive_id_endpoint}/{self._drive_id}/root" + ) + else: + folder_id_endpoint = ( + f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" + ) response = requests.get( url=folder_id_endpoint, @@ -218,6 +230,83 @@ def _download_files_and_extract_metadata( else: logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) + + def _download_pages_and_extract_metadata( + self, + download_dir, + ): + pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" + + data = self._get_results_with_odatanext(pages_endpoint) + metadata = {} + for item in data: + file_metadata = self._extract_page(item, download_dir) + if file_metadata: + metadata.update(file_metadata) + return metadata + + def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" + file_name = item['name'].replace('.aspx', '.html') + + response = requests.get(url=page_endpoint, headers=self._authorization_headers) + metadata = {} + + if response.status_code == 200: + + html_content = "\n".join( + [ + i["innerHtml"] + for i in response.json()['value'] + if i["@odata.type"] == "#microsoft.graph.textWebPart" + ] + ) + if html_content == "": + return None + + # Create the directory if it does not exist and save the file. + if not os.path.exists(download_dir): + os.makedirs(download_dir) + file_path = os.path.join(download_dir, file_name) + with open(file_path, "w") as f: + f.write(html_content) + metadata[file_path] = self._extract_metadata_for_file(item) + return metadata + else: + logger.error(response.json()["error"]) + raise ValueError(response.json()["error"]) + + def _get_results_with_odatanext(self, request: str, **kwargs): + """ + Given a request, checks if the result contains `@odata.nextLink` in the result. + If true, this function returns itself calling the @odata.nextLink. + If false, this function returns a list of all retrieved values. + + Args: + request (str): A GET request to be made, that might include a field '@odata.nextLink' + + Returns: + Dict[str, str]: A dictionary containing the metadata of the pages to be extracted + """ + if "prev_responses" not in kwargs.keys(): + prev_responses = [] + else: + prev_responses = kwargs["prev_responses"] + response = requests.get(url=request, headers=self._authorization_headers) + if response.status_code == 200: + result: Dict[str, Any] = response.json() + prev_responses += result["value"] + if "@odata.nextLink" in result.keys(): + return self._get_results_with_odatanext( + request=result["@odata.nextLink"], + prev_responses=prev_responses, + ) + else: + return prev_responses + else: + logger.error(response.json()["error"]) + raise ValueError(response.json()["error"]) + def _download_file_by_url(self, item: Dict[str, Any], download_dir: str) -> str: """ @@ -278,10 +367,11 @@ def _download_file( metadata[file_path] = self._extract_metadata_for_file(item) return metadata + + def _download_files_from_sharepoint( self, download_dir: str, - sharepoint_site_name: str, sharepoint_folder_path: str, recursive: bool, ) -> Dict[str, str]: @@ -298,11 +388,7 @@ def _download_files_from_sharepoint( Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
""" - access_token = self._get_access_token() - self._site_id_with_host_name = self._get_site_id_with_host_name( - access_token, sharepoint_site_name - ) self._drive_id = self._get_drive_id() @@ -351,8 +437,10 @@ def get_metadata(filename: str) -> Any: def load_data( self, sharepoint_site_name: str, - sharepoint_folder_path: str, + sharepoint_folder_path: str = "root", recursive: bool = False, + include_documents: bool = True, + include_pages: bool = False ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -360,7 +448,12 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. + If `root`, loads data from the root folder of the + SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. + include_documents (bool): If True, loads documents for the given + sharepoint_site_name and sharepoint_folder_path. + include_pages (bool): If True, loads SharePoint pages for the given site_name. Returns: List[Document]: A list containing the documents with metadata. @@ -370,13 +463,20 @@ def load_data( """ try: with tempfile.TemporaryDirectory() as temp_dir: - files_metadata = self._download_files_from_sharepoint( - temp_dir, sharepoint_site_name, sharepoint_folder_path, recursive - ) + self._setup_site_config(sharepoint_site_name) + files_metadata = {} + if include_documents: + files_metadata.update(self._download_files_from_sharepoint( + temp_dir, sharepoint_folder_path, recursive + )) + if include_pages: + files_metadata.update(self._download_pages_and_extract_metadata( + temp_dir + )) # return self.files_metadata return self._load_documents_with_metadata( - files_metadata, temp_dir, recursive - ) + files_metadata, temp_dir, recursive) + except Exception as exp: - logger.error("An error occurred while accessing SharePoint: %s", exp) + logger.error("An error occurred while accessing SharePoint: ", exc_info=True) From 7211e27dafd79b579e9a7fc0ad490a8b3a6fb141 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 15:27:34 +0100 Subject: [PATCH 02/14] updated with new 'include' argument and 'file_types' argument to filter file_types when loading documents. 
--- llama_hub/microsoft_sharepoint/README.md | 44 +++++++++++++++++++++-- llama_hub/microsoft_sharepoint/base.py | 45 ++++++++++++++++-------- 2 files changed, 71 insertions(+), 18 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index b34a45733d..64eec3fdb7 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -49,7 +49,7 @@ documents = loader.load_data( ) ``` -### Example loading all files, no pages +### Example loading all files and pages ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -63,10 +63,47 @@ loader = SharePointLoader( documents = loader.load_data( sharepoint_site_name: "", recursive = True, + include= ['pages', 'documents'] ) ``` -### Example loading all files and pages +### Example loading just pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include = ['pages'] +) +``` + +### Example loading just documents +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include = ['documents'] +) +``` + +### Example loading just documents with filetype .docx or .pdf ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -80,7 +117,8 @@ loader = SharePointLoader( documents = loader.load_data( sharepoint_site_name: "", recursive = True, - include_pages = True + include = ['documents'], + file_types = ['docx', 'pdf'] ) ``` diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a240e3af86..ffd9c51842 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -184,7 +184,8 @@ def _download_files_and_extract_metadata( self, folder_id: str, download_dir: str, - include_subfolders: bool = False, + include_subfolders: bool, + file_types: List[str] ) -> Dict[str, str]: """ Downloads files from the specified folder ID and extracts metadata. @@ -193,6 +194,7 @@ def _download_files_and_extract_metadata( folder_id (str): The ID of the folder from which the files should be downloaded. download_dir (str): The directory where the files should be downloaded. include_subfolders (bool): If True, files from all subfolders are downloaded. + file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
@@ -219,13 +221,16 @@ def _download_files_and_extract_metadata( folder_id=item["id"], download_dir=sub_folder_download_dir, include_subfolders=include_subfolders, + file_types=file_types ) metadata.update(subfolder_metadata) elif "file" in item: - file_metadata = self._download_file(item, download_dir) - metadata.update(file_metadata) + file_type = item['name'].split('.')[-1] + if not file_types or (file_type in file_types): + file_metadata = self._download_file(item, download_dir) + metadata.update(file_metadata) return metadata else: logger.error(response.json()["error"]) @@ -374,6 +379,7 @@ def _download_files_from_sharepoint( download_dir: str, sharepoint_folder_path: str, recursive: bool, + file_types: List[str] ) -> Dict[str, str]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -383,6 +389,7 @@ def _download_files_from_sharepoint( sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. + file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: Dict[str, str]: A dictionary containing the metadata of the downloaded files. @@ -397,7 +404,10 @@ def _download_files_from_sharepoint( ) metadata = self._download_files_and_extract_metadata( - self.sharepoint_folder_id, download_dir, recursive + folder_id=self.sharepoint_folder_id, + download_dir=download_dir, + include_subfolders=recursive, + file_types=file_types ) return metadata @@ -439,8 +449,8 @@ def load_data( sharepoint_site_name: str, sharepoint_folder_path: str = "root", recursive: bool = False, - include_documents: bool = True, - include_pages: bool = False + include: List[str] = ['documents', 'pages'], + file_types: List[str] = [] ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -451,9 +461,10 @@ def load_data( If `root`, loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. - include_documents (bool): If True, loads documents for the given - sharepoint_site_name and sharepoint_folder_path. - include_pages (bool): If True, loads SharePoint pages for the given site_name. + include (List[str]): list of Sharepoint objects to include. + Must contain at least 'pages' or 'documents' or both. + file_types (List[str]): list of file extensions to include when downloading from + the Sharepoint Drive. Leave empty to download all filetypes. Returns: List[Document]: A list containing the documents with metadata. @@ -461,22 +472,26 @@ def load_data( Raises: Exception: If an error occurs while accessing SharePoint site. """ + if not include: + raise ValueError("'include' should not be an empty list, and include either 'documents' and/or 'pages'") + if any([i not in ['documents', 'pages'] for i in include]): + raise ValueError("'include' contains an unexpected value. 
" + + f"Valid values are ['documents', 'pages'], but got {include}") + if 'documents' not in include and (recursive or file_types): + logger.warning("'documents' is not in 'included', so 'recursive' and 'file_types' have no effect.") try: with tempfile.TemporaryDirectory() as temp_dir: self._setup_site_config(sharepoint_site_name) files_metadata = {} - if include_documents: + if 'documents' in include: files_metadata.update(self._download_files_from_sharepoint( - temp_dir, sharepoint_folder_path, recursive + temp_dir, sharepoint_folder_path, recursive, file_types )) - if include_pages: + if 'pages' in include: files_metadata.update(self._download_pages_and_extract_metadata( temp_dir )) - # return self.files_metadata return self._load_documents_with_metadata( files_metadata, temp_dir, recursive) - - except Exception as exp: logger.error("An error occurred while accessing SharePoint: ", exc_info=True) From ba27fab4189095af21edb0e48a7d2d45e82c0434 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 15:56:42 +0100 Subject: [PATCH 03/14] Added documentation --- llama_hub/microsoft_sharepoint/base.py | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index ffd9c51842..be8e7aa008 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -240,6 +240,18 @@ def _download_pages_and_extract_metadata( self, download_dir, ): + """ + Downloads Sharepoint pages as HTML files and extracts metadata. + + Args: + download_dir (str): The directory where the files should be downloaded. + + Returns: + Dict[str, str]: A dictionary containing the metadata of the downloaded Sharepoint pages. + + Raises: + ValueError: If there is an error in downloading the files. + """ pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" data = self._get_results_with_odatanext(pages_endpoint) @@ -251,6 +263,18 @@ def _download_pages_and_extract_metadata( return metadata def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + """ + Retrieves the HTML content of the SharePoint page referenced by the 'item' argument + from the Microsoft Graph. Stores the content as an .html file in the download_dir. + + Args: + item (Dict[str, Any]): a sharepoint item that contains + the fields 'id', 'name' and 'webUrl' + download_dir (str): A directory to download the file to. + + Returns: + The metadata of the item + """ page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" file_name = item['name'].replace('.aspx', '.html') @@ -365,6 +389,18 @@ def _download_file( item: Dict[str, Any], download_dir: str, ): + """ + Downloads a file to the temporary download folder and returns + its metadata. + + Args: + item (Dict[str, Any]): a sharepoint item that contains + the fields 'id', 'name' and 'webUrl' + download_dir (str): A directory to download the file to. 
+ + Returns: + The metadata of the item + """ metadata = {} file_path = self._download_file_by_url(item, download_dir) From ea1dfb67e5186dd6bb349cb0948e0a7b877b69e2 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 16:01:05 +0100 Subject: [PATCH 04/14] running format and lint --- llama_hub/microsoft_sharepoint/base.py | 101 +++++++++++++------------ 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index be8e7aa008..d59cc478cb 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -161,10 +161,8 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. """ - if folder_path == 'root': - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root" - ) + if folder_path == "root": + folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" else: folder_id_endpoint = ( f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" @@ -185,7 +183,7 @@ def _download_files_and_extract_metadata( folder_id: str, download_dir: str, include_subfolders: bool, - file_types: List[str] + file_types: List[str], ) -> Dict[str, str]: """ Downloads files from the specified folder ID and extracts metadata. @@ -221,13 +219,13 @@ def _download_files_and_extract_metadata( folder_id=item["id"], download_dir=sub_folder_download_dir, include_subfolders=include_subfolders, - file_types=file_types + file_types=file_types, ) metadata.update(subfolder_metadata) elif "file" in item: - file_type = item['name'].split('.')[-1] + file_type = item["name"].split(".")[-1] if not file_types or (file_type in file_types): file_metadata = self._download_file(item, download_dir) metadata.update(file_metadata) @@ -235,11 +233,11 @@ def _download_files_and_extract_metadata( else: logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) - + def _download_pages_and_extract_metadata( - self, - download_dir, - ): + self, + download_dir, + ): """ Downloads Sharepoint pages as HTML files and extracts metadata. @@ -262,13 +260,13 @@ def _download_pages_and_extract_metadata( metadata.update(file_metadata) return metadata - def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: """ - Retrieves the HTML content of the SharePoint page referenced by the 'item' argument + Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph. Stores the content as an .html file in the download_dir. Args: - item (Dict[str, Any]): a sharepoint item that contains + item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' download_dir (str): A directory to download the file to. 
@@ -276,7 +274,7 @@ def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: The metadata of the item """ page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" - file_name = item['name'].replace('.aspx', '.html') + file_name = item["name"].replace(".aspx", ".html") response = requests.get(url=page_endpoint, headers=self._authorization_headers) metadata = {} @@ -286,7 +284,7 @@ def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: html_content = "\n".join( [ i["innerHtml"] - for i in response.json()['value'] + for i in response.json()["value"] if i["@odata.type"] == "#microsoft.graph.textWebPart" ] ) @@ -316,7 +314,7 @@ def _get_results_with_odatanext(self, request: str, **kwargs): Returns: Dict[str, str]: A dictionary containing the metadata of the pages to be extracted - """ + """ if "prev_responses" not in kwargs.keys(): prev_responses = [] else: @@ -336,7 +334,6 @@ def _get_results_with_odatanext(self, request: str, **kwargs): logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) - def _download_file_by_url(self, item: Dict[str, Any], download_dir: str) -> str: """ Downloads the file from the provided URL. @@ -391,10 +388,10 @@ def _download_file( ): """ Downloads a file to the temporary download folder and returns - its metadata. + its metadata. Args: - item (Dict[str, Any]): a sharepoint item that contains + item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' download_dir (str): A directory to download the file to. @@ -408,14 +405,12 @@ def _download_file( metadata[file_path] = self._extract_metadata_for_file(item) return metadata - - def _download_files_from_sharepoint( self, download_dir: str, sharepoint_folder_path: str, recursive: bool, - file_types: List[str] + file_types: List[str], ) -> Dict[str, str]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -432,7 +427,6 @@ def _download_files_from_sharepoint( """ - self._drive_id = self._get_drive_id() self.sharepoint_folder_id = self._get_sharepoint_folder_id( @@ -440,10 +434,10 @@ def _download_files_from_sharepoint( ) metadata = self._download_files_and_extract_metadata( - folder_id=self.sharepoint_folder_id, - download_dir=download_dir, - include_subfolders=recursive, - file_types=file_types + folder_id=self.sharepoint_folder_id, + download_dir=download_dir, + include_subfolders=recursive, + file_types=file_types, ) return metadata @@ -485,8 +479,8 @@ def load_data( sharepoint_site_name: str, sharepoint_folder_path: str = "root", recursive: bool = False, - include: List[str] = ['documents', 'pages'], - file_types: List[str] = [] + include: List[str] = ["documents", "pages"], + file_types: List[str] = [], ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -494,10 +488,10 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. - If `root`, loads data from the root folder of the + If `root`, loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. - include (List[str]): list of Sharepoint objects to include. + include (List[str]): list of Sharepoint objects to include. Must contain at least 'pages' or 'documents' or both. 
file_types (List[str]): list of file extensions to include when downloading from the Sharepoint Drive. Leave empty to download all filetypes. @@ -509,25 +503,36 @@ def load_data( Exception: If an error occurs while accessing SharePoint site. """ if not include: - raise ValueError("'include' should not be an empty list, and include either 'documents' and/or 'pages'") - if any([i not in ['documents', 'pages'] for i in include]): - raise ValueError("'include' contains an unexpected value. " + - f"Valid values are ['documents', 'pages'], but got {include}") - if 'documents' not in include and (recursive or file_types): - logger.warning("'documents' is not in 'included', so 'recursive' and 'file_types' have no effect.") + raise ValueError( + "'include' should not be an empty list, and include either 'documents' and/or 'pages'" + ) + if any([i not in ["documents", "pages"] for i in include]): + raise ValueError( + "'include' contains an unexpected value. " + + f"Valid values are ['documents', 'pages'], but got {include}" + ) + if "documents" not in include and (recursive or file_types): + logger.warning( + "'documents' is not in 'included', so 'recursive' and 'file_types' have no effect." + ) try: with tempfile.TemporaryDirectory() as temp_dir: self._setup_site_config(sharepoint_site_name) files_metadata = {} - if 'documents' in include: - files_metadata.update(self._download_files_from_sharepoint( - temp_dir, sharepoint_folder_path, recursive, file_types - )) - if 'pages' in include: - files_metadata.update(self._download_pages_and_extract_metadata( - temp_dir - )) + if "documents" in include: + files_metadata.update( + self._download_files_from_sharepoint( + temp_dir, sharepoint_folder_path, recursive, file_types + ) + ) + if "pages" in include: + files_metadata.update( + self._download_pages_and_extract_metadata(temp_dir) + ) return self._load_documents_with_metadata( - files_metadata, temp_dir, recursive) + files_metadata, temp_dir, recursive + ) except Exception as exp: - logger.error("An error occurred while accessing SharePoint: ", exc_info=True) + logger.error( + "An error occurred while accessing SharePoint: %s", exp, exc_info=True + ) From caaf94fe9db91f07fb929639afd502daef73398b Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 09:04:50 +0100 Subject: [PATCH 05/14] fixed issue with None in output type hints --- llama_hub/microsoft_sharepoint/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index d59cc478cb..278adb297e 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -255,12 +255,14 @@ def _download_pages_and_extract_metadata( data = self._get_results_with_odatanext(pages_endpoint) metadata = {} for item in data: - file_metadata = self._extract_page(item, download_dir) - if file_metadata: + try: + file_metadata = self._extract_page(item, download_dir) metadata.update(file_metadata) + except ValueError: + pass return metadata - def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: + def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph. Stores the content as an .html file in the download_dir. 
@@ -289,7 +291,7 @@ def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: ] ) if html_content == "": - return None + raise ValueError(f"The page {item['name']} does not contain a textWebPart.") # Create the directory if it does not exist and save the file. if not os.path.exists(download_dir): From b70102ebe448c13623af93cdc0ee4153f2fd099e Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 11:05:35 +0100 Subject: [PATCH 06/14] running format and lint --- llama_hub/microsoft_sharepoint/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 278adb297e..a295fc5d37 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -291,7 +291,9 @@ def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: ] ) if html_content == "": - raise ValueError(f"The page {item['name']} does not contain a textWebPart.") + raise ValueError( + f"The page {item['name']} does not contain a textWebPart." + ) # Create the directory if it does not exist and save the file. if not os.path.exists(download_dir): From 1146d69326bddff0122e621000c71dde922ae28e Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 15:49:28 +0100 Subject: [PATCH 07/14] implemented batch call for page content --- llama_hub/microsoft_sharepoint/base.py | 52 +++++++++++++++++++------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a295fc5d37..b87ebe2d37 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -253,40 +253,62 @@ def _download_pages_and_extract_metadata( pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" data = self._get_results_with_odatanext(pages_endpoint) + # the maximum is 20 requests per batch + # see https://learn.microsoft.com/en-us/graph/json-batching + batch_size = 20 metadata = {} - for item in data: - try: - file_metadata = self._extract_page(item, download_dir) - metadata.update(file_metadata) - except ValueError: - pass + for i in range(0, len(data), batch_size): + batch = dict(enumerate(data[i : i + batch_size])) + batch_endpoint: str = "https://graph.microsoft.com/beta/$batch" + body = { + "requests": [ + { + "url": f"/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts", + "method": "GET", + "id": idx, + } + for idx, item in batch.items() + ] + } + batch_response = requests.post( + url=batch_endpoint, json=body, headers=self._authorization_headers + ) + for response in batch_response.json()["responses"]: + try: + file_metadata = self._extract_page( + item=batch[int(response["id"])], + response=response, + download_dir=download_dir, + ) + metadata.update(file_metadata) + except ValueError: + pass return metadata - def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: + def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument - from the Microsoft Graph. Stores the content as an .html file in the download_dir. + from the Microsoft Graph batch response. Stores the content as an .html file in the download_dir. 
Args: item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' + response (Dict[str, Any]): A single Microsoft Graph response from a batch request. + Expected to be correlated with the given item. download_dir (str): A directory to download the file to. Returns: The metadata of the item """ - page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" file_name = item["name"].replace(".aspx", ".html") - - response = requests.get(url=page_endpoint, headers=self._authorization_headers) metadata = {} - if response.status_code == 200: + if response.get("status") == 200: html_content = "\n".join( [ i["innerHtml"] - for i in response.json()["value"] + for i in response["body"]["value"] if i["@odata.type"] == "#microsoft.graph.textWebPart" ] ) @@ -305,7 +327,9 @@ def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: return metadata else: logger.error(response.json()["error"]) - raise ValueError(response.json()["error"]) + raise ValueError( + f"status: {response['status']}, body: {response['body']['error']}" + ) def _get_results_with_odatanext(self, request: str, **kwargs): """ From 45d5d41f467a87458e6e4cc8235c16dcf6a807e2 Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 16:10:59 +0100 Subject: [PATCH 08/14] Improved type hints, documentation and comments --- llama_hub/microsoft_sharepoint/base.py | 51 +++++++++++++++++--------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index b87ebe2d37..a1be48d89e 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -44,10 +44,11 @@ def __init__( self._authorization_headers = None def _setup_site_config(self, sharepoint_site_name: str): - access_token = self._get_access_token() - + self._authorization_headers = { + "Authorization": f"Bearer {self._get_access_token()}" + } self._site_id_with_host_name = self._get_site_id_with_host_name( - access_token, sharepoint_site_name + sharepoint_site_name ) def _get_access_token(self) -> str: @@ -81,11 +82,12 @@ def _get_access_token(self) -> str: logger.error(response.json()["error"]) raise ValueError(response.json()["error_description"]) - def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str: + def _get_site_id_with_host_name(self, sharepoint_site_name: str) -> str: """ Retrieves the site ID of a SharePoint site using the provided site name. Args: + access_token (str): access_token sharepoint_site_name (str): The name of the SharePoint site. Returns: @@ -97,7 +99,6 @@ def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str site_information_endpoint = ( f"https://graph.microsoft.com/v1.0/sites?search={sharepoint_site_name}" ) - self._authorization_headers = {"Authorization": f"Bearer {access_token}"} response = requests.get( url=site_information_endpoint, @@ -184,7 +185,7 @@ def _download_files_and_extract_metadata( download_dir: str, include_subfolders: bool, file_types: List[str], - ) -> Dict[str, str]: + ) -> Dict[str, Dict[str, str]]: """ Downloads files from the specified folder ID and extracts metadata. @@ -195,7 +196,7 @@ def _download_files_and_extract_metadata( file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
+ Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded files. Raises: ValueError: If there is an error in downloading the files. @@ -236,8 +237,8 @@ def _download_files_and_extract_metadata( def _download_pages_and_extract_metadata( self, - download_dir, - ): + download_dir: str, + ) -> Dict[str, Dict[str, str]]: """ Downloads Sharepoint pages as HTML files and extracts metadata. @@ -245,7 +246,7 @@ def _download_pages_and_extract_metadata( download_dir (str): The directory where the files should be downloaded. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded Sharepoint pages. + Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded Sharepoint pages. Raises: ValueError: If there is an error in downloading the files. @@ -257,9 +258,14 @@ def _download_pages_and_extract_metadata( # see https://learn.microsoft.com/en-us/graph/json-batching batch_size = 20 metadata = {} + + # request the page content for a batch of 20 pages for i in range(0, len(data), batch_size): + # Create a dict using enumerate to index each item in the batch, to later correlate the result with the original data batch = dict(enumerate(data[i : i + batch_size])) batch_endpoint: str = "https://graph.microsoft.com/beta/$batch" + + # set-up the requests to be made body = { "requests": [ { @@ -273,6 +279,9 @@ def _download_pages_and_extract_metadata( batch_response = requests.post( url=batch_endpoint, json=body, headers=self._authorization_headers ) + + # the result should contain results for all pages. + # If something went wrong, this is indicated in the response per page for response in batch_response.json()["responses"]: try: file_metadata = self._extract_page( @@ -285,20 +294,23 @@ def _download_pages_and_extract_metadata( pass return metadata - def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str]]: + def _extract_page( + self, item: Dict[str, Any], response: Dict[str, Any], download_dir: str + ) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph batch response. Stores the content as an .html file in the download_dir. Args: item (Dict[str, Any]): a sharepoint item that contains - the fields 'id', 'name' and 'webUrl' + the fields 'id', 'name' and 'webUrl'. response (Dict[str, Any]): A single Microsoft Graph response from a batch request. Expected to be correlated with the given item. download_dir (str): A directory to download the file to. Returns: - The metadata of the item + Dict[str, Dict[str, str]]: The file_name of the page stored in the download_dir as key + and the metadata of the page (item) as value """ file_name = item["name"].replace(".aspx", ".html") metadata = {} @@ -331,7 +343,9 @@ def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str f"status: {response['status']}, body: {response['body']['error']}" ) - def _get_results_with_odatanext(self, request: str, **kwargs): + def _get_results_with_odatanext( + self, request: str, **kwargs + ) -> List[Dict[str, Any]]: """ Given a request, checks if the result contains `@odata.nextLink` in the result. If true, this function returns itself calling the @odata.nextLink. 
@@ -341,7 +355,7 @@ def _get_results_with_odatanext(self, request: str, **kwargs): request (str): A GET request to be made, that might include a field '@odata.nextLink' Returns: - Dict[str, str]: A dictionary containing the metadata of the pages to be extracted + List[Dict[str, Any]]: A List with containing the metadata in Dict[str, Any] form of the pages to be extracted """ if "prev_responses" not in kwargs.keys(): prev_responses = [] @@ -413,7 +427,7 @@ def _download_file( self, item: Dict[str, Any], download_dir: str, - ): + ) -> Dict[str, Dict[str, str]]: """ Downloads a file to the temporary download folder and returns its metadata. @@ -439,7 +453,7 @@ def _download_files_from_sharepoint( sharepoint_folder_path: str, recursive: bool, file_types: List[str], - ) -> Dict[str, str]: + ) -> Dict[str, Dict[str, str]]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -451,7 +465,8 @@ def _download_files_from_sharepoint( file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded files. + Dict[str,Dict[str, str]]: A dictionary containing file_name of the stored file + as key and the metadata of the downloaded files as value. """ From 2901c8c5a4db258510222189b7fec7f31908933c Mon Sep 17 00:00:00 2001 From: levi Date: Thu, 8 Feb 2024 16:36:13 +0100 Subject: [PATCH 09/14] fix typo --- llama_hub/microsoft_sharepoint/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a1be48d89e..379710d6c8 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -556,7 +556,7 @@ def load_data( ) if "documents" not in include and (recursive or file_types): logger.warning( - "'documents' is not in 'included', so 'recursive' and 'file_types' have no effect." + "'documents' is not in 'include', so 'recursive' and 'file_types' have no effect." ) try: with tempfile.TemporaryDirectory() as temp_dir: From 0e01fa2aa50cfaa5a98c7367abb347eeb331f940 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 11:33:05 +0100 Subject: [PATCH 10/14] Updated ReadMe --- llama_hub/microsoft_sharepoint/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 64eec3fdb7..ccba57e731 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -25,7 +25,6 @@ This loader can: - Load all files present in the drive of a SharePoint - Load all pages under a SharePoint site -This loader loads the files present in a specific folder in sharepoint. 
If the files are present in the `Test` folder in SharePoint Site under `root` directory, then the input for the loader for `file_path` is `Test` From aca4c9e4f7695fd302fa2d86a7c39c6a5011eb79 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 11:46:08 +0100 Subject: [PATCH 11/14] Updated docstring _get_site_id_with_host_name --- llama_hub/microsoft_sharepoint/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 379710d6c8..6c596cc883 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -87,7 +87,6 @@ def _get_site_id_with_host_name(self, sharepoint_site_name: str) -> str: Retrieves the site ID of a SharePoint site using the provided site name. Args: - access_token (str): access_token sharepoint_site_name (str): The name of the SharePoint site. Returns: From 8647de0883ec05ae3c5a6d18a035b405975f3c71 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 12:40:11 +0100 Subject: [PATCH 12/14] changed default sharepoint_folder_path argument to "" instead of root --- llama_hub/microsoft_sharepoint/base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index e342d69b9e..6c1a0711a5 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -168,12 +168,10 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. """ - if folder_path == "root": - folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" - else: - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" - ) + folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" + + if folder_path: + folder_id_endpoint += f":/{folder_path}" response = requests.get( url=folder_id_endpoint, @@ -530,7 +528,7 @@ def get_metadata(filename: str) -> Any: def load_data( self, sharepoint_site_name: str, - sharepoint_folder_path: str = "root", + sharepoint_folder_path: str = "", recursive: bool = False, include: List[str] = ["documents", "pages"], file_types: List[str] = [], @@ -541,7 +539,7 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. - If `root`, loads data from the root folder of the + If `""` (default), loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. include (List[str]): list of Sharepoint objects to include. From 9f1875a43d674288997e972374068162e7b80afa Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 12:40:31 +0100 Subject: [PATCH 13/14] Updated ReadMe with more explicit instructions on how to use parameters, and what the default parameters are. 
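The README now spells out the defaults: `sharepoint_folder_path=""` (the root of
the drive), `recursive=False`, `include=['documents', 'pages']`, and
`file_types=[]` (all file types). A minimal sketch that relies on those defaults
(credentials and site name are placeholders):

    from llama_index import download_loader

    SharePointLoader = download_loader("SharePointReader")

    loader = SharePointLoader(
        client_id="<client id>",
        client_secret="<client secret>",
        tenant_id="<tenant id>",
    )

    # With the defaults this loads the documents in the root folder of the
    # drive (no subfolders) as well as all pages of the site.
    documents = loader.load_data(sharepoint_site_name="<site name>")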
--- llama_hub/microsoft_sharepoint/README.md | 44 ++++++++++++++++++------ 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index ccba57e731..09a5eb5a56 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -30,7 +30,13 @@ If the files are present in the `Test` folder in SharePoint Site under `root` di ![FilePath](file_path_info.png) -### Example loading a single folder +### Example loading all files and pages +If `sharepoint_folder_path` is not provided it defaults to `""`. +In that case, the root folder of the SharePoint Drive is used as the folder to load files from. + +If both `sharepoint_folder_path` is not provided and `recursive` is set to `True`, all files in the SharePoint Drive are loaded. +If `recursive` is not provided, it defaults to `False`. In this case, files from subfolders are not loaded. + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -42,13 +48,20 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", - sharepoint_folder_path: "", + sharepoint_site_name = "", recursive = True, ) ``` -### Example loading all files and pages +### Example loading a single folder +To load a single folder, specify the `sharepoint_folder_path` with the name of the folder or path from the root directory. +Example: `sharepoint_folder_path = "my/folder/path"` + +In order to load only the documents from this `sharepoint_folder_path`, and not the pages for the `sharepoint_site_name`, +you need to provide the `include` argument as `['documents]`. By default, `include` is equal to `['documents', 'pages']`. + +If you do not want to include files from subfolders for the given `sharepoint_folder_path`, remove the argument `recursive` (defaults to `False`). + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -60,13 +73,21 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", + sharepoint_folder_path = "", recursive = True, - include= ['pages', 'documents'] + include = ['documents'] ) ``` + + ### Example loading just pages +In order to load only the pages for the `sharepoint_site_name`, +you need to provide the `include` argument as `['pages]`. By default, `include` is equal to `['documents', 'pages']`. + +Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in `include`. + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -78,8 +99,7 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", - recursive = True, + sharepoint_site_name = "", include = ['pages'] ) ``` @@ -96,13 +116,17 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", recursive = True, include = ['documents'] ) ``` ### Example loading just documents with filetype .docx or .pdf + +If you want to only load specific filetypes, provide the file extension names in `file_types`. 
+Example: to only include .pdf and .docx files, set `file_types` to `['docx', 'pdf']` + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -114,7 +138,7 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", recursive = True, include = ['documents'], file_types = ['docx', 'pdf'] From ba66cee730f0939b23945c4beed24a2de19ee51f Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 13:12:55 +0100 Subject: [PATCH 14/14] fix typo --- llama_hub/microsoft_sharepoint/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 09a5eb5a56..bc07373a6b 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -55,10 +55,11 @@ documents = loader.load_data( ### Example loading a single folder To load a single folder, specify the `sharepoint_folder_path` with the name of the folder or path from the root directory. + Example: `sharepoint_folder_path = "my/folder/path"` In order to load only the documents from this `sharepoint_folder_path`, and not the pages for the `sharepoint_site_name`, -you need to provide the `include` argument as `['documents]`. By default, `include` is equal to `['documents', 'pages']`. +you need to provide the `include` argument as `['documents']`. By default, `include` is equal to `['documents', 'pages']`. If you do not want to include files from subfolders for the given `sharepoint_folder_path`, remove the argument `recursive` (defaults to `False`). @@ -84,9 +85,9 @@ documents = loader.load_data( ### Example loading just pages In order to load only the pages for the `sharepoint_site_name`, -you need to provide the `include` argument as `['pages]`. By default, `include` is equal to `['documents', 'pages']`. +you need to provide the `include` argument as `['pages']`. By default, `include` is equal to `['documents', 'pages']`. -Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in `include`. +Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in the list of the argument `include`. ```python from llama_index import download_loader