diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md
index 55bbbc2bd..bc07373a6 100644
--- a/llama_hub/microsoft_sharepoint/README.md
+++ b/llama_hub/microsoft_sharepoint/README.md
@@ -20,12 +20,114 @@ More info on Microsoft Graph APIs - [Refer here](https://learn.microsoft.com/en-
 
 To use this loader `client_id`, `client_secret` and `tenant_id` of the registered app in Microsoft Azure Portal is required.
 
-This loader loads the files present in a specific folder in sharepoint.
+This loader can:
+- Load files present in a specific folder in SharePoint
+- Load all files present in the drive of a SharePoint site
+- Load all pages under a SharePoint site
+
 If the files are present in the `Test` folder in SharePoint Site under `root` directory, then the input for the loader for `file_path` is `Test`
 
 ![FilePath](file_path_info.png)
 
+### Example loading all files and pages
+If `sharepoint_folder_path` is not provided, it defaults to `""`.
+In that case, the root folder of the SharePoint Drive is used as the folder to load files from.
+
+If `sharepoint_folder_path` is not provided and `recursive` is set to `True`, all files in the SharePoint Drive are loaded.
+If `recursive` is not provided, it defaults to `False`. In this case, files from subfolders are not loaded.
+
+```python
+from llama_index import download_loader
+SharePointLoader = download_loader("SharePointReader")
+
+loader = SharePointLoader(
+    client_id = "",
+    client_secret = "",
+    tenant_id = ""
+    )
+
+documents = loader.load_data(
+    sharepoint_site_name = "",
+    recursive = True,
+)
+```
+
+### Example loading a single folder
+To load a single folder, specify `sharepoint_folder_path` with the name of the folder or its path from the root directory.
+
+Example: `sharepoint_folder_path = "my/folder/path"`
+
+To load only the documents from this `sharepoint_folder_path`, and not the pages for the `sharepoint_site_name`,
+provide the `include` argument as `['documents']`. By default, `include` is equal to `['documents', 'pages']`.
+
+If you do not want to include files from subfolders of the given `sharepoint_folder_path`, omit the `recursive` argument (it defaults to `False`).
+
+```python
+from llama_index import download_loader
+SharePointLoader = download_loader("SharePointReader")
+
+loader = SharePointLoader(
+    client_id = "",
+    client_secret = "",
+    tenant_id = ""
+    )
+
+documents = loader.load_data(
+    sharepoint_site_name = "",
+    sharepoint_folder_path = "",
+    recursive = True,
+    include = ['documents']
+)
+```
+
+
+### Example loading just pages
+To load only the pages for the `sharepoint_site_name`,
+provide the `include` argument as `['pages']`. By default, `include` is equal to `['documents', 'pages']`.
+
+Note: the `recursive` and `sharepoint_folder_path` arguments have no effect if `'documents'` is not in the `include` list.
+
+```python
+from llama_index import download_loader
+SharePointLoader = download_loader("SharePointReader")
+
+loader = SharePointLoader(
+    client_id = "",
+    client_secret = "",
+    tenant_id = ""
+    )
+
+documents = loader.load_data(
+    sharepoint_site_name = "",
+    include = ['pages']
+)
+```
+
+### Example loading just documents
+```python
+from llama_index import download_loader
+SharePointLoader = download_loader("SharePointReader")
+
+loader = SharePointLoader(
+    client_id = "",
+    client_secret = "",
+    tenant_id = ""
+    )
+
+documents = loader.load_data(
+    sharepoint_site_name = "",
+    recursive = True,
+    include = ['documents']
+)
+```
+
+### Example loading just documents with file types .docx or .pdf
+
+If you want to load only specific file types, provide the file extension names in `file_types`.
+Example: to include only .pdf and .docx files, set `file_types` to `['docx', 'pdf']`.
+
 ```python
 from llama_index import download_loader
 SharePointLoader = download_loader("SharePointReader")
@@ -37,9 +139,10 @@ loader = SharePointLoader(
     )
 
 documents = loader.load_data(
-    sharepoint_site_name: "",
-    sharepoint_folder_path: "",
+    sharepoint_site_name = "",
     recursive = True,
+    include = ['documents'],
+    file_types = ['docx', 'pdf']
 )
 ```
 
diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py
index 96794e896..6c1a0711a 100644
--- a/llama_hub/microsoft_sharepoint/base.py
+++ b/llama_hub/microsoft_sharepoint/base.py
@@ -50,6 +50,14 @@ def __init__(
         self.file_extractor = file_extractor
         self.filename_as_id = filename_as_id
 
+    def _setup_site_config(self, sharepoint_site_name: str):
+        self._authorization_headers = {
+            "Authorization": f"Bearer {self._get_access_token()}"
+        }
+        self._site_id_with_host_name = self._get_site_id_with_host_name(
+            sharepoint_site_name
+        )
+
     def _get_access_token(self) -> str:
         """
         Gets the access_token for accessing file from SharePoint.
@@ -81,7 +89,7 @@ def _get_access_token(self) -> str:
         logger.error(response.json()["error"])
         raise ValueError(response.json()["error_description"])
 
-    def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str:
+    def _get_site_id_with_host_name(self, sharepoint_site_name: str) -> str:
         """
         Retrieves the site ID of a SharePoint site using the provided site name.
 
@@ -97,7 +105,6 @@ def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str
         site_information_endpoint = (
             f"https://graph.microsoft.com/v1.0/sites?search={sharepoint_site_name}"
         )
-        self._authorization_headers = {"Authorization": f"Bearer {access_token}"}
 
         response = requests.get(
             url=site_information_endpoint,
@@ -161,9 +168,10 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str:
         Returns:
             str: The ID of the SharePoint site folder.
         """
-        folder_id_endpoint = (
-            f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}"
-        )
+        folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root"
+
+        if folder_path:
+            folder_id_endpoint += f":/{folder_path}"
 
         response = requests.get(
             url=folder_id_endpoint,
@@ -179,8 +187,9 @@ def _download_files_and_extract_metadata(
         self,
         folder_id: str,
         download_dir: str,
-        include_subfolders: bool = False,
-    ) -> Dict[str, str]:
+        include_subfolders: bool,
+        file_types: List[str],
+    ) -> Dict[str, Dict[str, str]]:
         """
         Downloads files from the specified folder ID and extracts metadata.
 
@@ -188,9 +197,10 @@ def _download_files_and_extract_metadata(
             folder_id (str): The ID of the folder from which the files should be downloaded.
             download_dir (str): The directory where the files should be downloaded.
             include_subfolders (bool): If True, files from all subfolders are downloaded.
+            file_types (List[str]): A list of file types to load. If empty, loads all file types.
 
         Returns:
-            Dict[str, str]: A dictionary containing the metadata of the downloaded files.
+            Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded files.
 
         Raises:
             ValueError: If there is an error in downloading the files.
@@ -214,14 +224,158 @@ def _download_files_and_extract_metadata(
                         folder_id=item["id"],
                         download_dir=sub_folder_download_dir,
                         include_subfolders=include_subfolders,
+                        file_types=file_types,
                     )
                     metadata.update(subfolder_metadata)
                 elif "file" in item:
-                    file_metadata = self._download_file(item, download_dir)
+                    file_type = item["name"].split(".")[-1]
+                    if not file_types or (file_type in file_types):
+                        file_metadata = self._download_file(item, download_dir)
+                        metadata.update(file_metadata)
+            return metadata
+        else:
+            logger.error(response.json()["error"])
+            raise ValueError(response.json()["error"])
+
+    def _download_pages_and_extract_metadata(
+        self,
+        download_dir: str,
+    ) -> Dict[str, Dict[str, str]]:
+        """
+        Downloads SharePoint pages as HTML files and extracts metadata.
+
+        Args:
+            download_dir (str): The directory where the files should be downloaded.
+
+        Returns:
+            Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded SharePoint pages.
+
+        Raises:
+            ValueError: If there is an error in downloading the files.
+        """
+        pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages"
+
+        data = self._get_results_with_odatanext(pages_endpoint)
+        # the maximum is 20 requests per batch
+        # see https://learn.microsoft.com/en-us/graph/json-batching
+        batch_size = 20
+        metadata = {}
+
+        # request the page content for a batch of 20 pages
+        for i in range(0, len(data), batch_size):
+            # Create a dict using enumerate to index each item in the batch, to later correlate the result with the original data
+            batch = dict(enumerate(data[i : i + batch_size]))
+            batch_endpoint: str = "https://graph.microsoft.com/beta/$batch"
+
+            # set up the requests to be made
+            body = {
+                "requests": [
+                    {
+                        "url": f"/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts",
+                        "method": "GET",
+                        "id": idx,
+                    }
+                    for idx, item in batch.items()
+                ]
+            }
+            batch_response = requests.post(
+                url=batch_endpoint, json=body, headers=self._authorization_headers
+            )
+
+            # the result should contain results for all pages.
+            # If something went wrong, this is indicated in the response per page
+            for response in batch_response.json()["responses"]:
+                try:
+                    file_metadata = self._extract_page(
+                        item=batch[int(response["id"])],
+                        response=response,
+                        download_dir=download_dir,
+                    )
                     metadata.update(file_metadata)
+                except ValueError:
+                    pass
+        return metadata
+
+    def _extract_page(
+        self, item: Dict[str, Any], response: Dict[str, Any], download_dir: str
+    ) -> Dict[str, Dict[str, str]]:
+        """
+        Retrieves the HTML content of the SharePoint page referenced by the 'item' argument
+        from the Microsoft Graph batch response. Stores the content as an .html file in the download_dir.
+
+        Args:
+            item (Dict[str, Any]): a SharePoint item that contains
+                the fields 'id', 'name' and 'webUrl'.
+            response (Dict[str, Any]): A single Microsoft Graph response from a batch request.
+                Expected to be correlated with the given item.
+            download_dir (str): A directory to download the file to.
+
+        Returns:
+            Dict[str, Dict[str, str]]: A dictionary with the file path of the page stored in the download_dir
+                as key and the metadata of the page (item) as value.
+        """
+        file_name = item["name"].replace(".aspx", ".html")
+        metadata = {}
+
+        if response.get("status") == 200:
+
+            html_content = "\n".join(
+                [
+                    i["innerHtml"]
+                    for i in response["body"]["value"]
+                    if i["@odata.type"] == "#microsoft.graph.textWebPart"
+                ]
+            )
+            if html_content == "":
+                raise ValueError(
+                    f"The page {item['name']} does not contain a textWebPart."
+                )
+
+            # Create the directory if it does not exist and save the file.
+            if not os.path.exists(download_dir):
+                os.makedirs(download_dir)
+            file_path = os.path.join(download_dir, file_name)
+            with open(file_path, "w") as f:
+                f.write(html_content)
+            metadata[file_path] = self._extract_metadata_for_file(item)
             return metadata
+        else:
+            logger.error(response["body"]["error"])
+            raise ValueError(
+                f"status: {response['status']}, body: {response['body']['error']}"
+            )
+
+    def _get_results_with_odatanext(
+        self, request: str, **kwargs
+    ) -> List[Dict[str, Any]]:
+        """
+        Given a request URL, retrieves the results and checks whether the response contains `@odata.nextLink`.
+        If it does, this function calls itself with the `@odata.nextLink` URL to fetch the remaining results.
+        If not, it returns a list of all retrieved values.
+
+        Args:
+            request (str): The URL of a GET request to be made; the response may include a field '@odata.nextLink'
+
+        Returns:
+            List[Dict[str, Any]]: A list containing the metadata, as Dict[str, Any], of the pages to be extracted
+        """
+        if "prev_responses" not in kwargs.keys():
+            prev_responses = []
+        else:
+            prev_responses = kwargs["prev_responses"]
+        response = requests.get(url=request, headers=self._authorization_headers)
+        if response.status_code == 200:
+            result: Dict[str, Any] = response.json()
+            prev_responses += result["value"]
+            if "@odata.nextLink" in result.keys():
+                return self._get_results_with_odatanext(
+                    request=result["@odata.nextLink"],
+                    prev_responses=prev_responses,
+                )
+            else:
+                return prev_responses
         else:
             logger.error(response.json()["error"])
             raise ValueError(response.json()["error"])
@@ -277,7 +431,19 @@ def _download_file(
         self,
         item: Dict[str, Any],
         download_dir: str,
-    ):
+    ) -> Dict[str, Dict[str, str]]:
+        """
+        Downloads a file to the temporary download folder and returns
+        its metadata.
+
+        Args:
+            item (Dict[str, Any]): a SharePoint item that contains
+                the fields 'id', 'name' and 'webUrl'
+            download_dir (str): A directory to download the file to.
+
+        Returns:
+            Dict[str, Dict[str, str]]: The metadata of the item.
+        """
         metadata = {}
 
         file_path = self._download_file_by_url(item, download_dir)
@@ -288,10 +454,10 @@ def _download_file(
     def _download_files_from_sharepoint(
         self,
         download_dir: str,
-        sharepoint_site_name: str,
         sharepoint_folder_path: str,
         recursive: bool,
-    ) -> Dict[str, str]:
+        file_types: List[str],
+    ) -> Dict[str, Dict[str, str]]:
         """
         Downloads files from the specified folder and returns the metadata
         for the downloaded files.
@@ -300,16 +466,13 @@ def _download_files_from_sharepoint(
             sharepoint_site_name (str): The name of the SharePoint site.
             sharepoint_folder_path (str): The path of the folder in the SharePoint site.
             recursive (bool): If True, files from all subfolders are downloaded.
+            file_types (List[str]): A list of file types to load. If empty, loads all file types.
 
         Returns:
-            Dict[str, str]: A dictionary containing the metadata of the downloaded files.
+            Dict[str, Dict[str, str]]: A dictionary containing the file_name of the stored file
+                as key and the metadata of the downloaded files as value.
         """
-        access_token = self._get_access_token()
-
-        self._site_id_with_host_name = self._get_site_id_with_host_name(
-            access_token, sharepoint_site_name
-        )
 
         self._drive_id = self._get_drive_id()
 
@@ -318,7 +481,10 @@ def _download_files_from_sharepoint(
         )
 
         metadata = self._download_files_and_extract_metadata(
-            self.sharepoint_folder_id, download_dir, recursive
+            folder_id=self.sharepoint_folder_id,
+            download_dir=download_dir,
+            include_subfolders=recursive,
+            file_types=file_types,
         )
 
         return metadata
@@ -362,8 +528,10 @@ def get_metadata(filename: str) -> Any:
     def load_data(
         self,
         sharepoint_site_name: str,
-        sharepoint_folder_path: str,
+        sharepoint_folder_path: str = "",
         recursive: bool = False,
+        include: List[str] = ["documents", "pages"],
+        file_types: List[str] = [],
     ) -> List[Document]:
         """
         Loads the files from the specified folder in the SharePoint site.
@@ -371,7 +539,13 @@ def load_data(
         Args:
             sharepoint_site_name (str): The name of the SharePoint site.
             sharepoint_folder_path (str): The path of the folder in the SharePoint site.
+                If `""` (default), loads data from the root folder of the
+                SharePoint site.
             recursive (bool): If True, files from all subfolders are downloaded.
+            include (List[str]): list of SharePoint object types to include.
+                Must contain 'documents', 'pages', or both.
+            file_types (List[str]): list of file extensions to include when downloading from
+                the SharePoint Drive. Leave empty to download all file types.
 
         Returns:
             List[Document]: A list containing the documents with metadata.
@@ -379,15 +553,37 @@ def load_data(
         Raises:
             Exception: If an error occurs while accessing SharePoint site.
         """
+        if not include:
+            raise ValueError(
+                "'include' should not be an empty list; it must contain 'documents', 'pages', or both"
+            )
+        if any([i not in ["documents", "pages"] for i in include]):
+            raise ValueError(
+                "'include' contains an unexpected value. "
+                + f"Valid values are ['documents', 'pages'], but got {include}"
+            )
+        if "documents" not in include and (recursive or file_types):
+            logger.warning(
+                "'documents' is not in 'include', so 'recursive' and 'file_types' have no effect."
+            )
         try:
             with tempfile.TemporaryDirectory() as temp_dir:
-                files_metadata = self._download_files_from_sharepoint(
-                    temp_dir, sharepoint_site_name, sharepoint_folder_path, recursive
-                )
-                # return self.files_metadata
+                self._setup_site_config(sharepoint_site_name)
+                files_metadata = {}
+                if "documents" in include:
+                    files_metadata.update(
+                        self._download_files_from_sharepoint(
+                            temp_dir, sharepoint_folder_path, recursive, file_types
+                        )
+                    )
+                if "pages" in include:
+                    files_metadata.update(
+                        self._download_pages_and_extract_metadata(temp_dir)
+                    )
                 return self._load_documents_with_metadata(
                     files_metadata, temp_dir, recursive
                 )
         except Exception as exp:
-            logger.error("An error occurred while accessing SharePoint: %s", exp)
+            logger.error(
+                "An error occurred while accessing SharePoint: %s", exp, exc_info=True
+            )
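
For context on the pagination pattern that `_get_results_with_odatanext` implements above: Microsoft Graph returns collection results in pages and includes an `@odata.nextLink` URL while more results remain. Below is a minimal standalone sketch of the same idea, written iteratively rather than recursively; the function name, endpoint, and token shown are illustrative placeholders, not values from this PR.

```python
from typing import Any, Dict, List, Optional

import requests


def fetch_all_graph_results(url: str, headers: Dict[str, str]) -> List[Dict[str, Any]]:
    """Collect every page of a Microsoft Graph collection by following @odata.nextLink."""
    results: List[Dict[str, Any]] = []
    next_url: Optional[str] = url
    while next_url:
        response = requests.get(url=next_url, headers=headers)
        response.raise_for_status()
        payload = response.json()
        # Each page of results carries its items under "value".
        results.extend(payload.get("value", []))
        # "@odata.nextLink" is present only while more pages remain.
        next_url = payload.get("@odata.nextLink")
    return results


# Hypothetical usage; the site ID and bearer token are placeholders.
# pages = fetch_all_graph_results(
#     "https://graph.microsoft.com/beta/sites/<site-id>/pages",
#     {"Authorization": "Bearer <access-token>"},
# )
```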