From da3a6994d2dd0ef242b19c53d4d56b3a00b24a06 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 14:05:29 +0100 Subject: [PATCH 01/14] added functionality to load all documents, added functionality to load all pages --- llama_hub/microsoft_sharepoint/README.md | 41 +++++++ llama_hub/microsoft_sharepoint/base.py | 130 ++++++++++++++++++++--- 2 files changed, 156 insertions(+), 15 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 55bbbc2bd2..b34a45733d 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -20,12 +20,18 @@ More info on Microsoft Graph APIs - [Refer here](https://learn.microsoft.com/en- To use this loader `client_id`, `client_secret` and `tenant_id` of the registered app in Microsoft Azure Portal is required. +This loader can: +- Load files present in a specific folder in SharePoint +- Load all files present in the drive of a SharePoint +- Load all pages under a SharePoint site + This loader loads the files present in a specific folder in sharepoint. If the files are present in the `Test` folder in SharePoint Site under `root` directory, then the input for the loader for `file_path` is `Test` ![FilePath](file_path_info.png) +### Example loading a single folder ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -43,5 +49,40 @@ documents = loader.load_data( ) ``` +### Example loading all files, no pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, +) +``` + +### Example loading all files and pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include_pages = True +) +``` + The loader doesn't access other components of the `SharePoint Site`. diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 9b42ea668a..a240e3af86 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -43,6 +43,13 @@ def __init__( self.tenant_id = tenant_id self._authorization_headers = None + def _setup_site_config(self, sharepoint_site_name: str): + access_token = self._get_access_token() + + self._site_id_with_host_name = self._get_site_id_with_host_name( + access_token, sharepoint_site_name + ) + def _get_access_token(self) -> str: """ Gets the access_token for accessing file from SharePoint. @@ -154,9 +161,14 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. 
""" - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" - ) + if folder_path == 'root': + folder_id_endpoint = ( + f"{self._drive_id_endpoint}/{self._drive_id}/root" + ) + else: + folder_id_endpoint = ( + f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" + ) response = requests.get( url=folder_id_endpoint, @@ -218,6 +230,83 @@ def _download_files_and_extract_metadata( else: logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) + + def _download_pages_and_extract_metadata( + self, + download_dir, + ): + pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" + + data = self._get_results_with_odatanext(pages_endpoint) + metadata = {} + for item in data: + file_metadata = self._extract_page(item, download_dir) + if file_metadata: + metadata.update(file_metadata) + return metadata + + def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" + file_name = item['name'].replace('.aspx', '.html') + + response = requests.get(url=page_endpoint, headers=self._authorization_headers) + metadata = {} + + if response.status_code == 200: + + html_content = "\n".join( + [ + i["innerHtml"] + for i in response.json()['value'] + if i["@odata.type"] == "#microsoft.graph.textWebPart" + ] + ) + if html_content == "": + return None + + # Create the directory if it does not exist and save the file. + if not os.path.exists(download_dir): + os.makedirs(download_dir) + file_path = os.path.join(download_dir, file_name) + with open(file_path, "w") as f: + f.write(html_content) + metadata[file_path] = self._extract_metadata_for_file(item) + return metadata + else: + logger.error(response.json()["error"]) + raise ValueError(response.json()["error"]) + + def _get_results_with_odatanext(self, request: str, **kwargs): + """ + Given a request, checks if the result contains `@odata.nextLink` in the result. + If true, this function returns itself calling the @odata.nextLink. + If false, this function returns a list of all retrieved values. + + Args: + request (str): A GET request to be made, that might include a field '@odata.nextLink' + + Returns: + Dict[str, str]: A dictionary containing the metadata of the pages to be extracted + """ + if "prev_responses" not in kwargs.keys(): + prev_responses = [] + else: + prev_responses = kwargs["prev_responses"] + response = requests.get(url=request, headers=self._authorization_headers) + if response.status_code == 200: + result: Dict[str, Any] = response.json() + prev_responses += result["value"] + if "@odata.nextLink" in result.keys(): + return self._get_results_with_odatanext( + request=result["@odata.nextLink"], + prev_responses=prev_responses, + ) + else: + return prev_responses + else: + logger.error(response.json()["error"]) + raise ValueError(response.json()["error"]) + def _download_file_by_url(self, item: Dict[str, Any], download_dir: str) -> str: """ @@ -278,10 +367,11 @@ def _download_file( metadata[file_path] = self._extract_metadata_for_file(item) return metadata + + def _download_files_from_sharepoint( self, download_dir: str, - sharepoint_site_name: str, sharepoint_folder_path: str, recursive: bool, ) -> Dict[str, str]: @@ -298,11 +388,7 @@ def _download_files_from_sharepoint( Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
""" - access_token = self._get_access_token() - self._site_id_with_host_name = self._get_site_id_with_host_name( - access_token, sharepoint_site_name - ) self._drive_id = self._get_drive_id() @@ -351,8 +437,10 @@ def get_metadata(filename: str) -> Any: def load_data( self, sharepoint_site_name: str, - sharepoint_folder_path: str, + sharepoint_folder_path: str = "root", recursive: bool = False, + include_documents: bool = True, + include_pages: bool = False ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -360,7 +448,12 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. + If `root`, loads data from the root folder of the + SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. + include_documents (bool): If True, loads documents for the given + sharepoint_site_name and sharepoint_folder_path. + include_pages (bool): If True, loads SharePoint pages for the given site_name. Returns: List[Document]: A list containing the documents with metadata. @@ -370,13 +463,20 @@ def load_data( """ try: with tempfile.TemporaryDirectory() as temp_dir: - files_metadata = self._download_files_from_sharepoint( - temp_dir, sharepoint_site_name, sharepoint_folder_path, recursive - ) + self._setup_site_config(sharepoint_site_name) + files_metadata = {} + if include_documents: + files_metadata.update(self._download_files_from_sharepoint( + temp_dir, sharepoint_folder_path, recursive + )) + if include_pages: + files_metadata.update(self._download_pages_and_extract_metadata( + temp_dir + )) # return self.files_metadata return self._load_documents_with_metadata( - files_metadata, temp_dir, recursive - ) + files_metadata, temp_dir, recursive) + except Exception as exp: - logger.error("An error occurred while accessing SharePoint: %s", exp) + logger.error("An error occurred while accessing SharePoint: ", exc_info=True) From 7211e27dafd79b579e9a7fc0ad490a8b3a6fb141 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 15:27:34 +0100 Subject: [PATCH 02/14] updated with new 'include' argument and 'file_types' argument to filter file_types when loading documents. 
--- llama_hub/microsoft_sharepoint/README.md | 44 +++++++++++++++++++++-- llama_hub/microsoft_sharepoint/base.py | 45 ++++++++++++++++-------- 2 files changed, 71 insertions(+), 18 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index b34a45733d..64eec3fdb7 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -49,7 +49,7 @@ documents = loader.load_data( ) ``` -### Example loading all files, no pages +### Example loading all files and pages ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -63,10 +63,47 @@ loader = SharePointLoader( documents = loader.load_data( sharepoint_site_name: "", recursive = True, + include= ['pages', 'documents'] ) ``` -### Example loading all files and pages +### Example loading just pages +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include = ['pages'] +) +``` + +### Example loading just documents +```python +from llama_index import download_loader +SharePointLoader = download_loader("SharePointReader") + +loader = SharePointLoader( + client_id = "", + client_secret = "", + tenant_id = "" + ) + +documents = loader.load_data( + sharepoint_site_name: "", + recursive = True, + include = ['documents'] +) +``` + +### Example loading just documents with filetype .docx or .pdf ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -80,7 +117,8 @@ loader = SharePointLoader( documents = loader.load_data( sharepoint_site_name: "", recursive = True, - include_pages = True + include = ['documents'], + file_types = ['docx', 'pdf'] ) ``` diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a240e3af86..ffd9c51842 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -184,7 +184,8 @@ def _download_files_and_extract_metadata( self, folder_id: str, download_dir: str, - include_subfolders: bool = False, + include_subfolders: bool, + file_types: List[str] ) -> Dict[str, str]: """ Downloads files from the specified folder ID and extracts metadata. @@ -193,6 +194,7 @@ def _download_files_and_extract_metadata( folder_id (str): The ID of the folder from which the files should be downloaded. download_dir (str): The directory where the files should be downloaded. include_subfolders (bool): If True, files from all subfolders are downloaded. + file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
@@ -219,13 +221,16 @@ def _download_files_and_extract_metadata( folder_id=item["id"], download_dir=sub_folder_download_dir, include_subfolders=include_subfolders, + file_types=file_types ) metadata.update(subfolder_metadata) elif "file" in item: - file_metadata = self._download_file(item, download_dir) - metadata.update(file_metadata) + file_type = item['name'].split('.')[-1] + if not file_types or (file_type in file_types): + file_metadata = self._download_file(item, download_dir) + metadata.update(file_metadata) return metadata else: logger.error(response.json()["error"]) @@ -374,6 +379,7 @@ def _download_files_from_sharepoint( download_dir: str, sharepoint_folder_path: str, recursive: bool, + file_types: List[str] ) -> Dict[str, str]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -383,6 +389,7 @@ def _download_files_from_sharepoint( sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. + file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: Dict[str, str]: A dictionary containing the metadata of the downloaded files. @@ -397,7 +404,10 @@ def _download_files_from_sharepoint( ) metadata = self._download_files_and_extract_metadata( - self.sharepoint_folder_id, download_dir, recursive + folder_id=self.sharepoint_folder_id, + download_dir=download_dir, + include_subfolders=recursive, + file_types=file_types ) return metadata @@ -439,8 +449,8 @@ def load_data( sharepoint_site_name: str, sharepoint_folder_path: str = "root", recursive: bool = False, - include_documents: bool = True, - include_pages: bool = False + include: List[str] = ['documents', 'pages'], + file_types: List[str] = [] ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -451,9 +461,10 @@ def load_data( If `root`, loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. - include_documents (bool): If True, loads documents for the given - sharepoint_site_name and sharepoint_folder_path. - include_pages (bool): If True, loads SharePoint pages for the given site_name. + include (List[str]): list of Sharepoint objects to include. + Must contain at least 'pages' or 'documents' or both. + file_types (List[str]): list of file extensions to include when downloading from + the Sharepoint Drive. Leave empty to download all filetypes. Returns: List[Document]: A list containing the documents with metadata. @@ -461,22 +472,26 @@ def load_data( Raises: Exception: If an error occurs while accessing SharePoint site. """ + if not include: + raise ValueError("'include' should not be an empty list, and include either 'documents' and/or 'pages'") + if any([i not in ['documents', 'pages'] for i in include]): + raise ValueError("'include' contains an unexpected value. 
" + + f"Valid values are ['documents', 'pages'], but got {include}") + if 'documents' not in include and (recursive or file_types): + logger.warning("'documents' is not in 'included', so 'recursive' and 'file_types' have no effect.") try: with tempfile.TemporaryDirectory() as temp_dir: self._setup_site_config(sharepoint_site_name) files_metadata = {} - if include_documents: + if 'documents' in include: files_metadata.update(self._download_files_from_sharepoint( - temp_dir, sharepoint_folder_path, recursive + temp_dir, sharepoint_folder_path, recursive, file_types )) - if include_pages: + if 'pages' in include: files_metadata.update(self._download_pages_and_extract_metadata( temp_dir )) - # return self.files_metadata return self._load_documents_with_metadata( files_metadata, temp_dir, recursive) - - except Exception as exp: logger.error("An error occurred while accessing SharePoint: ", exc_info=True) From ba27fab4189095af21edb0e48a7d2d45e82c0434 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 15:56:42 +0100 Subject: [PATCH 03/14] Added documentation --- llama_hub/microsoft_sharepoint/base.py | 36 ++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index ffd9c51842..be8e7aa008 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -240,6 +240,18 @@ def _download_pages_and_extract_metadata( self, download_dir, ): + """ + Downloads Sharepoint pages as HTML files and extracts metadata. + + Args: + download_dir (str): The directory where the files should be downloaded. + + Returns: + Dict[str, str]: A dictionary containing the metadata of the downloaded Sharepoint pages. + + Raises: + ValueError: If there is an error in downloading the files. + """ pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" data = self._get_results_with_odatanext(pages_endpoint) @@ -251,6 +263,18 @@ def _download_pages_and_extract_metadata( return metadata def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + """ + Retrieves the HTML content of the SharePoint page referenced by the 'item' argument + from the Microsoft Graph. Stores the content as an .html file in the download_dir. + + Args: + item (Dict[str, Any]): a sharepoint item that contains + the fields 'id', 'name' and 'webUrl' + download_dir (str): A directory to download the file to. + + Returns: + The metadata of the item + """ page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" file_name = item['name'].replace('.aspx', '.html') @@ -365,6 +389,18 @@ def _download_file( item: Dict[str, Any], download_dir: str, ): + """ + Downloads a file to the temporary download folder and returns + its metadata. + + Args: + item (Dict[str, Any]): a sharepoint item that contains + the fields 'id', 'name' and 'webUrl' + download_dir (str): A directory to download the file to. 
+ + Returns: + The metadata of the item + """ metadata = {} file_path = self._download_file_by_url(item, download_dir) From ea1dfb67e5186dd6bb349cb0948e0a7b877b69e2 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 6 Feb 2024 16:01:05 +0100 Subject: [PATCH 04/14] running format and lint --- llama_hub/microsoft_sharepoint/base.py | 101 +++++++++++++------------ 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index be8e7aa008..d59cc478cb 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -161,10 +161,8 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. """ - if folder_path == 'root': - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root" - ) + if folder_path == "root": + folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" else: folder_id_endpoint = ( f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" @@ -185,7 +183,7 @@ def _download_files_and_extract_metadata( folder_id: str, download_dir: str, include_subfolders: bool, - file_types: List[str] + file_types: List[str], ) -> Dict[str, str]: """ Downloads files from the specified folder ID and extracts metadata. @@ -221,13 +219,13 @@ def _download_files_and_extract_metadata( folder_id=item["id"], download_dir=sub_folder_download_dir, include_subfolders=include_subfolders, - file_types=file_types + file_types=file_types, ) metadata.update(subfolder_metadata) elif "file" in item: - file_type = item['name'].split('.')[-1] + file_type = item["name"].split(".")[-1] if not file_types or (file_type in file_types): file_metadata = self._download_file(item, download_dir) metadata.update(file_metadata) @@ -235,11 +233,11 @@ def _download_files_and_extract_metadata( else: logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) - + def _download_pages_and_extract_metadata( - self, - download_dir, - ): + self, + download_dir, + ): """ Downloads Sharepoint pages as HTML files and extracts metadata. @@ -262,13 +260,13 @@ def _download_pages_and_extract_metadata( metadata.update(file_metadata) return metadata - def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: + def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: """ - Retrieves the HTML content of the SharePoint page referenced by the 'item' argument + Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph. Stores the content as an .html file in the download_dir. Args: - item (Dict[str, Any]): a sharepoint item that contains + item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' download_dir (str): A directory to download the file to. 
@@ -276,7 +274,7 @@ def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: The metadata of the item """ page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" - file_name = item['name'].replace('.aspx', '.html') + file_name = item["name"].replace(".aspx", ".html") response = requests.get(url=page_endpoint, headers=self._authorization_headers) metadata = {} @@ -286,7 +284,7 @@ def _extract_page(self, item, download_dir) -> None|Dict[str, Dict[str, str]]: html_content = "\n".join( [ i["innerHtml"] - for i in response.json()['value'] + for i in response.json()["value"] if i["@odata.type"] == "#microsoft.graph.textWebPart" ] ) @@ -316,7 +314,7 @@ def _get_results_with_odatanext(self, request: str, **kwargs): Returns: Dict[str, str]: A dictionary containing the metadata of the pages to be extracted - """ + """ if "prev_responses" not in kwargs.keys(): prev_responses = [] else: @@ -336,7 +334,6 @@ def _get_results_with_odatanext(self, request: str, **kwargs): logger.error(response.json()["error"]) raise ValueError(response.json()["error"]) - def _download_file_by_url(self, item: Dict[str, Any], download_dir: str) -> str: """ Downloads the file from the provided URL. @@ -391,10 +388,10 @@ def _download_file( ): """ Downloads a file to the temporary download folder and returns - its metadata. + its metadata. Args: - item (Dict[str, Any]): a sharepoint item that contains + item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' download_dir (str): A directory to download the file to. @@ -408,14 +405,12 @@ def _download_file( metadata[file_path] = self._extract_metadata_for_file(item) return metadata - - def _download_files_from_sharepoint( self, download_dir: str, sharepoint_folder_path: str, recursive: bool, - file_types: List[str] + file_types: List[str], ) -> Dict[str, str]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -432,7 +427,6 @@ def _download_files_from_sharepoint( """ - self._drive_id = self._get_drive_id() self.sharepoint_folder_id = self._get_sharepoint_folder_id( @@ -440,10 +434,10 @@ def _download_files_from_sharepoint( ) metadata = self._download_files_and_extract_metadata( - folder_id=self.sharepoint_folder_id, - download_dir=download_dir, - include_subfolders=recursive, - file_types=file_types + folder_id=self.sharepoint_folder_id, + download_dir=download_dir, + include_subfolders=recursive, + file_types=file_types, ) return metadata @@ -485,8 +479,8 @@ def load_data( sharepoint_site_name: str, sharepoint_folder_path: str = "root", recursive: bool = False, - include: List[str] = ['documents', 'pages'], - file_types: List[str] = [] + include: List[str] = ["documents", "pages"], + file_types: List[str] = [], ) -> List[Document]: """ Loads the files from the specified folder in the SharePoint site. @@ -494,10 +488,10 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. - If `root`, loads data from the root folder of the + If `root`, loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. - include (List[str]): list of Sharepoint objects to include. + include (List[str]): list of Sharepoint objects to include. Must contain at least 'pages' or 'documents' or both. 
file_types (List[str]): list of file extensions to include when downloading from the Sharepoint Drive. Leave empty to download all filetypes. @@ -509,25 +503,36 @@ def load_data( Exception: If an error occurs while accessing SharePoint site. """ if not include: - raise ValueError("'include' should not be an empty list, and include either 'documents' and/or 'pages'") - if any([i not in ['documents', 'pages'] for i in include]): - raise ValueError("'include' contains an unexpected value. " + - f"Valid values are ['documents', 'pages'], but got {include}") - if 'documents' not in include and (recursive or file_types): - logger.warning("'documents' is not in 'included', so 'recursive' and 'file_types' have no effect.") + raise ValueError( + "'include' should not be an empty list, and include either 'documents' and/or 'pages'" + ) + if any([i not in ["documents", "pages"] for i in include]): + raise ValueError( + "'include' contains an unexpected value. " + + f"Valid values are ['documents', 'pages'], but got {include}" + ) + if "documents" not in include and (recursive or file_types): + logger.warning( + "'documents' is not in 'included', so 'recursive' and 'file_types' have no effect." + ) try: with tempfile.TemporaryDirectory() as temp_dir: self._setup_site_config(sharepoint_site_name) files_metadata = {} - if 'documents' in include: - files_metadata.update(self._download_files_from_sharepoint( - temp_dir, sharepoint_folder_path, recursive, file_types - )) - if 'pages' in include: - files_metadata.update(self._download_pages_and_extract_metadata( - temp_dir - )) + if "documents" in include: + files_metadata.update( + self._download_files_from_sharepoint( + temp_dir, sharepoint_folder_path, recursive, file_types + ) + ) + if "pages" in include: + files_metadata.update( + self._download_pages_and_extract_metadata(temp_dir) + ) return self._load_documents_with_metadata( - files_metadata, temp_dir, recursive) + files_metadata, temp_dir, recursive + ) except Exception as exp: - logger.error("An error occurred while accessing SharePoint: ", exc_info=True) + logger.error( + "An error occurred while accessing SharePoint: %s", exp, exc_info=True + ) From caaf94fe9db91f07fb929639afd502daef73398b Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 09:04:50 +0100 Subject: [PATCH 05/14] fixed issue with None in output type hints --- llama_hub/microsoft_sharepoint/base.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index d59cc478cb..278adb297e 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -255,12 +255,14 @@ def _download_pages_and_extract_metadata( data = self._get_results_with_odatanext(pages_endpoint) metadata = {} for item in data: - file_metadata = self._extract_page(item, download_dir) - if file_metadata: + try: + file_metadata = self._extract_page(item, download_dir) metadata.update(file_metadata) + except ValueError: + pass return metadata - def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: + def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph. Stores the content as an .html file in the download_dir. 
@@ -289,7 +291,7 @@ def _extract_page(self, item, download_dir) -> None | Dict[str, Dict[str, str]]: ] ) if html_content == "": - return None + raise ValueError(f"The page {item['name']} does not contain a textWebPart.") # Create the directory if it does not exist and save the file. if not os.path.exists(download_dir): From b70102ebe448c13623af93cdc0ee4153f2fd099e Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 11:05:35 +0100 Subject: [PATCH 06/14] running format and lint --- llama_hub/microsoft_sharepoint/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 278adb297e..a295fc5d37 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -291,7 +291,9 @@ def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: ] ) if html_content == "": - raise ValueError(f"The page {item['name']} does not contain a textWebPart.") + raise ValueError( + f"The page {item['name']} does not contain a textWebPart." + ) # Create the directory if it does not exist and save the file. if not os.path.exists(download_dir): From 1146d69326bddff0122e621000c71dde922ae28e Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 15:49:28 +0100 Subject: [PATCH 07/14] implemented batch call for page content --- llama_hub/microsoft_sharepoint/base.py | 52 +++++++++++++++++++------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a295fc5d37..b87ebe2d37 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -253,40 +253,62 @@ def _download_pages_and_extract_metadata( pages_endpoint = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages" data = self._get_results_with_odatanext(pages_endpoint) + # the maximum is 20 requests per batch + # see https://learn.microsoft.com/en-us/graph/json-batching + batch_size = 20 metadata = {} - for item in data: - try: - file_metadata = self._extract_page(item, download_dir) - metadata.update(file_metadata) - except ValueError: - pass + for i in range(0, len(data), batch_size): + batch = dict(enumerate(data[i : i + batch_size])) + batch_endpoint: str = "https://graph.microsoft.com/beta/$batch" + body = { + "requests": [ + { + "url": f"/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts", + "method": "GET", + "id": idx, + } + for idx, item in batch.items() + ] + } + batch_response = requests.post( + url=batch_endpoint, json=body, headers=self._authorization_headers + ) + for response in batch_response.json()["responses"]: + try: + file_metadata = self._extract_page( + item=batch[int(response["id"])], + response=response, + download_dir=download_dir, + ) + metadata.update(file_metadata) + except ValueError: + pass return metadata - def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: + def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument - from the Microsoft Graph. Stores the content as an .html file in the download_dir. + from the Microsoft Graph batch response. Stores the content as an .html file in the download_dir. 
Args: item (Dict[str, Any]): a sharepoint item that contains the fields 'id', 'name' and 'webUrl' + response (Dict[str, Any]): A single Microsoft Graph response from a batch request. + Expected to be correlated with the given item. download_dir (str): A directory to download the file to. Returns: The metadata of the item """ - page_endpoint: str = f"https://graph.microsoft.com/beta/sites/{self._site_id_with_host_name}/pages/{item['id']}/microsoft.graph.sitepage/webparts" file_name = item["name"].replace(".aspx", ".html") - - response = requests.get(url=page_endpoint, headers=self._authorization_headers) metadata = {} - if response.status_code == 200: + if response.get("status") == 200: html_content = "\n".join( [ i["innerHtml"] - for i in response.json()["value"] + for i in response["body"]["value"] if i["@odata.type"] == "#microsoft.graph.textWebPart" ] ) @@ -305,7 +327,9 @@ def _extract_page(self, item, download_dir) -> Dict[str, Dict[str, str]]: return metadata else: logger.error(response.json()["error"]) - raise ValueError(response.json()["error"]) + raise ValueError( + f"status: {response['status']}, body: {response['body']['error']}" + ) def _get_results_with_odatanext(self, request: str, **kwargs): """ From 45d5d41f467a87458e6e4cc8235c16dcf6a807e2 Mon Sep 17 00:00:00 2001 From: levi Date: Wed, 7 Feb 2024 16:10:59 +0100 Subject: [PATCH 08/14] Improved type hints, documentation and comments --- llama_hub/microsoft_sharepoint/base.py | 51 +++++++++++++++++--------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index b87ebe2d37..a1be48d89e 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -44,10 +44,11 @@ def __init__( self._authorization_headers = None def _setup_site_config(self, sharepoint_site_name: str): - access_token = self._get_access_token() - + self._authorization_headers = { + "Authorization": f"Bearer {self._get_access_token()}" + } self._site_id_with_host_name = self._get_site_id_with_host_name( - access_token, sharepoint_site_name + sharepoint_site_name ) def _get_access_token(self) -> str: @@ -81,11 +82,12 @@ def _get_access_token(self) -> str: logger.error(response.json()["error"]) raise ValueError(response.json()["error_description"]) - def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str: + def _get_site_id_with_host_name(self, sharepoint_site_name: str) -> str: """ Retrieves the site ID of a SharePoint site using the provided site name. Args: + access_token (str): access_token sharepoint_site_name (str): The name of the SharePoint site. Returns: @@ -97,7 +99,6 @@ def _get_site_id_with_host_name(self, access_token, sharepoint_site_name) -> str site_information_endpoint = ( f"https://graph.microsoft.com/v1.0/sites?search={sharepoint_site_name}" ) - self._authorization_headers = {"Authorization": f"Bearer {access_token}"} response = requests.get( url=site_information_endpoint, @@ -184,7 +185,7 @@ def _download_files_and_extract_metadata( download_dir: str, include_subfolders: bool, file_types: List[str], - ) -> Dict[str, str]: + ) -> Dict[str, Dict[str, str]]: """ Downloads files from the specified folder ID and extracts metadata. @@ -195,7 +196,7 @@ def _download_files_and_extract_metadata( file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded files. 
+ Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded files. Raises: ValueError: If there is an error in downloading the files. @@ -236,8 +237,8 @@ def _download_files_and_extract_metadata( def _download_pages_and_extract_metadata( self, - download_dir, - ): + download_dir: str, + ) -> Dict[str, Dict[str, str]]: """ Downloads Sharepoint pages as HTML files and extracts metadata. @@ -245,7 +246,7 @@ def _download_pages_and_extract_metadata( download_dir (str): The directory where the files should be downloaded. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded Sharepoint pages. + Dict[str, Dict[str, str]]: A dictionary containing the metadata of the downloaded Sharepoint pages. Raises: ValueError: If there is an error in downloading the files. @@ -257,9 +258,14 @@ def _download_pages_and_extract_metadata( # see https://learn.microsoft.com/en-us/graph/json-batching batch_size = 20 metadata = {} + + # request the page content for a batch of 20 pages for i in range(0, len(data), batch_size): + # Create a dict using enumerate to index each item in the batch, to later correlate the result with the original data batch = dict(enumerate(data[i : i + batch_size])) batch_endpoint: str = "https://graph.microsoft.com/beta/$batch" + + # set-up the requests to be made body = { "requests": [ { @@ -273,6 +279,9 @@ def _download_pages_and_extract_metadata( batch_response = requests.post( url=batch_endpoint, json=body, headers=self._authorization_headers ) + + # the result should contain results for all pages. + # If something went wrong, this is indicated in the response per page for response in batch_response.json()["responses"]: try: file_metadata = self._extract_page( @@ -285,20 +294,23 @@ def _download_pages_and_extract_metadata( pass return metadata - def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str]]: + def _extract_page( + self, item: Dict[str, Any], response: Dict[str, Any], download_dir: str + ) -> Dict[str, Dict[str, str]]: """ Retrieves the HTML content of the SharePoint page referenced by the 'item' argument from the Microsoft Graph batch response. Stores the content as an .html file in the download_dir. Args: item (Dict[str, Any]): a sharepoint item that contains - the fields 'id', 'name' and 'webUrl' + the fields 'id', 'name' and 'webUrl'. response (Dict[str, Any]): A single Microsoft Graph response from a batch request. Expected to be correlated with the given item. download_dir (str): A directory to download the file to. Returns: - The metadata of the item + Dict[str, Dict[str, str]]: The file_name of the page stored in the download_dir as key + and the metadata of the page (item) as value """ file_name = item["name"].replace(".aspx", ".html") metadata = {} @@ -331,7 +343,9 @@ def _extract_page(self, item, response, download_dir) -> Dict[str, Dict[str, str f"status: {response['status']}, body: {response['body']['error']}" ) - def _get_results_with_odatanext(self, request: str, **kwargs): + def _get_results_with_odatanext( + self, request: str, **kwargs + ) -> List[Dict[str, Any]]: """ Given a request, checks if the result contains `@odata.nextLink` in the result. If true, this function returns itself calling the @odata.nextLink. 
@@ -341,7 +355,7 @@ def _get_results_with_odatanext(self, request: str, **kwargs): request (str): A GET request to be made, that might include a field '@odata.nextLink' Returns: - Dict[str, str]: A dictionary containing the metadata of the pages to be extracted + List[Dict[str, Any]]: A List with containing the metadata in Dict[str, Any] form of the pages to be extracted """ if "prev_responses" not in kwargs.keys(): prev_responses = [] @@ -413,7 +427,7 @@ def _download_file( self, item: Dict[str, Any], download_dir: str, - ): + ) -> Dict[str, Dict[str, str]]: """ Downloads a file to the temporary download folder and returns its metadata. @@ -439,7 +453,7 @@ def _download_files_from_sharepoint( sharepoint_folder_path: str, recursive: bool, file_types: List[str], - ) -> Dict[str, str]: + ) -> Dict[str, Dict[str, str]]: """ Downloads files from the specified folder and returns the metadata for the downloaded files. @@ -451,7 +465,8 @@ def _download_files_from_sharepoint( file_types: (List[str]): A set of file types to load. If empty, loads all file types. Returns: - Dict[str, str]: A dictionary containing the metadata of the downloaded files. + Dict[str,Dict[str, str]]: A dictionary containing file_name of the stored file + as key and the metadata of the downloaded files as value. """ From 2901c8c5a4db258510222189b7fec7f31908933c Mon Sep 17 00:00:00 2001 From: levi Date: Thu, 8 Feb 2024 16:36:13 +0100 Subject: [PATCH 09/14] fix typo --- llama_hub/microsoft_sharepoint/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index a1be48d89e..379710d6c8 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -556,7 +556,7 @@ def load_data( ) if "documents" not in include and (recursive or file_types): logger.warning( - "'documents' is not in 'included', so 'recursive' and 'file_types' have no effect." + "'documents' is not in 'include', so 'recursive' and 'file_types' have no effect." ) try: with tempfile.TemporaryDirectory() as temp_dir: From 0e01fa2aa50cfaa5a98c7367abb347eeb331f940 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 11:33:05 +0100 Subject: [PATCH 10/14] Updated ReadMe --- llama_hub/microsoft_sharepoint/README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 64eec3fdb7..ccba57e731 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -25,7 +25,6 @@ This loader can: - Load all files present in the drive of a SharePoint - Load all pages under a SharePoint site -This loader loads the files present in a specific folder in sharepoint. 
If the files are present in the `Test` folder in SharePoint Site under `root` directory, then the input for the loader for `file_path` is `Test` From aca4c9e4f7695fd302fa2d86a7c39c6a5011eb79 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 11:46:08 +0100 Subject: [PATCH 11/14] Updated docstring _get_site_id_with_host_name --- llama_hub/microsoft_sharepoint/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index 379710d6c8..6c596cc883 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -87,7 +87,6 @@ def _get_site_id_with_host_name(self, sharepoint_site_name: str) -> str: Retrieves the site ID of a SharePoint site using the provided site name. Args: - access_token (str): access_token sharepoint_site_name (str): The name of the SharePoint site. Returns: From 8647de0883ec05ae3c5a6d18a035b405975f3c71 Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 12:40:11 +0100 Subject: [PATCH 12/14] changed default sharepoint_folder_path argument to "" instead of root --- llama_hub/microsoft_sharepoint/base.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/base.py b/llama_hub/microsoft_sharepoint/base.py index e342d69b9e..6c1a0711a5 100644 --- a/llama_hub/microsoft_sharepoint/base.py +++ b/llama_hub/microsoft_sharepoint/base.py @@ -168,12 +168,10 @@ def _get_sharepoint_folder_id(self, folder_path: str) -> str: Returns: str: The ID of the SharePoint site folder. """ - if folder_path == "root": - folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" - else: - folder_id_endpoint = ( - f"{self._drive_id_endpoint}/{self._drive_id}/root:/{folder_path}" - ) + folder_id_endpoint = f"{self._drive_id_endpoint}/{self._drive_id}/root" + + if folder_path: + folder_id_endpoint += f":/{folder_path}" response = requests.get( url=folder_id_endpoint, @@ -530,7 +528,7 @@ def get_metadata(filename: str) -> Any: def load_data( self, sharepoint_site_name: str, - sharepoint_folder_path: str = "root", + sharepoint_folder_path: str = "", recursive: bool = False, include: List[str] = ["documents", "pages"], file_types: List[str] = [], @@ -541,7 +539,7 @@ def load_data( Args: sharepoint_site_name (str): The name of the SharePoint site. sharepoint_folder_path (str): The path of the folder in the SharePoint site. - If `root`, loads data from the root folder of the + If `""` (default), loads data from the root folder of the SharePoint site. recursive (bool): If True, files from all subfolders are downloaded. include (List[str]): list of Sharepoint objects to include. From 9f1875a43d674288997e972374068162e7b80afa Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 12:40:31 +0100 Subject: [PATCH 13/14] Updated ReadMe with more explicit instructions on how to use parameters, and what the default parameters are. 
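The README now spells out the defaults: `sharepoint_folder_path=""` (the root of
the drive), `recursive=False`, `include=['documents', 'pages']`, and
`file_types=[]` (all file types). A minimal sketch that relies on those defaults
(credentials and site name are placeholders):

    from llama_index import download_loader

    SharePointLoader = download_loader("SharePointReader")

    loader = SharePointLoader(
        client_id="<client id>",
        client_secret="<client secret>",
        tenant_id="<tenant id>",
    )

    # With the defaults this loads the documents in the root folder of the
    # drive (no subfolders) as well as all pages of the site.
    documents = loader.load_data(sharepoint_site_name="<site name>")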
--- llama_hub/microsoft_sharepoint/README.md | 44 ++++++++++++++++++------ 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index ccba57e731..09a5eb5a56 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -30,7 +30,13 @@ If the files are present in the `Test` folder in SharePoint Site under `root` di ![FilePath](file_path_info.png) -### Example loading a single folder +### Example loading all files and pages +If `sharepoint_folder_path` is not provided it defaults to `""`. +In that case, the root folder of the SharePoint Drive is used as the folder to load files from. + +If both `sharepoint_folder_path` is not provided and `recursive` is set to `True`, all files in the SharePoint Drive are loaded. +If `recursive` is not provided, it defaults to `False`. In this case, files from subfolders are not loaded. + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -42,13 +48,20 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", - sharepoint_folder_path: "", + sharepoint_site_name = "", recursive = True, ) ``` -### Example loading all files and pages +### Example loading a single folder +To load a single folder, specify the `sharepoint_folder_path` with the name of the folder or path from the root directory. +Example: `sharepoint_folder_path = "my/folder/path"` + +In order to load only the documents from this `sharepoint_folder_path`, and not the pages for the `sharepoint_site_name`, +you need to provide the `include` argument as `['documents]`. By default, `include` is equal to `['documents', 'pages']`. + +If you do not want to include files from subfolders for the given `sharepoint_folder_path`, remove the argument `recursive` (defaults to `False`). + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -60,13 +73,21 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", + sharepoint_folder_path = "", recursive = True, - include= ['pages', 'documents'] + include = ['documents'] ) ``` + + ### Example loading just pages +In order to load only the pages for the `sharepoint_site_name`, +you need to provide the `include` argument as `['pages]`. By default, `include` is equal to `['documents', 'pages']`. + +Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in `include`. + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -78,8 +99,7 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", - recursive = True, + sharepoint_site_name = "", include = ['pages'] ) ``` @@ -96,13 +116,17 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", recursive = True, include = ['documents'] ) ``` ### Example loading just documents with filetype .docx or .pdf + +If you want to only load specific filetypes, provide the file extension names in `file_types`. 
+Example: to only include .pdf and .docx files, set `file_types` to `['docx', 'pdf']` + ```python from llama_index import download_loader SharePointLoader = download_loader("SharePointReader") @@ -114,7 +138,7 @@ loader = SharePointLoader( ) documents = loader.load_data( - sharepoint_site_name: "", + sharepoint_site_name = "", recursive = True, include = ['documents'], file_types = ['docx', 'pdf'] From ba66cee730f0939b23945c4beed24a2de19ee51f Mon Sep 17 00:00:00 2001 From: levi Date: Tue, 13 Feb 2024 13:12:55 +0100 Subject: [PATCH 14/14] fix typo --- llama_hub/microsoft_sharepoint/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama_hub/microsoft_sharepoint/README.md b/llama_hub/microsoft_sharepoint/README.md index 09a5eb5a56..bc07373a6b 100644 --- a/llama_hub/microsoft_sharepoint/README.md +++ b/llama_hub/microsoft_sharepoint/README.md @@ -55,10 +55,11 @@ documents = loader.load_data( ### Example loading a single folder To load a single folder, specify the `sharepoint_folder_path` with the name of the folder or path from the root directory. + Example: `sharepoint_folder_path = "my/folder/path"` In order to load only the documents from this `sharepoint_folder_path`, and not the pages for the `sharepoint_site_name`, -you need to provide the `include` argument as `['documents]`. By default, `include` is equal to `['documents', 'pages']`. +you need to provide the `include` argument as `['documents']`. By default, `include` is equal to `['documents', 'pages']`. If you do not want to include files from subfolders for the given `sharepoint_folder_path`, remove the argument `recursive` (defaults to `False`). @@ -84,9 +85,9 @@ documents = loader.load_data( ### Example loading just pages In order to load only the pages for the `sharepoint_site_name`, -you need to provide the `include` argument as `['pages]`. By default, `include` is equal to `['documents', 'pages']`. +you need to provide the `include` argument as `['pages']`. By default, `include` is equal to `['documents', 'pages']`. -Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in `include`. +Note: `recursive` and `sharepoint_folder_path` arguments have no effect if `documents` is not in the list of the argument `include`. ```python from llama_index import download_loader