From efe74091f32871e6197e0bea7cdd52c882358951 Mon Sep 17 00:00:00 2001 From: mkupferman Date: Mon, 19 Feb 2024 18:54:41 -0500 Subject: [PATCH] Gmail API pagination (#469) * Use pagination to fetch more than 100 Gmail messages at once * Provide `paginate_messages` option to allow use of previous behavior --- docs/source/usage.md | 2 ++ parsedmarc/cli.py | 4 ++++ parsedmarc/mail/gmail.py | 34 +++++++++++++++++++++++++--------- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/docs/source/usage.md b/docs/source/usage.md index 5df5eefd..cc0e0352 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -295,6 +295,8 @@ The full set of configuration options are: (Default: `https://www.googleapis.com/auth/gmail.modify`) - `oauth2_port` - int: The TCP port for the local server to listen on for the OAuth2 response (Default: `8080`) + - `paginate_messages` - bool: When `True`, fetch all applicable Gmail messages. + When `False`, only fetch up to 100 new messages per run (Default: `True`) - `log_analytics` - `client_id` - str: The app registration's client ID - `client_secret` - str: The app registration's client secret diff --git a/parsedmarc/cli.py b/parsedmarc/cli.py index d08f4e88..d34cf6ac 100644 --- a/parsedmarc/cli.py +++ b/parsedmarc/cli.py @@ -395,6 +395,7 @@ def process_reports(reports_): gmail_api_credentials_file=None, gmail_api_token_file=None, gmail_api_include_spam_trash=False, + gmail_api_paginate_messages=True, gmail_api_scopes=[], gmail_api_oauth2_port=8080, log_file=args.log_file, @@ -829,6 +830,8 @@ def process_reports(reports_): gmail_api_config.get("token_file", ".token") opts.gmail_api_include_spam_trash = \ gmail_api_config.getboolean("include_spam_trash", False) + opts.gmail_api_paginate_messages = \ + gmail_api_config.getboolean("paginate_messages", True) opts.gmail_api_scopes = \ gmail_api_config.get("scopes", default_gmail_api_scope) @@ -1098,6 +1101,7 @@ def process_reports(reports_): token_file=opts.gmail_api_token_file, scopes=opts.gmail_api_scopes, include_spam_trash=opts.gmail_api_include_spam_trash, + paginate_messages=opts.gmail_api_paginate_messages, reports_folder=opts.mailbox_reports_folder, oauth2_port=opts.gmail_api_oauth2_port ) diff --git a/parsedmarc/mail/gmail.py b/parsedmarc/mail/gmail.py index 92ec431d..436e1f02 100644 --- a/parsedmarc/mail/gmail.py +++ b/parsedmarc/mail/gmail.py @@ -42,11 +42,13 @@ def __init__(self, scopes: List[str], include_spam_trash: bool, reports_folder: str, - oauth2_port: int): + oauth2_port: int, + paginate_messages: bool): creds = _get_creds(token_file, credentials_file, scopes, oauth2_port) self.service = build('gmail', 'v1', credentials=creds) self.include_spam_trash = include_spam_trash self.reports_label_id = self._find_label_id_for_label(reports_folder) + self.paginate_messages = paginate_messages def create_folder(self, folder_name: str): # Gmail doesn't support the name Archive @@ -65,16 +67,30 @@ def create_folder(self, folder_name: str): else: raise e + def _fetch_all_message_ids(self, reports_label_id, page_token=None): + results = ( + self.service.users() + .messages() + .list( + userId="me", + includeSpamTrash=self.include_spam_trash, + labelIds=[reports_label_id], + pageToken=page_token, + ) + .execute() + ) + messages = results.get("messages", []) + for message in messages: + yield message["id"] + + if "nextPageToken" in results and self.paginate_messages: + yield from self._fetch_all_message_ids( + reports_label_id, results["nextPageToken"] + ) + def fetch_messages(self, reports_folder: str, **kwargs) -> List[str]: reports_label_id = self._find_label_id_for_label(reports_folder) - results = self.service.users().messages()\ - .list(userId='me', - includeSpamTrash=self.include_spam_trash, - labelIds=[reports_label_id] - )\ - .execute() - messages = results.get('messages', []) - return [message['id'] for message in messages] + return [id for id in self._fetch_all_message_ids(reports_label_id)] def fetch_message(self, message_id): msg = self.service.users().messages()\