diff --git a/.gitignore b/.gitignore index 28a68db..47e6f88 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,6 @@ cython_debug/ archive/ reddit/ *.md -reddit_acvhive* \ No newline at end of file +*.json +reddit_acvhive* +file_log.json \ No newline at end of file diff --git a/README.md b/README.md index 33ff9d4..cc3304f 100644 --- a/README.md +++ b/README.md @@ -43,6 +43,8 @@ After adding all secrets: ![Repository Secrets](resources/repositiory_secrets.pn 3. **Manually Trigger the Workflow**: - Go to the **Actions** tab > Select the **Reddit Stash Workflow** from the list on the left > Click **Run workflow** > Select the branch `main` > Click the green **Run workflow** button. The workflow will then be triggered, and you can monitor its progress in the Actions tab. Upon successful completion, you should see the Reddit folder in your Dropbox. +4. The workflow also runs automatically at midnight CET. + #### Local Installation 1. **Clone this repository**: @@ -131,6 +133,7 @@ The `settings.ini` file in the root directory of the project allows you to confi save_directory = reddit/ # your system save directory dropbox_directory = /reddit # your dropbox directory save_type = ALL # Options: 'ALL' to save all activity, 'SAVED' to save only saved posts/comments +check_type = LOG # Options: 'LOG' to verify file existence using the log file, 'DIR' to verify file existence by scanning the downloaded directory. [Configuration] client_id = None # Can be set here or via environment variables @@ -138,12 +141,14 @@ client_secret = None # Can be set here or via environment variables username = None # Can be set here or via environment variables password = None # Can be set here or via environment variables ``` -save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in. -dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in. -save_type: Determines what user activity is saved, accepts these two values: -* `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with it's context. -* `SAVED`: Saves only the posts and comments the user has saved on Reddit with it's context. - +* save_directory: Specifies the directory where the Reddit content will be saved; change it to the location you prefer. +* dropbox_directory: Specifies the Dropbox folder where the Reddit content will be uploaded; change it to the location you prefer. +* save_type: Determines which user activity is saved; accepts one of two values: + * `ALL`: Saves all posts and comments made by the user, along with the user's saved posts and comments and their context. + * `SAVED`: Saves only the posts and comments the user has saved on Reddit, with their context. +* check_type: Determines whether file existence is checked against the log file or the downloaded directory: + * `LOG`: Uses only the log file to check file existence; faster. Recommended for the GitHub Actions setup. + * `DIR`: Scans the saved/downloaded directory to check file existence; slower. Recommended for local setups. Note: You can still use environment variables as a fallback or override for the Reddit API credentials if they are not set in the settings.ini file.
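As a point of reference for the new `LOG` mode: a minimal sketch of how the `file_log.json` introduced by this patch is keyed and queried, based on `utils/log_utils.py` further down in the diff. The id `abc123` and subreddit `python` are made-up examples, not values taken from the patch.

```python
# Illustrative only: the LOG-based existence check keys entries as "<id>-<subreddit>".
from utils.log_utils import load_file_log, is_file_logged

file_log = load_file_log("reddit/")  # reads reddit/file_log.json if present, else returns {}

# Each entry stores the subreddit, the content type, and a path relative to the
# save directory, e.g.:
# "abc123-python": {"subreddit": "python", "type": "Submission", "file_path": "python/POST_abc123.md"}
print(is_file_logged(file_log, "abc123-python"))
```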
#### Setting Up Reddit Environment Variables diff --git a/dropbox_utils.py b/dropbox_utils.py index 94b8343..290edd6 100644 --- a/dropbox_utils.py +++ b/dropbox_utils.py @@ -3,10 +3,68 @@ import sys import dropbox import requests +import hashlib import configparser +from dropbox.exceptions import ApiError +from dropbox.files import FileMetadata + # Import the validate_and_set_directory function from utils from utils.file_path_validate import validate_and_set_directory + +class DropboxContentHasher: + """Implements Dropbox content hashing as per the provided reference code.""" + + BLOCK_SIZE = 4 * 1024 * 1024 + + def __init__(self): + self._overall_hasher = hashlib.sha256() + self._block_hasher = hashlib.sha256() + self._block_pos = 0 + + self.digest_size = self._overall_hasher.digest_size + + def update(self, new_data): + if self._overall_hasher is None: + raise AssertionError( + "can't use this object anymore; you already called digest()") + + assert isinstance(new_data, bytes), ( + "Expecting a byte string, got {!r}".format(new_data)) + + new_data_pos = 0 + while new_data_pos < len(new_data): + if self._block_pos == self.BLOCK_SIZE: + self._overall_hasher.update(self._block_hasher.digest()) + self._block_hasher = hashlib.sha256() + self._block_pos = 0 + + space_in_block = self.BLOCK_SIZE - self._block_pos + part = new_data[new_data_pos:(new_data_pos+space_in_block)] + self._block_hasher.update(part) + + self._block_pos += len(part) + new_data_pos += len(part) + + def _finish(self): + if self._overall_hasher is None: + raise AssertionError( + "can't use this object anymore; you already called digest() or hexdigest()") + + if self._block_pos > 0: + self._overall_hasher.update(self._block_hasher.digest()) + self._block_hasher = None + h = self._overall_hasher + self._overall_hasher = None # Make sure we can't use this object anymore. 
+ return h + + def digest(self): + return self._finish().digest() + + def hexdigest(self): + return self._finish().hexdigest() + + def refresh_dropbox_token(): refresh_token = os.getenv('DROPBOX_REFRESH_TOKEN') client_id = os.getenv('DROPBOX_APP_KEY') @@ -41,6 +99,9 @@ def refresh_dropbox_token(): # Fetch the dropbox_folder from the settings.ini file with a fallback dropbox_folder = config_parser.get('Settings', 'dropbox_directory', fallback='/reddit') +# Fetch the check_type from the settings.ini file with a fallback +check_type = config_parser.get('Settings', 'check_type', fallback='LOG').upper() + def sanitize_filename(filename): """Sanitize the filename to be Dropbox-compatible.""" sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename) # Also remove control characters @@ -53,28 +114,39 @@ def sanitize_filename(filename): return sanitized_name -def list_dropbox_files(dbx, dropbox_folder): - """List all files in the specified Dropbox folder.""" - file_names = set() +def calculate_local_content_hash(file_path): + """Calculate the Dropbox content hash for a local file.""" + hasher = DropboxContentHasher() + with open(file_path, 'rb') as f: + while True: + chunk = f.read(1024 * 1024) + if len(chunk) == 0: + break + hasher.update(chunk) + return hasher.hexdigest() + +def list_dropbox_files_with_hashes(dbx, dropbox_folder): + """List all files in the specified Dropbox folder along with their content hashes.""" + file_metadata = {} try: result = dbx.files_list_folder(dropbox_folder, recursive=True) while True: for entry in result.entries: - if isinstance(entry, dropbox.files.FileMetadata): - file_names.add(entry.path_lower) + if isinstance(entry, FileMetadata): + file_metadata[entry.path_lower] = entry.content_hash if not result.has_more: break result = dbx.files_list_folder_continue(result.cursor) - except dropbox.exceptions.ApiError as err: + except ApiError as err: print(f"Failed to list files in Dropbox folder {dropbox_folder}: {err}") - return file_names + return file_metadata def upload_directory_to_dropbox(local_directory, dropbox_folder="/"): - """Uploads all files in the specified local directory to Dropbox without overwriting.""" + """Uploads all files in the specified local directory to Dropbox, replacing only changed files.""" dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN')) - # List all files currently in the Dropbox folder - existing_files = list_dropbox_files(dbx, dropbox_folder) + # List all files currently in the Dropbox folder along with their content hashes + dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder) uploaded_count = 0 uploaded_size = 0 @@ -93,54 +165,70 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"): # Adjust for sanitized name dropbox_path = dropbox_path.replace(file_name, sanitized_name) - if dropbox_path.lower() in existing_files: + local_content_hash = calculate_local_content_hash(file_path) + + # Check if the file exists and is the same on Dropbox + if dropbox_path.lower() in dropbox_files and dropbox_files[dropbox_path.lower()] == local_content_hash: skipped_count += 1 continue + # Upload the file since it doesn't exist or has changed try: with open(file_path, "rb") as f: file_size = os.path.getsize(file_path) - dbx.files_upload(f.read(), dropbox_path) + dbx.files_upload(f.read(), dropbox_path, mode=dropbox.files.WriteMode.overwrite) uploaded_count += 1 uploaded_size += file_size - except dropbox.exceptions.ApiError as e: + except ApiError as e: print(f"Failed to upload {file_path} to Dropbox: {e}") 
print(f"Upload completed. {uploaded_count} files uploaded ({uploaded_size / (1024 * 1024):.2f} MB).") - print(f"{skipped_count} files were skipped (already existed).") + print(f"{skipped_count} files were skipped (already existed or unchanged).") def download_directory_from_dropbox(dbx, dropbox_folder, local_directory): - """Downloads all files in the specified Dropbox folder to the local directory.""" + """Downloads all files in the specified Dropbox folder to the local directory, replacing only changed files.""" downloaded_count = 0 downloaded_size = 0 skipped_count = 0 + # List all files currently in the Dropbox folder along with their content hashes + dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder) + try: - result = dbx.files_list_folder(dropbox_folder, recursive=True) - while True: - for entry in result.entries: - if isinstance(entry, dropbox.files.FileMetadata): - local_path = os.path.join(local_directory, entry.path_lower[len(dropbox_folder):].lstrip('/')) - - # Skip the download if the file already exists locally - if os.path.exists(local_path): - skipped_count += 1 - continue - - os.makedirs(os.path.dirname(local_path), exist_ok=True) - with open(local_path, "wb") as f: - metadata, res = dbx.files_download(entry.path_lower) - f.write(res.content) - downloaded_count += 1 - downloaded_size += metadata.size - if not result.has_more: - break - result = dbx.files_list_folder_continue(result.cursor) - except dropbox.exceptions.ApiError as err: + for dropbox_path, dropbox_hash in dropbox_files.items(): + local_path = os.path.join(local_directory, dropbox_path[len(dropbox_folder):].lstrip('/')) + + if os.path.exists(local_path): + local_content_hash = calculate_local_content_hash(local_path) + if local_content_hash == dropbox_hash: + skipped_count += 1 + continue + + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + metadata, res = dbx.files_download(dropbox_path) + f.write(res.content) + downloaded_count += 1 + downloaded_size += metadata.size + except ApiError as err: print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}") print(f"Download completed. {downloaded_count} files downloaded ({downloaded_size / (1024 * 1024):.2f} MB).") - print(f"{skipped_count} files were skipped (i.e. 
they already existed).") + print(f"{skipped_count} files were skipped (already existed or unchanged).") + +def download_log_file_from_dropbox(dbx, dropbox_folder, local_directory): + """Download only the log file from Dropbox.""" + log_file_path = os.path.join(local_directory, 'file_log.json') + + try: + # Download the log file + metadata, res = dbx.files_download(f"{dropbox_folder}/file_log.json") + os.makedirs(os.path.dirname(log_file_path), exist_ok=True) + with open(log_file_path, "wb") as f: + f.write(res.content) + print(f"Log file downloaded successfully to {log_file_path}.") + except ApiError as err: + print(f"Failed to download the log file from Dropbox: {err}") if __name__ == "__main__": # Refresh the access token because it expires @@ -148,6 +236,13 @@ def download_directory_from_dropbox(dbx, dropbox_folder, local_directory): dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN')) if '--download' in sys.argv: - download_directory_from_dropbox(dbx, dropbox_folder, local_dir) + if check_type == 'LOG': + print("Downloading only the log file as check_type is LOG.") + download_log_file_from_dropbox(dbx, dropbox_folder, local_dir) + elif check_type == 'DIR': + print("Downloading the entire directory as check_type is DIR.") + download_directory_from_dropbox(dbx, dropbox_folder, local_dir) + else: + raise ValueError(f"Unknown check_type: {check_type}") elif '--upload' in sys.argv: upload_directory_to_dropbox(local_dir, dropbox_folder) \ No newline at end of file diff --git a/reddit_stash.py b/reddit_stash.py index b9dd188..5ffa903 100644 --- a/reddit_stash.py +++ b/reddit_stash.py @@ -3,6 +3,7 @@ from utils.file_path_validate import validate_and_set_directory from utils.file_operations import save_user_activity from utils.env_config import load_config_and_env +from utils.log_utils import load_file_log, log_file, is_file_logged # Load configuration config_parser = configparser.ConfigParser() @@ -27,8 +28,11 @@ ) if __name__ == "__main__": + # Load the log file from the save directory + file_log = load_file_log(save_directory) + # Process user activity (submissions, comments, and saved items) and get statistics - processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory) + processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory, file_log) # Print final statistics of processing print(f"Processing completed. 
{processed_count} items processed, {skipped_count} items skipped.") diff --git a/settings.ini b/settings.ini index f111dd8..ea1d736 100644 --- a/settings.ini +++ b/settings.ini @@ -2,6 +2,7 @@ save_directory = reddit/ dropbox_directory = /reddit save_type = ALL +check_type = LOG [Configuration] client_id = None diff --git a/utils/file_operations.py b/utils/file_operations.py index 68e6999..23be5ef 100644 --- a/utils/file_operations.py +++ b/utils/file_operations.py @@ -2,10 +2,10 @@ import time import configparser from tqdm import tqdm -from datetime import datetime - -from praw.models import Submission, Comment -from utils.time_utilities import dynamic_sleep, lazy_load_comments +from praw.models import Submission, Comment # Import Submission and Comment +from utils.log_utils import log_file, save_file_log +from utils.save_utils import save_submission, save_comment_and_context # Import common functions +from utils.time_utilities import dynamic_sleep # Dynamically determine the path to the root directory BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) @@ -17,18 +17,7 @@ config = configparser.ConfigParser() config.read(config_path) save_type = config.get('Settings', 'save_type', fallback='ALL').upper() - -def format_date(timestamp): - """Format a UTC timestamp into a human-readable date.""" - return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S') - -def extract_video_id(url): - """Extract the video ID from a YouTube URL.""" - if "youtube.com" in url: - return url.split("v=")[-1] - elif "youtu.be" in url: - return url.split("/")[-1] - return None +check_type = config.get('Settings', 'check_type', fallback='LOG').upper() def create_directory(subreddit_name, save_directory, created_dirs_cache): """Create the directory for saving data if it does not exist.""" @@ -38,105 +27,82 @@ def create_directory(subreddit_name, save_directory, created_dirs_cache): created_dirs_cache.add(sub_dir) return sub_dir -def get_existing_files(save_directory): - """Build a set of all existing files in the save directory.""" +def get_existing_files_from_log(file_log): + """Return a set of unique keys (subreddit + id) based on the JSON log.""" + existing_files = set(file_log.keys()) + return existing_files + +def get_existing_files_from_dir(save_directory): + """Build a set of all existing files in the save directory using os.walk.""" existing_files = set() for root, dirs, files in os.walk(save_directory): for file in files: - existing_files.add(os.path.join(root, file)) + # Extract the unique key format (id-subreddit) from the file path + filename = os.path.splitext(file)[0] + subreddit_name = os.path.basename(root) + if filename.startswith("POST_"): + file_id = filename.split("POST_")[1] + elif filename.startswith("COMMENT_"): + file_id = filename.split("COMMENT_")[1] + elif filename.startswith("SAVED_POST_"): + file_id = filename.split("SAVED_POST_")[1] + elif filename.startswith("SAVED_COMMENT_"): + file_id = filename.split("SAVED_COMMENT_")[1] + else: + continue + unique_key = f"{file_id}-{subreddit_name}" + existing_files.add(unique_key) return existing_files -def save_to_file(content, file_path, save_function, existing_files): +def save_to_file(content, file_path, save_function, existing_files, file_log, save_directory, created_dirs_cache): """Save content to a file using the specified save function.""" - if file_path in existing_files: - # File already exists, skip saving + file_id = content.id # Assuming `id` is unique for each Reddit content + subreddit_name = 
content.subreddit.display_name # Get the subreddit name + + # Create the unique key + unique_key = f"{file_id}-{subreddit_name}" + + # If the file is already logged or exists in the directory, skip saving + if unique_key in existing_files: return True # Indicate that the file already exists and no saving was performed + + # Ensure the subreddit directory exists only if we're about to save something new + sub_dir = os.path.join(save_directory, subreddit_name) + if sub_dir not in created_dirs_cache: + os.makedirs(sub_dir, exist_ok=True) + created_dirs_cache.add(sub_dir) + + # Proceed with saving the file try: with open(file_path, 'w', encoding="utf-8") as f: save_function(content, f) + + # Log the file after saving successfully with the unique key + log_file(file_log, file_id, { + 'subreddit': subreddit_name, + 'type': type(content).__name__, + 'file_path': file_path # This will be converted to relative in log_file + }, save_directory) + return False # Indicate that the file was saved successfully except Exception as e: print(f"Failed to save {file_path}: {e}") return False # Indicate that the file could not be saved -def save_submission(submission, f): - """Save a submission and its metadata.""" - f.write('---\n') # Start of frontmatter - f.write(f'id: {submission.id}\n') - f.write(f'subreddit: /r/{submission.subreddit.display_name}\n') - f.write(f'timestamp: {format_date(submission.created_utc)}\n') - f.write(f'author: /u/{submission.author.name if submission.author else "[deleted]"}\n') - - if submission.link_flair_text: # Check if flair exists and is not None - f.write(f'flair: {submission.link_flair_text}\n') - - f.write(f'comments: {submission.num_comments}\n') - f.write(f'permalink: https://reddit.com{submission.permalink}\n') - f.write('---\n\n') # End of frontmatter - f.write(f'# {submission.title}\n\n') - f.write(f'**Upvotes:** {submission.score} | **Permalink:** [Link](https://reddit.com{submission.permalink})\n\n') - - if submission.is_self: - f.write(submission.selftext if submission.selftext else '[Deleted Post]') - else: - if submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')): - f.write(f"![Image]({submission.url})") - elif "youtube.com" in submission.url or "youtu.be" in submission.url: - video_id = extract_video_id(submission.url) - f.write(f"[![Video](https://img.youtube.com/vi/{video_id}/0.jpg)]({submission.url})") - else: - f.write(submission.url if submission.url else '[Deleted Post]') - - f.write('\n\n## Comments:\n\n') - lazy_comments = lazy_load_comments(submission) - process_comments(lazy_comments, f) - -def save_comment_and_context(comment, f): - """Save a comment and its context.""" - f.write('---\n') # Start of frontmatter - f.write(f'Comment by /u/{comment.author.name if comment.author else "[deleted]"}\n') - f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') - f.write(f'{comment.body}\n\n') - f.write('---\n\n') # End of frontmatter - - parent = comment.parent() - if isinstance(parent, Submission): - f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n') - f.write(f'- **Title:** {parent.title}\n') - f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n') - if parent.is_self: - f.write(f'{parent.selftext}\n\n') - else: - f.write(f'[Link to post content]({parent.url})\n\n') - elif isinstance(parent, Comment): - f.write(f'## Context: Parent Comment by /u/{parent.author.name if parent.author else "[deleted]"}\n') - 
f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n') - f.write(f'{parent.body}\n\n') - -def process_comments(comments, f, depth=0, simple_format=False): - """Process all comments and visualize depth using indentation.""" - for i, comment in enumerate(comments): - if isinstance(comment, Comment): - indent = ' ' * depth - f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n') - f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') - f.write(f'{indent}{comment.body}\n\n') - - if not simple_format and comment.replies: - process_comments(comment.replies, f, depth + 1) - - f.write(f'{indent}---\n\n') - -def save_user_activity(reddit, save_directory): +def save_user_activity(reddit, save_directory, file_log): """Save user's posts, comments, and saved items.""" user = reddit.user.me() - # Retrieve all necessary data - submissions = list(user.submissions.new(limit=1000)) - comments = list(user.comments.new(limit=1000)) - saved_items = list(user.saved(limit=1000)) + # Determine how to check for existing files based on check_type + if check_type == 'LOG': + print("Check type is LOG. Using JSON log to find existing files.") + existing_files = get_existing_files_from_log(file_log) + elif check_type == 'DIR': + print("Check type is DIR. Using directory scan to find existing files.") + existing_files = get_existing_files_from_dir(save_directory) + else: + raise ValueError(f"Unknown check_type: {check_type}") - existing_files = get_existing_files(save_directory) created_dirs_cache = set() processed_count = 0 # Counter for processed items @@ -145,27 +111,31 @@ def save_user_activity(reddit, save_directory): if save_type == 'ALL': processed_count, skipped_count, total_size = save_all_user_activity( - submissions, comments, saved_items, save_directory, existing_files, - created_dirs_cache, processed_count, skipped_count, total_size + list(user.submissions.new(limit=1000)), + list(user.comments.new(limit=1000)), + save_directory, existing_files, created_dirs_cache, + processed_count, skipped_count, total_size, file_log ) processed_count, skipped_count, total_size = save_saved_user_activity( - saved_items, save_directory, existing_files, created_dirs_cache, - processed_count, skipped_count, total_size + list(user.saved(limit=1000)), save_directory, existing_files, + created_dirs_cache, processed_count, skipped_count, total_size, file_log ) elif save_type == 'SAVED': processed_count, skipped_count, total_size = save_saved_user_activity( - saved_items, save_directory, existing_files, created_dirs_cache, - processed_count, skipped_count, total_size + list(user.saved(limit=1000)), save_directory, existing_files, + created_dirs_cache, processed_count, skipped_count, total_size, file_log ) + # Save the updated file log + save_file_log(file_log, save_directory) + return processed_count, skipped_count, total_size -def save_all_user_activity(submissions, comments, saved_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size): +def save_all_user_activity(submissions, comments, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log): """Save all user posts and comments.""" for submission in tqdm(submissions, desc="Processing Submissions"): - sub_dir = create_directory(submission.subreddit.display_name, save_directory, created_dirs_cache) - file_path = 
os.path.join(sub_dir, f"POST_{submission.id}.md") - if save_to_file(submission, file_path, save_submission, existing_files): + file_path = os.path.join(save_directory, submission.subreddit.display_name, f"POST_{submission.id}.md") + if save_to_file(submission, file_path, save_submission, existing_files, file_log, save_directory, created_dirs_cache): skipped_count += 1 # Increment skipped count if the file already exists continue # Skip further processing if the file already exists @@ -173,9 +143,8 @@ def save_all_user_activity(submissions, comments, saved_items, save_directory, e total_size += os.path.getsize(file_path) # Accumulate total size of processed files for comment in tqdm(comments, desc="Processing Comments"): - sub_dir = create_directory(comment.subreddit.display_name, save_directory, created_dirs_cache) - file_path = os.path.join(sub_dir, f"COMMENT_{comment.id}.md") - if save_to_file(comment, file_path, save_comment_and_context, existing_files): + file_path = os.path.join(save_directory, comment.subreddit.display_name, f"COMMENT_{comment.id}.md") + if save_to_file(comment, file_path, save_comment_and_context, existing_files, file_log, save_directory, created_dirs_cache): skipped_count += 1 # Increment skipped count if the file already exists continue # Skip further processing if the file already exists @@ -185,20 +154,17 @@ def save_all_user_activity(submissions, comments, saved_items, save_directory, e return processed_count, skipped_count, total_size - -def save_saved_user_activity(saved_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size): +def save_saved_user_activity(saved_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log): """Save only saved user posts and comments.""" for item in tqdm(saved_items, desc="Processing Saved Items"): if isinstance(item, Submission): - sub_dir = create_directory(item.subreddit.display_name, save_directory, created_dirs_cache) - file_path = os.path.join(sub_dir, f"SAVED_POST_{item.id}.md") - if save_to_file(item, file_path, save_submission, existing_files): + file_path = os.path.join(save_directory, item.subreddit.display_name, f"SAVED_POST_{item.id}.md") + if save_to_file(item, file_path, save_submission, existing_files, file_log, save_directory, created_dirs_cache): skipped_count += 1 # Increment skipped count if the file already exists continue # Skip further processing if the file already exists elif isinstance(item, Comment): - sub_dir = create_directory(item.subreddit.display_name, save_directory, created_dirs_cache) - file_path = os.path.join(sub_dir, f"SAVED_COMMENT_{item.id}.md") - if save_to_file(item, file_path, save_comment_and_context, existing_files): + file_path = os.path.join(save_directory, item.subreddit.display_name, f"SAVED_COMMENT_{item.id}.md") + if save_to_file(item, file_path, save_comment_and_context, existing_files, file_log, save_directory, created_dirs_cache): skipped_count += 1 # Increment skipped count if the file already exists continue # Skip further processing if the file already exists time.sleep(dynamic_sleep(len(item.body))) diff --git a/utils/log_utils.py b/utils/log_utils.py new file mode 100644 index 0000000..f480b99 --- /dev/null +++ b/utils/log_utils.py @@ -0,0 +1,45 @@ +import os +import json + +def get_log_file_path(save_directory): + """Return the path to the log file inside the save_directory.""" + return os.path.join(save_directory, 'file_log.json') + +def load_file_log(save_directory): + 
"""Load the file log from a JSON file in the specified directory.""" + log_file_path = get_log_file_path(save_directory) + if os.path.exists(log_file_path): + with open(log_file_path, 'r') as f: + return json.load(f) + return {} + +def save_file_log(log_data, save_directory): + """Save the file log to a JSON file in the specified directory.""" + log_file_path = get_log_file_path(save_directory) + with open(log_file_path, 'w') as f: + json.dump(log_data, f, indent=4) + +def is_file_logged(log_data, unique_key): + """Check if a unique key is already logged.""" + return unique_key in log_data + +def log_file(log_data, file_id, file_info, save_directory): + """Add a file ID and its information to the log.""" + # Create a unique key by combining file_id and subreddit + unique_key = f"{file_id}-{file_info['subreddit']}" + + # Convert the absolute file path to a relative one + relative_file_path = os.path.relpath(file_info['file_path'], start=save_directory) + + # Update the file info with the relative path + file_info['file_path'] = relative_file_path + + # Add the file info to the log with the unique key + log_data[unique_key] = file_info + + # Save the updated log + save_file_log(log_data, save_directory) + +def convert_to_absolute_path(relative_path, save_directory): + """Convert a relative path from the log back to an absolute path.""" + return os.path.join(save_directory, relative_path) \ No newline at end of file diff --git a/utils/save_utils.py b/utils/save_utils.py new file mode 100644 index 0000000..60fff44 --- /dev/null +++ b/utils/save_utils.py @@ -0,0 +1,84 @@ +import os +from datetime import datetime +from praw.models import Submission, Comment +from utils.time_utilities import lazy_load_comments + +def format_date(timestamp): + """Format a UTC timestamp into a human-readable date.""" + return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S') + +def extract_video_id(url): + """Extract the video ID from a YouTube URL.""" + if "youtube.com" in url: + return url.split("v=")[-1] + elif "youtu.be" in url: + return url.split("/")[-1] + return None + +def save_submission(submission, f): + """Save a submission and its metadata.""" + f.write('---\n') # Start of frontmatter + f.write(f'id: {submission.id}\n') + f.write(f'subreddit: /r/{submission.subreddit.display_name}\n') + f.write(f'timestamp: {format_date(submission.created_utc)}\n') + f.write(f'author: /u/{submission.author.name if submission.author else "[deleted]"}\n') + + if submission.link_flair_text: # Check if flair exists and is not None + f.write(f'flair: {submission.link_flair_text}\n') + + f.write(f'comments: {submission.num_comments}\n') + f.write(f'permalink: https://reddit.com{submission.permalink}\n') + f.write('---\n\n') # End of frontmatter + f.write(f'# {submission.title}\n\n') + f.write(f'**Upvotes:** {submission.score} | **Permalink:** [Link](https://reddit.com{submission.permalink})\n\n') + + if submission.is_self: + f.write(submission.selftext if submission.selftext else '[Deleted Post]') + else: + if submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')): + f.write(f"![Image]({submission.url})") + elif "youtube.com" in submission.url or "youtu.be" in submission.url: + video_id = extract_video_id(submission.url) + f.write(f"[![Video](https://img.youtube.com/vi/{video_id}/0.jpg)]({submission.url})") + else: + f.write(submission.url if submission.url else '[Deleted Post]') + + f.write('\n\n## Comments:\n\n') + lazy_comments = lazy_load_comments(submission) + process_comments(lazy_comments, f) + 
+def save_comment_and_context(comment, f): + """Save a comment and its context.""" + f.write('---\n') # Start of frontmatter + f.write(f'Comment by /u/{comment.author.name if comment.author else "[deleted]"}\n') + f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') + f.write(f'{comment.body}\n\n') + f.write('---\n\n') # End of frontmatter + + parent = comment.parent() + if isinstance(parent, Submission): + f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n') + f.write(f'- **Title:** {parent.title}\n') + f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n') + if parent.is_self: + f.write(f'{parent.selftext}\n\n') + else: + f.write(f'[Link to post content]({parent.url})\n\n') + elif isinstance(parent, Comment): + f.write(f'## Context: Parent Comment by /u/{parent.author.name if parent.author else "[deleted]"}\n') + f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n') + f.write(f'{parent.body}\n\n') + +def process_comments(comments, f, depth=0, simple_format=False): + """Process all comments and visualize depth using indentation.""" + for i, comment in enumerate(comments): + if isinstance(comment, Comment): + indent = ' ' * depth + f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n') + f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') + f.write(f'{indent}{comment.body}\n\n') + + if not simple_format and comment.replies: + process_comments(comment.replies, f, depth + 1) + + f.write(f'{indent}---\n\n') \ No newline at end of file diff --git a/utils/time_utilities.py b/utils/time_utilities.py index 78d66f5..9266f24 100644 --- a/utils/time_utilities.py +++ b/utils/time_utilities.py @@ -1,4 +1,5 @@ import time +import math import random import logging import prawcore @@ -9,15 +10,48 @@ def exponential_backoff(attempt: int) -> None: logging.info(f"Retrying in {wait_time:.2f} seconds...") time.sleep(wait_time) -def dynamic_sleep(content_length): - """Dynamically adjust sleep time based on content length.""" - base_sleep_time = 1 - sleep_time = base_sleep_time +# def dynamic_sleep(content_length): +# """Dynamically adjust sleep time based on content length.""" +# base_sleep_time = 1 +# sleep_time = base_sleep_time - if content_length > 10000: - sleep_time *= 2 - elif content_length > 5000: - sleep_time *= 1.5 +# if content_length > 10000: +# sleep_time *= 2 +# elif content_length > 5000: +# sleep_time *= 1.5 + +# return sleep_time + +def dynamic_sleep(content_length, request_failures=0, max_sleep_time=5): + """ + Dynamically adjust sleep time based on content length and other factors, + with a more conservative approach to avoid slowing down the process too much. + + :param content_length: Length of the content being processed. + :param request_failures: Number of failed requests in a row (optional). + :param max_sleep_time: Maximum sleep time allowed (optional). + :return: Sleep time in seconds. 
+ """ + base_sleep_time = 0.2 # Start with a lower base time + + # Use a very mild scaling factor + sleep_time = base_sleep_time + 0.05 * (content_length // 10000) + + # Adjust sleep time based on the number of recent request failures, but with a lower multiplier + if request_failures > 0: + sleep_time *= (1.5 ** request_failures) + + # Apply a lower cap to the sleep time + sleep_time = min(sleep_time, max_sleep_time) + + # Add a minimal jitter to avoid synchronization issues + jitter = random.uniform(0.9, 1.1) + sleep_time *= jitter + + # Logging the sleep time for monitoring and tuning + logging.info(f"Sleeping for {sleep_time:.2f} seconds based on content length {content_length} and {request_failures} failures.") + + time.sleep(sleep_time) return sleep_time