From a25fd4dda2fba70fb0b1d9cd1b2965856060e8cd Mon Sep 17 00:00:00 2001
From: Rehan Fazal
Date: Sun, 1 Sep 2024 22:07:31 +0200
Subject: [PATCH] updated the code to save the upvoted history as well and
 enhanced the context logic with the option to save images posted on reddit

---
 .gitignore               |   5 +-
 README.md                |  16 +++---
 dropbox_utils.py         |  64 +++++++++++++++---------
 reddit_stash.py          |   2 +-
 requirements.txt         |   1 +
 utils/file_operations.py | 105 +++++++++++++++++++++++++++++++--------
 utils/log_utils.py       |   6 +--
 utils/save_utils.py      |  65 ++++++++++++++++++++++--
 utils/time_utilities.py  |  12 -----
 9 files changed, 202 insertions(+), 74 deletions(-)

diff --git a/.gitignore b/.gitignore
index 47e6f88..15288a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -167,4 +167,7 @@ reddit/
 *.md
 *.json
 reddit_acvhive*
-file_log.json
\ No newline at end of file
+file_log.json
+
+#-----
+Dockerfile
diff --git a/README.md b/README.md
index 55c64ad..3c8222c 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # Reddit Stash: Automatically Save Reddit Posts and Comments to Dropbox
 
-**Reddit Stash** is a Python script designed to help you effortlessly back up your Reddit saved posts and comments to Dropbox or your local machine. Utilizing GitHub Actions, this script runs daily, automating the process of archiving your Reddit data in Dropbox after a simple setup.
+**Reddit Stash** is a Python script designed to help you effortlessly back up your Reddit **saved, posted, and upvoted** posts and comments to Dropbox or your local machine. Utilizing GitHub Actions, this script runs daily, automating the process of archiving your Reddit data in Dropbox after a simple setup.
 
 ## Key Features
 
@@ -14,7 +14,7 @@
 ### Prerequisites
 - Python 3.10
 - Reddit API credentials.
-- A Dropbox account with an API token. (Optional)
+- A Dropbox account with an API token.
 
 ### Installation
 
@@ -132,7 +132,7 @@ The `settings.ini` file in the root directory of the project allows you to configure
 [Settings]
 save_directory = reddit/ # your system save directory
 dropbox_directory = /reddit # your dropbox directory
-save_type = ALL # Options: 'ALL' to save all activity, 'SAVED' to save only saved posts/comments
+save_type = ALL # Options: 'ALL' to save all activity, 'SAVED' to save only saved posts/comments, 'ACTIVITY' to save only the user's own posts and comments, 'UPVOTED' to save only the user's upvoted posts and comments
 check_type = LOG # Options: 'LOG' to use the logging file to verify the file exisitnece, 'DIR' to verify the file exisitence based on the downloaded directory.
 
 [Configuration]
@@ -144,8 +144,10 @@ password = None # Can be set here or via environment variables
 * save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in.
 * dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in.
 * save_type: Determines what user activity is saved, accepts these two values:
-    * `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with it's context.
+    * `ALL`: Saves all posts and comments made by the user, the saved posts and comments with their context, along with the upvoted posts and comments.
     * `SAVED`: Saves only the posts and comments the user has saved on Reddit with it's context.
+    * `ACTIVITY`: Saves only the posts and comments the user has made or posted on Reddit, with their context.
+    * `UPVOTED`: Saves only the posts and comments the user has upvoted, with their context.
 * check_type : Determines if the file existence needs to be checked using the log file only or using the directory.
     * `LOG` : Uses the log file only to check the file exisitence, faster processing. Recommneded to use in the github action setup.
     * `DIR` : Uses the saved/ downloaded directory to check the file existence, slower processing. Recommended to use in the local setup.
@@ -153,7 +155,7 @@ Note: You can still use environment variables as a fallback or override for the
 #### Setting Up Reddit Environment Variables
 
-* Create a Reddit app at https://old.reddit.com/prefs/apps/
+* Create a Reddit app at https://www.reddit.com/prefs/apps or https://old.reddit.com/prefs/apps/
 * Set up the name, select `script`, and provide the `redirect_uri` as per the [PRAW docs](https://praw.readthedocs.io/en/latest/getting_started/authentication.html#password-flow).
 
 ![Step 1](resources/reddit_create_app1.png)
@@ -244,5 +246,5 @@ Feel free to open issues or submit pull requests if you have any improvements or
 - ~~The `reddit_stash.py` downloads all the file first and decides if the file is availble or not, implement early exit startegy while relevent fetching the content.~~
 
 ### New Features for Future
-
-- Saving the upvoted post and comments with context (https://www.reddit.com/prefs/feeds/), runs two/three times a day.
+- Build a Docker image to run it on a local/NAS system, etc.
+- Processing the export of a user's data from Reddit with context (not so relevant to implement, given how the repo has been built, but will look into the possibility).
\ No newline at end of file
diff --git a/dropbox_utils.py b/dropbox_utils.py
index 58ad688..33b41f2 100644
--- a/dropbox_utils.py
+++ b/dropbox_utils.py
@@ -5,6 +5,7 @@ import requests
 import hashlib
 import configparser
+from tqdm import tqdm
 from dropbox.exceptions import ApiError
 from dropbox.files import FileMetadata
 
@@ -152,12 +153,17 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
     uploaded_size = 0
     skipped_count = 0
 
-    for root, dirs, files in os.walk(local_directory):
-        for file_name in files:
-            # Skip .DS_Store and other hidden files
-            if file_name.startswith('.'):
-                continue
-
+    # Get a list of all files to upload
+    files_to_upload = [
+        (root, file_name)
+        for root, dirs, files in os.walk(local_directory)
+        for file_name in files
+        if not file_name.startswith('.') # Skip hidden files like .DS_Store
+    ]
+
+    # Initialize tqdm with the total number of files
+    with tqdm(total=len(files_to_upload), desc="Uploading files to Dropbox") as pbar:
+        for root, file_name in files_to_upload:
             sanitized_name = sanitize_filename(file_name)
             file_path = os.path.join(root, file_name)
             dropbox_path = f"{dropbox_folder}/{os.path.relpath(file_path, local_directory).replace(os.path.sep, '/')}"
@@ -170,6 +176,7 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
             # Check if the file exists and is the same on Dropbox
             if dropbox_path.lower() in dropbox_files and dropbox_files[dropbox_path.lower()] == local_content_hash:
                 skipped_count += 1
+                pbar.update(1)
                 continue
 
             # Upload the file since it doesn't exist or has changed
@@ -182,6 +189,9 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
             except ApiError as e:
                 print(f"Failed to upload {file_path} to Dropbox: {e}")
 
+            # Update the progress bar
+            pbar.update(1)
+
     print(f"Upload completed. 
{uploaded_count} files uploaded ({uploaded_size / (1024 * 1024):.2f} MB).") print(f"{skipped_count} files were skipped (already existed or unchanged).") @@ -194,24 +204,30 @@ def download_directory_from_dropbox(dbx, dropbox_folder, local_directory): # List all files currently in the Dropbox folder along with their content hashes dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder) - try: - for dropbox_path, dropbox_hash in dropbox_files.items(): - local_path = os.path.join(local_directory, dropbox_path[len(dropbox_folder):].lstrip('/')) - - if os.path.exists(local_path): - local_content_hash = calculate_local_content_hash(local_path) - if local_content_hash == dropbox_hash: - skipped_count += 1 - continue - - os.makedirs(os.path.dirname(local_path), exist_ok=True) - with open(local_path, "wb") as f: - metadata, res = dbx.files_download(dropbox_path) - f.write(res.content) - downloaded_count += 1 - downloaded_size += metadata.size - except ApiError as err: - print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}") + # Initialize tqdm with the total number of files + with tqdm(total=len(dropbox_files), desc="Downloading files from Dropbox") as pbar: + try: + for dropbox_path, dropbox_hash in dropbox_files.items(): + local_path = os.path.join(local_directory, dropbox_path[len(dropbox_folder):].lstrip('/')) + + if os.path.exists(local_path): + local_content_hash = calculate_local_content_hash(local_path) + if local_content_hash == dropbox_hash: + skipped_count += 1 + pbar.update(1) + continue + + os.makedirs(os.path.dirname(local_path), exist_ok=True) + with open(local_path, "wb") as f: + metadata, res = dbx.files_download(dropbox_path) + f.write(res.content) + downloaded_count += 1 + downloaded_size += metadata.size + + # Update the progress bar + pbar.update(1) + except ApiError as err: + print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}") print(f"Download completed. {downloaded_count} files downloaded ({downloaded_size / (1024 * 1024):.2f} MB).") print(f"{skipped_count} files were skipped (already existed or unchanged).") diff --git a/reddit_stash.py b/reddit_stash.py index 5ffa903..8e303d5 100644 --- a/reddit_stash.py +++ b/reddit_stash.py @@ -36,4 +36,4 @@ # Print final statistics of processing print(f"Processing completed. 
{processed_count} items processed, {skipped_count} items skipped.") - print(f"Total size of processed data: {total_size / (1024 * 1024):.2f} MB") \ No newline at end of file + print(f"Total size of processed markdown file data: {total_size / (1024 * 1024):.2f} MB") \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index b3040e2..5d0767e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ praw tqdm +requests dropbox \ No newline at end of file diff --git a/utils/file_operations.py b/utils/file_operations.py index 242bfdb..92d864a 100644 --- a/utils/file_operations.py +++ b/utils/file_operations.py @@ -37,20 +37,27 @@ def get_existing_files_from_dir(save_directory): existing_files = set() for root, dirs, files in os.walk(save_directory): for file in files: - # Extract the unique key format (id-subreddit) from the file path + # Extract the unique key format (id-subreddit-content_type) from the file path filename = os.path.splitext(file)[0] subreddit_name = os.path.basename(root) + content_type = None + if filename.startswith("POST_"): file_id = filename.split("POST_")[1] + content_type = "Submission" elif filename.startswith("COMMENT_"): file_id = filename.split("COMMENT_")[1] + content_type = "Comment" elif filename.startswith("SAVED_POST_"): file_id = filename.split("SAVED_POST_")[1] + content_type = "Submission" elif filename.startswith("SAVED_COMMENT_"): file_id = filename.split("SAVED_COMMENT_")[1] + content_type = "Comment" else: continue - unique_key = f"{file_id}-{subreddit_name}" + + unique_key = f"{file_id}-{subreddit_name}-{content_type}" existing_files.add(unique_key) return existing_files @@ -59,8 +66,8 @@ def save_to_file(content, file_path, save_function, existing_files, file_log, sa file_id = content.id # Assuming `id` is unique for each Reddit content subreddit_name = content.subreddit.display_name # Get the subreddit name - # Create the unique key - unique_key = f"{file_id}-{subreddit_name}" + # Create the unique key including the content type + unique_key = f"{file_id}-{subreddit_name}-{type(content).__name__}" # If the file is already logged or exists in the directory, skip saving if unique_key in existing_files: @@ -78,7 +85,7 @@ def save_to_file(content, file_path, save_function, existing_files, file_log, sa save_function(content, f) # Log the file after saving successfully with the unique key - log_file(file_log, file_id, { + log_file(file_log, unique_key, { # Use the unique_key constructed in save_to_file 'subreddit': subreddit_name, 'type': type(content).__name__, 'file_path': file_path # This will be converted to relative in log_file @@ -89,8 +96,18 @@ def save_to_file(content, file_path, save_function, existing_files, file_log, sa print(f"Failed to save {file_path}: {e}") return False # Indicate that the file could not be saved +def handle_dynamic_sleep(item): + """Handle dynamic sleep based on the type of Reddit item.""" + if isinstance(item, Submission) and item.is_self and item.selftext: + time.sleep(dynamic_sleep(len(item.selftext))) + elif isinstance(item, Comment) and item.body: + time.sleep(dynamic_sleep(len(item.body))) + else: + time.sleep(dynamic_sleep(0)) # Minimal or no sleep for other types of posts + + def save_user_activity(reddit, save_directory, file_log): - """Save user's posts, comments, and saved items.""" + """Save user's posts, comments, saved items, and upvoted content.""" user = reddit.user.me() # Determine how to check for existing files based on check_type @@ -110,47 +127,73 @@ def 
save_user_activity(reddit, save_directory, file_log): total_size = 0 # Total size of processed data in bytes if save_type == 'ALL': - processed_count, skipped_count, total_size = save_all_user_activity( + # Save all user submissions and comments + processed_count, skipped_count, total_size = save_self_user_activity( list(user.submissions.new(limit=1000)), list(user.comments.new(limit=1000)), save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log ) + + # Save all saved items (posts and comments) processed_count, skipped_count, total_size = save_saved_user_activity( list(user.saved(limit=1000)), save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log ) + + # Save all upvoted posts and comments + processed_count, skipped_count, total_size = save_upvoted_posts_and_comments( + list(user.upvoted(limit=1000)), save_directory, existing_files, created_dirs_cache, + processed_count, skipped_count, total_size, file_log + ) + elif save_type == 'SAVED': processed_count, skipped_count, total_size = save_saved_user_activity( list(user.saved(limit=1000)), save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log ) + + elif save_type == 'ACTIVITY': + processed_count, skipped_count, total_size = save_self_user_activity( + list(user.submissions.new(limit=1000)), + list(user.comments.new(limit=1000)), + save_directory, existing_files, created_dirs_cache, + processed_count, skipped_count, total_size, file_log + ) + + elif save_type == 'UPVOTED': + processed_count, skipped_count, total_size = save_upvoted_posts_and_comments( + list(user.upvoted(limit=1000)), save_directory, existing_files, created_dirs_cache, + processed_count, skipped_count, total_size, file_log + ) # Save the updated file log save_file_log(file_log, save_directory) return processed_count, skipped_count, total_size -def save_all_user_activity(submissions, comments, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log): + +def save_self_user_activity(submissions, comments, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log): """Save all user posts and comments.""" - for submission in tqdm(submissions, desc="Processing Submissions"): + for submission in tqdm(submissions, desc="Processing Users Submissions"): file_path = os.path.join(save_directory, submission.subreddit.display_name, f"POST_{submission.id}.md") if save_to_file(submission, file_path, save_submission, existing_files, file_log, save_directory, created_dirs_cache): - skipped_count += 1 # Increment skipped count if the file already exists - continue # Skip further processing if the file already exists + skipped_count += 1 + continue - processed_count += 1 # Increment processed count - total_size += os.path.getsize(file_path) # Accumulate total size of processed files + processed_count += 1 + total_size += os.path.getsize(file_path) + handle_dynamic_sleep(submission) # Call the refactored sleep function - for comment in tqdm(comments, desc="Processing Comments"): + for comment in tqdm(comments, desc="Processing Users Comments"): file_path = os.path.join(save_directory, comment.subreddit.display_name, f"COMMENT_{comment.id}.md") if save_to_file(comment, file_path, save_comment_and_context, existing_files, file_log, save_directory, created_dirs_cache): - skipped_count += 1 # Increment skipped count if the file already exists - continue # 
Skip further processing if the file already exists + skipped_count += 1 + continue - processed_count += 1 # Increment processed count - total_size += os.path.getsize(file_path) # Accumulate total size of processed files - time.sleep(dynamic_sleep(len(comment.body))) + processed_count += 1 + total_size += os.path.getsize(file_path) + handle_dynamic_sleep(comment) # Call the refactored sleep function return processed_count, skipped_count, total_size @@ -167,9 +210,29 @@ def save_saved_user_activity(saved_items, save_directory, existing_files, create if save_to_file(item, file_path, save_comment_and_context, existing_files, file_log, save_directory, created_dirs_cache): skipped_count += 1 # Increment skipped count if the file already exists continue # Skip further processing if the file already exists - time.sleep(dynamic_sleep(len(item.body))) processed_count += 1 # Increment processed count total_size += os.path.getsize(file_path) # Accumulate total size of processed files + handle_dynamic_sleep(item) # Call the refactored sleep function + + return processed_count, skipped_count, total_size + +def save_upvoted_posts_and_comments(upvoted_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size, file_log): + """Save only upvoted user posts and comments.""" + for item in tqdm(upvoted_items, desc="Processing Upvoted Items"): + if isinstance(item, Submission): + file_path = os.path.join(save_directory, item.subreddit.display_name, f"UPVOTE_POST_{item.id}.md") + if save_to_file(item, file_path, save_submission, existing_files, file_log, save_directory, created_dirs_cache): + skipped_count += 1 # Increment skipped count if the file already exists + continue # Skip further processing if the file already exists + elif isinstance(item, Comment): + file_path = os.path.join(save_directory, item.subreddit.display_name, f"UPVOTE_COMMENT_{item.id}.md") + if save_to_file(item, file_path, save_comment_and_context, existing_files, file_log, save_directory, created_dirs_cache): + skipped_count += 1 # Increment skipped count if the file already exists + continue # Skip further processing if the file already exists + + processed_count += 1 # Increment processed count + total_size += os.path.getsize(file_path) # Accumulate total size of processed files + handle_dynamic_sleep(item) # Call the refactored sleep function - return processed_count, skipped_count, total_size \ No newline at end of file + return processed_count, skipped_count, total_size diff --git a/utils/log_utils.py b/utils/log_utils.py index f480b99..6938f2b 100644 --- a/utils/log_utils.py +++ b/utils/log_utils.py @@ -23,10 +23,8 @@ def is_file_logged(log_data, unique_key): """Check if a unique key is already logged.""" return unique_key in log_data -def log_file(log_data, file_id, file_info, save_directory): - """Add a file ID and its information to the log.""" - # Create a unique key by combining file_id and subreddit - unique_key = f"{file_id}-{file_info['subreddit']}" +def log_file(log_data, unique_key, file_info, save_directory): + """Add a file information to the log with the provided unique key.""" # Convert the absolute file path to a relative one relative_file_path = os.path.relpath(file_info['file_path'], start=save_directory) diff --git a/utils/save_utils.py b/utils/save_utils.py index 60fff44..5bad577 100644 --- a/utils/save_utils.py +++ b/utils/save_utils.py @@ -1,4 +1,5 @@ import os +import requests from datetime import datetime from praw.models import Submission, Comment from 
utils.time_utilities import lazy_load_comments @@ -15,6 +16,29 @@ def extract_video_id(url): return url.split("/")[-1] return None +def download_image(image_url, save_directory, submission_id): + """Download an image from the given URL and save it locally.""" + try: + response = requests.get(image_url) + response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx) + + # Determine the image extension from the URL + extension = os.path.splitext(image_url)[1] + if extension.lower() not in ['.jpg', '.jpeg', '.png', '.gif']: + extension = '.jpg' # Default to .jpg if the extension is unusual + + # Save the image with a unique name + image_filename = f"{submission_id}{extension}" + image_path = os.path.join(save_directory, image_filename) + + with open(image_path, 'wb') as f: + f.write(response.content) + + return image_path + except Exception as e: + print(f"Failed to download image from {image_url}: {e}") + return None + def save_submission(submission, f): """Save a submission and its metadata.""" f.write('---\n') # Start of frontmatter @@ -36,7 +60,13 @@ def save_submission(submission, f): f.write(submission.selftext if submission.selftext else '[Deleted Post]') else: if submission.url.endswith(('.jpg', '.jpeg', '.png', '.gif')): - f.write(f"![Image]({submission.url})") + # Download and save the image locally + image_path = download_image(submission.url, os.path.dirname(f.name), submission.id) + if image_path: + f.write(f"![Image]({image_path})\n") + f.write(f"**Original Image URL:** [Link]({submission.url})\n") + else: + f.write(f"![Image]({submission.url})\n") # Fallback to the URL if download fails elif "youtube.com" in submission.url or "youtu.be" in submission.url: video_id = extract_video_id(submission.url) f.write(f"[![Video](https://img.youtube.com/vi/{video_id}/0.jpg)]({submission.url})") @@ -48,13 +78,15 @@ def save_submission(submission, f): process_comments(lazy_comments, f) def save_comment_and_context(comment, f): - """Save a comment and its context.""" + """Save a comment, its context, and any child comments.""" + # Save the comment itself f.write('---\n') # Start of frontmatter f.write(f'Comment by /u/{comment.author.name if comment.author else "[deleted]"}\n') f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') f.write(f'{comment.body}\n\n') f.write('---\n\n') # End of frontmatter + # Save the parent context parent = comment.parent() if isinstance(parent, Submission): f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n') @@ -64,11 +96,24 @@ def save_comment_and_context(comment, f): f.write(f'{parent.selftext}\n\n') else: f.write(f'[Link to post content]({parent.url})\n\n') + + # Save the full submission context, including all comments + f.write('\n\n## Full Post Context:\n\n') + save_submission(parent, f) # Save the parent post context + elif isinstance(parent, Comment): f.write(f'## Context: Parent Comment by /u/{parent.author.name if parent.author else "[deleted]"}\n') f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n') f.write(f'{parent.body}\n\n') + # Recursively save the parent comment's context + save_comment_and_context(parent, f) + + # Save child comments if any exist + if comment.replies: + f.write('\n\n## Child Comments:\n\n') + process_comments(comment.replies, f) + def process_comments(comments, f, depth=0, simple_format=False): """Process all comments and visualize depth using indentation.""" for i, 
comment in enumerate(comments): @@ -76,9 +121,21 @@ def process_comments(comments, f, depth=0, simple_format=False): indent = ' ' * depth f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n') f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n') - f.write(f'{indent}{comment.body}\n\n') + # Check for image URLs in the comment body + if any(comment.body.endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.gif']): + image_url = comment.body.split()[-1] # Assuming the URL is the last word in the comment + image_path = download_image(image_url, os.path.dirname(f.name), comment.id) + if image_path: + f.write(f'{indent}![Image]({image_path})\n') + f.write(f'{indent}**Original Image URL:** [Link]({image_url})\n') + else: + f.write(f'{indent}![Image]({image_url})\n') # Fallback to the URL if download fails + else: + f.write(f'{indent}{comment.body}\n\n') + + # Recursively process child comments if not simple_format and comment.replies: process_comments(comment.replies, f, depth + 1) - f.write(f'{indent}---\n\n') \ No newline at end of file + f.write(f'{indent}---\n\n') diff --git a/utils/time_utilities.py b/utils/time_utilities.py index 9266f24..e43c062 100644 --- a/utils/time_utilities.py +++ b/utils/time_utilities.py @@ -10,18 +10,6 @@ def exponential_backoff(attempt: int) -> None: logging.info(f"Retrying in {wait_time:.2f} seconds...") time.sleep(wait_time) -# def dynamic_sleep(content_length): -# """Dynamically adjust sleep time based on content length.""" -# base_sleep_time = 1 -# sleep_time = base_sleep_time - -# if content_length > 10000: -# sleep_time *= 2 -# elif content_length > 5000: -# sleep_time *= 1.5 - -# return sleep_time - def dynamic_sleep(content_length, request_failures=0, max_sleep_time=5): """ Dynamically adjust sleep time based on content length and other factors,
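
The patch excerpt ends at the opening of the refactored `dynamic_sleep` docstring, so its new body is not shown here. For orientation only, a minimal sketch of how a function with this signature could combine the old length-based thresholds with the new `request_failures` and `max_sleep_time` parameters; the doubling-per-failure backoff and the `min` cap are assumptions, not necessarily the PR's implementation:

```python
def dynamic_sleep_sketch(content_length, request_failures=0, max_sleep_time=5):
    """Illustrative only: scale sleep with content length, back off on failures, cap the result."""
    base_sleep_time = 1

    # Length-based scaling, mirroring the commented-out implementation above.
    if content_length > 10000:
        sleep_time = base_sleep_time * 2
    elif content_length > 5000:
        sleep_time = base_sleep_time * 1.5
    else:
        sleep_time = base_sleep_time

    # Assumed backoff: double the wait for each recent request failure.
    sleep_time *= 2 ** request_failures

    # Never sleep longer than the configured ceiling.
    return min(sleep_time, max_sleep_time)
```

Callers such as `handle_dynamic_sleep` in `utils/file_operations.py` pass the result to `time.sleep`, so capping at `max_sleep_time` keeps a backlog of long posts from stalling a scheduled GitHub Actions run.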