Commit

added early skip process to speed up the reddit processing
rhnfzl committed Aug 30, 2024
1 parent a71f884 commit fc4d989
Showing 3 changed files with 91 additions and 92 deletions.
26 changes: 12 additions & 14 deletions README.md
@@ -1,13 +1,13 @@
# Reddit Stash
# Reddit Stash: Automatically Save Reddit Posts and Comments to Dropbox

Reddit Stash is a Python script that automatically saves your Reddit saved posts and comments to your local machine or Dropbox. It uses GitHub Actions to run the script on a daily schedule for Dropbox.
**Reddit Stash** is a Python script designed to help you effortlessly back up your Reddit saved posts and comments to Dropbox or your local machine. Using GitHub Actions, it runs daily after a simple setup, automatically archiving your Reddit data to Dropbox.

## Features
- Downloads the saved Reddit folder from Dropbox.
- Automatically retrieves saved posts and comments from Reddit.
- Allows for flexible saving options (all activity or only saved items) via `settings.ini`.
- Uploads the files to Dropbox for storage.
- Saves the content as markdown files.
## Key Features

- **Automated Reddit Backup:** Automatically retrieves your saved posts and comments from Reddit, and can also back up your own posts and comments if you set it up.
- **Flexible Storage Options:** Save all activity or only saved items, configured via `settings.ini` (see the configuration sketch below).
- **Dropbox Integration:** Downloads files from and uploads them to Dropbox for storage.
- **Markdown Support:** Saves the content as markdown files.
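
The `save_type` option is read with `configparser`; the `[Settings]` section and `save_type` key appear in the `utils/file_operations.py` diff below. A minimal, self-contained sketch of how that option is parsed — the inline file body is illustrative, not the project's full `settings.ini`:

```python
import configparser

# Parse save_type the way utils/file_operations.py does. The [Settings]
# section and save_type key come from the diff below; this inline file
# body is illustrative, not the project's full settings.ini.
config = configparser.ConfigParser()
config.read_string("""\
[Settings]
save_type = SAVED
""")

save_type = config.get('Settings', 'save_type', fallback='ALL').upper()
print(save_type)  # -> SAVED ('ALL' also saves your own posts and comments)
```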

## Setup

@@ -18,9 +18,9 @@ Reddit Stash is a Python script that automatically saves your Reddit saved posts

### Installation

Before proceeding with any installation method, ensure that you have set the Reddit environment variables. Follow [this guide](#setting-up-reddit-environment-variables) to create a Reddit app and obtain the necessary credentials.
Before proceeding with any installation method, ensure that you have set the Reddit environment variables. Follow the [Reddit API guide](#setting-up-reddit-environment-variables) to create a Reddit app and obtain the necessary credentials.
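
For orientation, the script builds a PRAW client from those credentials. A minimal sketch, assuming environment variable names such as `REDDIT_CLIENT_ID` — the names are placeholders (the real ones are covered in the guide above), while the `user_agent` format is taken from the script:

```python
import os
import praw

# Sketch of the client reddit_stash.py constructs. The environment
# variable names here are assumptions; only the user_agent format is
# taken verbatim from the script.
username = os.environ["REDDIT_USERNAME"]
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    username=username,
    password=os.environ["REDDIT_PASSWORD"],
    user_agent=f"Reddit Saved Saver by /u/{username}",
)
print(reddit.user.me())  # confirms the credentials are valid
```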

#### GitHub Action Installation
#### GitHub Action Installation (Recommended)

**Note:** The following process requires the [Dropbox App setup](#setting-up-dropbox-app). The GitHub Actions workflow runs the script daily at midnight CET, uploading the files to Dropbox. The workflow is defined in `.github/workflows/reddit_scraper.yml`.

@@ -235,7 +235,5 @@ Feel free to open issues or submit pull requests if you have any improvements or
- This project was inspired by [reddit-saved-saver](https://github.com/tobiasvl/reddit-saved-saver).

### Issues:
~~Dropbox isn't working at the moment because of token expiration; I need to find a way to tackle that here. The main code `reddit_stash.py` works as expected.~~
- The Dropbox code needs a hashing mechanism to make uploads faster.
- `reddit_stash.py` downloads all the files first and only then checks whether each file already exists; implement an early-exit strategy while fetching the relevant content.
- The file size calculation should be done once rather than in each iteration.
- ~~Dropbox isn't working at the moment because of token expiration; I need to find a way to tackle that here. The main code `reddit_stash.py` works as expected.~~
- ~~`reddit_stash.py` downloads all the files first and only then checks whether each file already exists; implement an early-exit strategy while fetching the relevant content.~~
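
The struck-through early-exit item is what this commit implements: the diffs below build the set of already-saved file paths once, so each Reddit item can be skipped with an O(1) membership check before any content is fetched or written. A condensed sketch of the pattern, with names following the diff in `utils/file_operations.py` (the directory and file names in the usage lines are illustrative):

```python
import os

def get_existing_files(save_directory):
    """Walk the save directory once and collect every existing file path."""
    existing_files = set()
    for root, _dirs, files in os.walk(save_directory):
        for name in files:
            existing_files.add(os.path.join(root, name))
    return existing_files

# Early skip: a set lookup replaces a per-item os.path.exists() call, so
# items saved on a previous run cost no network or disk work at all.
existing_files = get_existing_files("reddit")
file_path = os.path.join("reddit", "python", "POST_abc123.md")
if file_path in existing_files:
    print("already saved, skipping")
```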
50 changes: 5 additions & 45 deletions reddit_stash.py
@@ -1,14 +1,8 @@
import os
import sys
import time
import praw
from praw.models import Submission, Comment
import configparser
from tqdm import tqdm
from utils.file_path_validate import validate_and_set_directory
from utils.file_operations import save_user_activity, save_submission, save_comment_and_context
from utils.time_utilities import dynamic_sleep
from utils.env_config import load_config_and_env # Import the new utility function
from utils.file_operations import save_user_activity
from utils.env_config import load_config_and_env

# Load configuration
config_parser = configparser.ConfigParser()
@@ -32,44 +26,10 @@
user_agent=f'Reddit Saved Saver by /u/{username}'
)

# Initialize statistics
processed_count = 0 # Counter for processed items
skipped_count = 0 # Counter for skipped items
total_size = 0 # Total size of processed data in bytes

if __name__ == "__main__":
# Process saved items
for saved_item in tqdm(reddit.user.me().saved(limit=1000), desc="Processing Saved Items"):
sub_dir = os.path.join(save_directory, saved_item.subreddit.display_name)
if not os.path.exists(sub_dir):
os.makedirs(sub_dir)

# Use a detailed naming convention
if isinstance(saved_item, Submission):
file_name = f"POST_{saved_item.id}.md"
elif isinstance(saved_item, Comment):
file_name = f"COMMENT_{saved_item.id}.md"

file_path = os.path.join(sub_dir, file_name)

if os.path.exists(file_path):
skipped_count += 1 # Increment skipped count if the file already exists
continue

with open(file_path, 'w', encoding="utf-8") as f:
if isinstance(saved_item, Submission):
save_submission(saved_item, f)
elif isinstance(saved_item, Comment):
save_comment_and_context(saved_item, f)

processed_count += 1 # Increment processed count
total_size += os.path.getsize(file_path) # Accumulate total size of processed files

time.sleep(dynamic_sleep(len(saved_item.body if isinstance(saved_item, Comment) else saved_item.selftext or saved_item.url)))

# Process user activity (submissions and comments)
save_user_activity(reddit, save_directory)
# Process user activity (submissions, comments, and saved items) and get statistics
processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory)

# Print final statistics
# Print the final processing statistics
print(f"Processing completed. {processed_count} items processed, {skipped_count} items skipped.")
print(f"Total size of processed data: {total_size / (1024 * 1024):.2f} MB")
107 changes: 74 additions & 33 deletions utils/file_operations.py
@@ -4,9 +4,8 @@
from tqdm import tqdm
from datetime import datetime

import prawcore
from praw.models import Submission, Comment
from utils.time_utilities import dynamic_sleep, exponential_backoff, lazy_load_comments
from utils.time_utilities import dynamic_sleep, lazy_load_comments

# Dynamically determine the path to the root directory
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
@@ -19,11 +18,6 @@
config.read(config_path)
save_type = config.get('Settings', 'save_type', fallback='ALL').upper()

# Initialize statistics
processed_count = 0 # Counter for processed items
skipped_count = 0 # Counter for skipped items
total_size = 0 # Total size of processed data in bytes

def format_date(timestamp):
"""Format a UTC timestamp into a human-readable date."""
return datetime.utcfromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
@@ -36,22 +30,34 @@ def extract_video_id(url):
return url.split("/")[-1]
return None

def create_directory(subreddit_name, save_directory):
def create_directory(subreddit_name, save_directory, created_dirs_cache):
"""Create the directory for saving data if it does not exist."""
sub_dir = os.path.join(save_directory, subreddit_name)
if not os.path.exists(sub_dir):
os.makedirs(sub_dir)
if sub_dir not in created_dirs_cache:
os.makedirs(sub_dir, exist_ok=True)
created_dirs_cache.add(sub_dir)
return sub_dir

def save_to_file(content, file_path, save_function):
def get_existing_files(save_directory):
"""Build a set of all existing files in the save directory."""
existing_files = set()
for root, dirs, files in os.walk(save_directory):
for file in files:
existing_files.add(os.path.join(root, file))
return existing_files

def save_to_file(content, file_path, save_function, existing_files):
"""Save content to a file using the specified save function."""
if os.path.exists(file_path):
return
if file_path in existing_files:
# File already exists, skip saving
return True # Indicate that the file already exists and no saving was performed
try:
with open(file_path, 'w', encoding="utf-8") as f:
save_function(content, f)
return False # Indicate that the file was saved successfully
except Exception as e:
print(f"Failed to save {file_path}: {e}")
return False # Indicate that the file could not be saved

def save_submission(submission, f):
"""Save a submission and its metadata."""
@@ -122,47 +128,82 @@ def process_comments(comments, f, depth=0, simple_format=False):
f.write(f'{indent}---\n\n')

def save_user_activity(reddit, save_directory):
"""Save user's posts and comments based on the save_type setting."""
"""Save user's posts, comments, and saved items."""
user = reddit.user.me()

# Retrieve all necessary data
submissions = list(user.submissions.new(limit=1000))
comments = list(user.comments.new(limit=1000))
saved_items = list(user.saved(limit=1000))

existing_files = get_existing_files(save_directory)
created_dirs_cache = set()

processed_count = 0 # Counter for processed items
skipped_count = 0 # Counter for skipped items
total_size = 0 # Total size of processed data in bytes

if save_type == 'ALL':
# Process all submissions, comments and saved items
save_all_user_activity(submissions, comments, save_directory)
save_saved_user_activity(saved_items, save_directory)
processed_count, skipped_count, total_size = save_all_user_activity(
submissions, comments, saved_items, save_directory, existing_files,
created_dirs_cache, processed_count, skipped_count, total_size
)
processed_count, skipped_count, total_size = save_saved_user_activity(
saved_items, save_directory, existing_files, created_dirs_cache,
processed_count, skipped_count, total_size
)
elif save_type == 'SAVED':
# Only process saved items
save_saved_user_activity(saved_items, save_directory)
processed_count, skipped_count, total_size = save_saved_user_activity(
saved_items, save_directory, existing_files, created_dirs_cache,
processed_count, skipped_count, total_size
)

return processed_count, skipped_count, total_size

def save_all_user_activity(submissions, comments, save_directory):
def save_all_user_activity(submissions, comments, saved_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size):
"""Save all user posts and comments."""
# Save submissions
for submission in tqdm(submissions, desc="Processing Submissions"):
sub_dir = create_directory(submission.subreddit.display_name, save_directory)
sub_dir = create_directory(submission.subreddit.display_name, save_directory, created_dirs_cache)
file_path = os.path.join(sub_dir, f"POST_{submission.id}.md")
save_to_file(submission, file_path, save_submission)
if save_to_file(submission, file_path, save_submission, existing_files):
skipped_count += 1 # Increment skipped count if the file already exists
continue # Skip further processing if the file already exists

processed_count += 1 # Increment processed count
total_size += os.path.getsize(file_path) # Accumulate total size of processed files

# Save comments
for comment in tqdm(comments, desc="Processing Comments"):
sub_dir = create_directory(comment.subreddit.display_name, save_directory)
sub_dir = create_directory(comment.subreddit.display_name, save_directory, created_dirs_cache)
file_path = os.path.join(sub_dir, f"COMMENT_{comment.id}.md")
save_to_file(comment, file_path, save_comment_and_context)
if save_to_file(comment, file_path, save_comment_and_context, existing_files):
skipped_count += 1 # Increment skipped count if the file already exists
continue # Skip further processing if the file already exists

processed_count += 1 # Increment processed count
total_size += os.path.getsize(file_path) # Accumulate total size of processed files
time.sleep(dynamic_sleep(len(comment.body)))

def save_saved_user_activity(saved_items, save_directory):
return processed_count, skipped_count, total_size


def save_saved_user_activity(saved_items, save_directory, existing_files, created_dirs_cache, processed_count, skipped_count, total_size):
"""Save only saved user posts and comments."""
# Process saved submissions and comments in one loop
for item in tqdm(saved_items, desc="Processing Saved Items"):
if isinstance(item, Submission):
sub_dir = create_directory(item.subreddit.display_name, save_directory)
sub_dir = create_directory(item.subreddit.display_name, save_directory, created_dirs_cache)
file_path = os.path.join(sub_dir, f"SAVED_POST_{item.id}.md")
save_to_file(item, file_path, save_submission)
if save_to_file(item, file_path, save_submission, existing_files):
skipped_count += 1 # Increment skipped count if the file already exists
continue # Skip further processing if the file already exists
elif isinstance(item, Comment):
sub_dir = create_directory(item.subreddit.display_name, save_directory)
sub_dir = create_directory(item.subreddit.display_name, save_directory, created_dirs_cache)
file_path = os.path.join(sub_dir, f"SAVED_COMMENT_{item.id}.md")
save_to_file(item, file_path, save_comment_and_context)
time.sleep(dynamic_sleep(len(item.body)))
if save_to_file(item, file_path, save_comment_and_context, existing_files):
skipped_count += 1 # Increment skipped count if the file already exists
continue # Skip further processing if the file already exists
time.sleep(dynamic_sleep(len(item.body)))

processed_count += 1 # Increment processed count
total_size += os.path.getsize(file_path) # Accumulate total size of processed files

return processed_count, skipped_count, total_size
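
To summarize the new calling contract: `save_to_file` returns `True` when the path is already in the pre-built set (the caller counts a skip and continues) and `False` after attempting a write (the caller counts the item as processed and adds its file size). A self-contained illustration with a trivial save function and a placeholder file name:

```python
import os

def save_to_file(content, file_path, save_function, existing_files):
    """True = already existed (skipped); False = a write was attempted."""
    if file_path in existing_files:
        return True
    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            save_function(content, f)
        return False
    except Exception as e:
        print(f"Failed to save {file_path}: {e}")
        return False

existing_files = set()
skipped = save_to_file("hello\n", "demo.md", lambda c, f: f.write(c), existing_files)
print("skipped" if skipped else f"saved {os.path.getsize('demo.md')} bytes")
```

Note that a failed write also returns `False`, so the counters treat it as processed; distinguishing the two cases could be a follow-up improvement.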
