Reddit Stash code bug fixes
rhnfzl committed Aug 25, 2024
1 parent 3158e94 commit d1e1d36
Showing 2 changed files with 56 additions and 48 deletions.
26 changes: 26 additions & 0 deletions dropbox_utils.py
@@ -38,6 +38,10 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
    # List all files currently in the Dropbox folder
    existing_files = list_dropbox_files(dbx, dropbox_folder)

    uploaded_count = 0
    uploaded_size = 0
    skipped_count = 0

    for root, dirs, files in os.walk(local_directory):
        for file_name in files:
            # Skip .DS_Store and other hidden files
@@ -52,32 +56,54 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
                dropbox_path = dropbox_path.replace(file_name, sanitized_name)

            if dropbox_path.lower() in existing_files:
                skipped_count += 1
                continue

            try:
                with open(file_path, "rb") as f:
                    file_size = os.path.getsize(file_path)
                    dbx.files_upload(f.read(), dropbox_path)
                uploaded_count += 1
                uploaded_size += file_size
            except dropbox.exceptions.ApiError as e:
                print(f"Failed to upload {file_path} to Dropbox: {e}")

    print(f"Upload completed. {uploaded_count} files uploaded ({uploaded_size / (1024 * 1024):.2f} MB).")
    print(f"{skipped_count} files were skipped (already existed).")

def download_directory_from_dropbox(dbx, dropbox_folder, local_directory):
    """Downloads all files in the specified Dropbox folder to the local directory."""
    downloaded_count = 0
    downloaded_size = 0
    skipped_count = 0

    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    local_path = os.path.join(local_directory, entry.path_lower[len(dropbox_folder):].lstrip('/'))

                    # Skip the download if the file already exists locally
                    if os.path.exists(local_path):
                        skipped_count += 1
                        continue

                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
                    with open(local_path, "wb") as f:
                        metadata, res = dbx.files_download(entry.path_lower)
                        f.write(res.content)
                    downloaded_count += 1
                    downloaded_size += metadata.size
            if not result.has_more:
                break
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}")

    print(f"Download completed. {downloaded_count} files downloaded ({downloaded_size / (1024 * 1024):.2f} MB).")
    print(f"{skipped_count} files were skipped (already existed locally).")

if __name__ == "__main__":
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))
    local_dir = 'reddit/' # Local directory for Reddit data
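Note: both hunks above rely on list_dropbox_files(dbx, dropbox_folder), a helper defined elsewhere in dropbox_utils.py and untouched by this commit. Judging from the lower-cased membership check before each upload, it presumably returns the set of lower-cased paths already stored in the folder. A minimal sketch under that assumption, reusing the same files_list_folder pagination that download_directory_from_dropbox uses:

# Editor's sketch, not part of this commit; assumes the module's existing `import dropbox`
# and an already-initialised dbx client.
def list_dropbox_files(dbx, dropbox_folder):
    """Return the lower-cased Dropbox paths of all files under dropbox_folder (assumed behaviour)."""
    paths = set()
    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    paths.add(entry.path_lower)
            if not result.has_more:
                break
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to list Dropbox folder {dropbox_folder}: {err}")
    return paths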
78 changes: 30 additions & 48 deletions reddit_stash.py
@@ -1,4 +1,8 @@
import praw, os, config, time, sys
import os
import re
import sys
import dropbox
import praw
from praw.models import Submission, Comment
from datetime import datetime
import prawcore
@@ -22,12 +26,10 @@
    user_agent='Reddit Saved Saver by /u/complexrexton'
)

print("Fetching...")

try:
    saved = reddit.user.me().saved(limit=1000)
except:
    sys.exit("Failed to find your saved posts, did you add your credentials to config.py?")
# Initialize statistics
processed_count = 0
skipped_count = 0
total_size = 0 # in bytes

top_dir = 'reddit/'

@@ -38,46 +40,31 @@ def process_comments(comments, f, depth=0):
"""Process all comments and visualize depth using indentation."""
for i, comment in enumerate(comments):
if isinstance(comment, Comment):
# Write the comment with indentation based on depth
indent = ' ' * depth
f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n')
f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
f.write(f'{indent}{comment.body}\n\n')

# Process replies recursively without limit

if comment.replies:
process_comments(comment.replies, f, depth + 1)

f.write(f'{indent}---\n\n')

def dynamic_sleep(processed_count, content_length):
"""
Dynamically adjust sleep time based on the number of submissions processed
and the cumulative content length processed.
Parameters:
processed_count (int): The number of submissions processed so far.
content_length (int): The estimated cumulative length of the content processed.
Returns:
float: The number of seconds to sleep.
"""
base_sleep_time = 1 # Base time to start with

# Adjust sleep based on the number of submissions processed
"""Dynamically adjust sleep time based on processed submissions and content length."""
base_sleep_time = 1
sleep_time = base_sleep_time

if processed_count > 100:
sleep_time = base_sleep_time * 2
sleep_time *= 2
elif processed_count > 50:
sleep_time = base_sleep_time * 1.5
else:
sleep_time = base_sleep_time

# Further adjust sleep based on the content length
if content_length > 10000: # Large content length threshold
sleep_time *= 1.5

if content_length > 10000:
sleep_time *= 2
elif content_length > 5000: # Moderate content length threshold
elif content_length > 5000:
sleep_time *= 1.5

return sleep_time
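    # Editor's illustration, not part of this commit: with the thresholds above, the
    # revised multipliers compose as follows (processed_count, content_length -> seconds):
    #   dynamic_sleep(10, 2000)   -> 1     (base case)
    #   dynamic_sleep(60, 2000)   -> 1.5   (more than 50 items processed)
    #   dynamic_sleep(120, 6000)  -> 3.0   (2 for >100 items, then *1.5 for >5000 chars)
    #   dynamic_sleep(120, 12000) -> 4     (2 for >100 items, then *2 for >10000 chars)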

def lazy_load_comments(submission):
@@ -87,8 +74,8 @@ def lazy_load_comments(submission):
            yield comment
    except prawcore.exceptions.TooManyRequests:
        print("Rate limit exceeded. Sleeping for 120 seconds...")
        time.sleep(120) # Sleep for 120 seconds before retrying
        yield from lazy_load_comments(submission) # Retry the request
        time.sleep(120)
        yield from lazy_load_comments(submission)

def save_comment_and_context(comment, f):
"""Save a comment and its context."""
@@ -97,8 +84,7 @@ def save_comment_and_context(comment, f):
    f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
    f.write(f'{comment.body}\n\n')
    f.write('---\n\n')

    # Fetch and save the parent post or comment for context

    parent = comment.parent()
    if isinstance(parent, Submission):
        f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n')
@@ -113,24 +99,19 @@ def save_comment_and_context(comment, f):
        f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n')
        f.write(f'{parent.body}\n\n')

processed_count = 0 # Counter to keep track of processed submissions

for saved_item in tqdm(saved, desc="Processing Saved Items"):
    # Determine the save location based on subreddit
for saved_item in tqdm(reddit.user.me().saved(limit=1000), desc="Processing Saved Items"):
    sub_dir = top_dir + saved_item.subreddit.display_name + '/'
    if not os.path.exists(sub_dir):
        os.mkdir(sub_dir)

    file_path = sub_dir + saved_item.id + '.md'

    # Check if the file already exists to avoid overwriting
    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping to prevent overwriting.")
        skipped_count += 1
        continue

    with open(file_path, 'w', encoding="utf-8") as f:
        if isinstance(saved_item, Submission):
            # Save the post and its comments
            f.write('---\n')
            f.write(f'id: {saved_item.id}\n')
            f.write(f'subreddit: /r/{saved_item.subreddit.display_name}\n')
@@ -148,10 +129,11 @@ def save_comment_and_context(comment, f):
            lazy_comments = lazy_load_comments(saved_item)
            process_comments(lazy_comments, f)
        elif isinstance(saved_item, Comment):
            # Save the comment and its context
            save_comment_and_context(saved_item, f)

    processed_count += 1 # Increment the processed counter

    processed_count += 1
    total_size += os.path.getsize(file_path)
    time.sleep(dynamic_sleep(processed_count, len(saved_item.body if isinstance(saved_item, Comment) else saved_item.selftext or saved_item.url)))

print("All saved items have been processed.")
print(f"Processing completed. {processed_count} items processed, {skipped_count} items skipped.")
print(f"Total size of processed data: {total_size / (1024 * 1024):.2f} MB")
