Showing 5 changed files with 358 additions and 2 deletions.
.github/workflows/reddit_scraper.yml
@@ -0,0 +1,45 @@
```yaml
name: Reddit Stash Scraper

on:
  schedule:
    - cron: "0 23 * * *"  # Adjust to your desired schedule (23:00 UTC = midnight CET)
  workflow_dispatch:  # Allows manual triggering of the workflow

jobs:
  run-script:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.10'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Download existing Reddit data from Dropbox
        env:
          DROPBOX_TOKEN: ${{ secrets.DROPBOX_TOKEN }}
        run: |
          python dropbox_utils.py --download  # Command-line flag selects what the script does

      - name: Run Reddit Stash Script
        env:
          REDDIT_CLIENT_ID: ${{ secrets.REDDIT_CLIENT_ID }}
          REDDIT_CLIENT_SECRET: ${{ secrets.REDDIT_CLIENT_SECRET }}
          REDDIT_USERNAME: ${{ secrets.REDDIT_USERNAME }}
          REDDIT_PASSWORD: ${{ secrets.REDDIT_PASSWORD }}
        run: |
          python reddit_stash.py  # Run the Reddit processing script

      - name: Upload updated Reddit data to Dropbox
        env:
          DROPBOX_TOKEN: ${{ secrets.DROPBOX_TOKEN }}
        run: |
          python dropbox_utils.py --upload  # Command-line flag for the upload process
```
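The workflow's three script steps run in a fixed order: download existing data, process saved items, upload the results. They can be exercised locally in the same order; a minimal sketch (the `run_pipeline` helper and its injectable `runner` hook are illustrative additions, not part of the repository):

```python
import subprocess

# The workflow's script steps, in the same order GitHub Actions runs them
PIPELINE = [
    ["python", "dropbox_utils.py", "--download"],
    ["python", "reddit_stash.py"],
    ["python", "dropbox_utils.py", "--upload"],
]

def run_pipeline(runner=subprocess.run):
    """Run each step in order, stopping at the first non-zero exit code.

    Returns the failing command for diagnostics, or None if all steps succeed.
    """
    for cmd in PIPELINE:
        result = runner(cmd)
        if result.returncode != 0:
            return cmd
    return None
```

Passing a custom `runner` also makes the ordering easy to test without touching Reddit or Dropbox.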
README.md
@@ -1,2 +1,64 @@
-# reddit-stash
-Save your reddit Saved Content and the comments
# Reddit Stash

Reddit Stash is a Python script that automatically saves your Reddit saved posts and comments to Dropbox. It uses GitHub Actions to run the script on a daily schedule.

## Features
- Automatically retrieves saved posts and comments from Reddit.
- Saves the content as markdown files.
- Uploads the files to Dropbox for secure storage.

## Setup

### Prerequisites
- Python 3.10
- A Dropbox account with an API token.
- Reddit API credentials.

### Installation

1. Clone this repository:
   ```
   git clone https://github.com/rhnfzl/reddit-stash.git
   cd reddit-stash
   ```

2. Install the required Python packages:
   ```
   pip install -r requirements.txt
   ```

3. Create a Reddit app at https://old.reddit.com/prefs/apps/, then set the client ID, client secret, and your Reddit username and password as environment variables before running the script.

   macOS and Linux:
   ```
   export REDDIT_CLIENT_ID='your_client_id'
   export REDDIT_CLIENT_SECRET='your_client_secret'
   export REDDIT_USERNAME='your_username'
   export REDDIT_PASSWORD='your_password'
   ```

   Windows (note that with `set`, quotes become part of the value, so omit them):
   ```
   set REDDIT_CLIENT_ID=your_client_id
   set REDDIT_CLIENT_SECRET=your_client_secret
   set REDDIT_USERNAME=your_username
   set REDDIT_PASSWORD=your_password
   ```

   You can check that the configuration has been set correctly:
   ```
   echo $REDDIT_CLIENT_ID
   echo $REDDIT_CLIENT_SECRET
   echo $REDDIT_USERNAME
   echo $REDDIT_PASSWORD
   ```

4. Run the script manually:
   ```
   python reddit_stash.py
   ```
   or rely on the automation described below.

### Automation

This project uses GitHub Actions to automatically run the script daily at midnight CET and upload the files to Dropbox. The workflow is defined in `.github/workflows/reddit_scraper.yml`.

### Contributing

Feel free to open issues or submit pull requests if you have any improvements or bug fixes.
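Since the script fails if any of the four credentials in step 3 is unset, a quick preflight check can save a confusing error later. A minimal sketch (the `missing_credentials` helper is an illustrative addition, not part of the repository):

```python
import os

# The four environment variables the script expects (see Installation, step 3)
REQUIRED_VARS = [
    "REDDIT_CLIENT_ID",
    "REDDIT_CLIENT_SECRET",
    "REDDIT_USERNAME",
    "REDDIT_PASSWORD",
]

def missing_credentials(env=os.environ):
    """Return the names of any required credentials that are unset or empty."""
    return [name for name in REQUIRED_VARS if not env.get(name)]

if __name__ == "__main__":
    missing = missing_credentials()
    if missing:
        print("Missing environment variables:", ", ".join(missing))
    else:
        print("All Reddit credentials are set.")
```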
dropbox_utils.py
@@ -0,0 +1,89 @@
```python
import os
import re
import sys
import dropbox


def sanitize_filename(filename):
    """Sanitize the filename to be Dropbox-compatible."""
    sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename)  # Also strip control characters
    sanitized_name = sanitized_name.strip()  # Remove leading and trailing spaces
    reserved_names = {"CON", "PRN", "AUX", "NUL", "COM1", "LPT1", "COM2", "LPT2", "COM3", "LPT3",
                      "COM4", "LPT4", "COM5", "LPT5", "COM6", "LPT6", "COM7", "LPT7", "COM8", "LPT8",
                      "COM9", "LPT9"}
    if sanitized_name.upper() in reserved_names:
        sanitized_name = "_" + sanitized_name  # Prefix with underscore to avoid reserved names
    return sanitized_name


def list_dropbox_files(dbx, dropbox_folder):
    """List all files in the specified Dropbox folder."""
    file_names = set()
    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    file_names.add(entry.path_lower)
            if not result.has_more:
                break
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to list files in Dropbox folder {dropbox_folder}: {err}")
    return file_names


def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
    """Upload all files in the local directory to Dropbox without overwriting existing files."""
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

    # List all files currently in the Dropbox folder
    existing_files = list_dropbox_files(dbx, dropbox_folder)

    for root, dirs, files in os.walk(local_directory):
        for file_name in files:
            # Skip .DS_Store and other hidden files
            if file_name.startswith('.'):
                continue

            sanitized_name = sanitize_filename(file_name)
            file_path = os.path.join(root, file_name)
            dropbox_path = f"{dropbox_folder}/{os.path.relpath(file_path, local_directory).replace(os.path.sep, '/')}"

            # Adjust the path to use the sanitized name
            dropbox_path = dropbox_path.replace(file_name, sanitized_name)

            # Skip files that already exist in Dropbox
            if dropbox_path.lower() in existing_files:
                continue

            try:
                with open(file_path, "rb") as f:
                    dbx.files_upload(f.read(), dropbox_path)
            except dropbox.exceptions.ApiError as e:
                print(f"Failed to upload {file_path} to Dropbox: {e}")


def download_directory_from_dropbox(dbx, dropbox_folder, local_directory):
    """Download all files in the specified Dropbox folder to the local directory."""
    try:
        result = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            for entry in result.entries:
                if isinstance(entry, dropbox.files.FileMetadata):
                    local_path = os.path.join(local_directory, entry.path_lower[len(dropbox_folder):].lstrip('/'))
                    os.makedirs(os.path.dirname(local_path), exist_ok=True)
                    with open(local_path, "wb") as f:
                        metadata, res = dbx.files_download(entry.path_lower)
                        f.write(res.content)
            if not result.has_more:
                break
            result = dbx.files_list_folder_continue(result.cursor)
    except dropbox.exceptions.ApiError as err:
        print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}")


if __name__ == "__main__":
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))
    local_dir = 'reddit/'  # Local directory for Reddit data
    dropbox_folder = "/reddit"  # Dropbox folder where Reddit data is stored

    if '--download' in sys.argv:
        download_directory_from_dropbox(dbx, dropbox_folder, local_dir)
    elif '--upload' in sys.argv:
        upload_directory_to_dropbox(local_dir, dropbox_folder)
```
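To make `sanitize_filename`'s behavior concrete, the same logic is reproduced below as a standalone sketch (rewritten with set comprehensions for brevity; the behavior should match the function above):

```python
import re

def sanitize_filename(filename):
    """Same logic as sanitize_filename above, reproduced so this example runs on its own."""
    # Replace characters Dropbox/Windows reject, then trim surrounding whitespace
    sanitized = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename).strip()
    reserved = ({"CON", "PRN", "AUX", "NUL"}
                | {f"COM{i}" for i in range(1, 10)}
                | {f"LPT{i}" for i in range(1, 10)})
    if sanitized.upper() in reserved:
        sanitized = "_" + sanitized  # Avoid reserved Windows device names
    return sanitized

print(sanitize_filename('a/b:c'))  # illegal path characters become underscores -> a_b_c
print(sanitize_filename('COM1'))   # reserved Windows device name gets a prefix -> _COM1
```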
reddit_stash.py
@@ -0,0 +1,157 @@
```python
import os
import sys
import time
from datetime import datetime

import praw
import prawcore
from praw.models import Submission, Comment
from tqdm import tqdm

# Reddit API configuration
client_id = os.getenv('REDDIT_CLIENT_ID')
client_secret = os.getenv('REDDIT_CLIENT_SECRET')
username = os.getenv('REDDIT_USERNAME')
password = os.getenv('REDDIT_PASSWORD')

if not all([client_id, client_secret, username, password]):
    raise Exception("One or more environment variables for the Reddit API are missing.")

reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    username=username,
    password=password,
    user_agent='Reddit Saved Saver by /u/complexrexton'
)

print("Fetching...")

try:
    saved = reddit.user.me().saved(limit=1000)
except Exception:
    sys.exit("Failed to fetch your saved posts. Did you set the Reddit API environment variables?")

top_dir = 'reddit/'

if not os.path.exists(top_dir):
    os.mkdir(top_dir)


def process_comments(comments, f, depth=0):
    """Process all comments and visualize depth using indentation."""
    for i, comment in enumerate(comments):
        if isinstance(comment, Comment):
            # Write the comment with indentation based on depth
            indent = '    ' * depth
            f.write(f'{indent}### Comment {i+1} by /u/{comment.author.name if comment.author else "[deleted]"}\n')
            f.write(f'{indent}- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
            f.write(f'{indent}{comment.body}\n\n')

            # Process replies recursively without a depth limit
            if comment.replies:
                process_comments(comment.replies, f, depth + 1)

            f.write(f'{indent}---\n\n')


def dynamic_sleep(processed_count, content_length):
    """
    Dynamically adjust sleep time based on the number of submissions processed
    and the length of the content just processed.

    Parameters:
        processed_count (int): The number of submissions processed so far.
        content_length (int): The estimated length of the content processed.

    Returns:
        float: The number of seconds to sleep.
    """
    base_sleep_time = 1  # Base time to start with

    # Adjust sleep based on the number of submissions processed
    if processed_count > 100:
        sleep_time = base_sleep_time * 2
    elif processed_count > 50:
        sleep_time = base_sleep_time * 1.5
    else:
        sleep_time = base_sleep_time

    # Further adjust sleep based on the content length
    if content_length > 10000:  # Large content length threshold
        sleep_time *= 2
    elif content_length > 5000:  # Moderate content length threshold
        sleep_time *= 1.5

    return sleep_time


def lazy_load_comments(submission):
    """Lazily yield comments instead of loading them all at once."""
    try:
        for comment in submission.comments.list():
            yield comment
    except prawcore.exceptions.TooManyRequests:
        print("Rate limit exceeded. Sleeping for 120 seconds...")
        time.sleep(120)  # Sleep before retrying
        yield from lazy_load_comments(submission)  # Retry the request


def save_comment_and_context(comment, f):
    """Save a comment and its context."""
    f.write('---\n')
    f.write(f'Comment by /u/{comment.author.name if comment.author else "[deleted]"}\n')
    f.write(f'- **Upvotes:** {comment.score} | **Permalink:** [Link](https://reddit.com{comment.permalink})\n')
    f.write(f'{comment.body}\n\n')
    f.write('---\n\n')

    # Fetch and save the parent post or comment for context
    parent = comment.parent()
    if isinstance(parent, Submission):
        f.write(f'## Context: Post by /u/{parent.author.name if parent.author else "[deleted]"}\n')
        f.write(f'- **Title:** {parent.title}\n')
        f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n')
        if parent.is_self:
            f.write(f'{parent.selftext}\n\n')
        else:
            f.write(f'[Link to post content]({parent.url})\n\n')
    elif isinstance(parent, Comment):
        f.write(f'## Context: Parent Comment by /u/{parent.author.name if parent.author else "[deleted]"}\n')
        f.write(f'- **Upvotes:** {parent.score} | **Permalink:** [Link](https://reddit.com{parent.permalink})\n')
        f.write(f'{parent.body}\n\n')


processed_count = 0  # Counter to keep track of processed submissions

for saved_item in tqdm(saved, desc="Processing Saved Items"):
    # Determine the save location based on subreddit
    sub_dir = top_dir + saved_item.subreddit.display_name + '/'
    if not os.path.exists(sub_dir):
        os.mkdir(sub_dir)

    file_path = sub_dir + saved_item.id + '.md'

    # Skip items that already exist on disk to avoid overwriting
    if os.path.exists(file_path):
        print(f"File {file_path} already exists. Skipping to prevent overwriting.")
        continue

    with open(file_path, 'w', encoding="utf-8") as f:
        if isinstance(saved_item, Submission):
            # Save the post and its comments
            f.write('---\n')
            f.write(f'id: {saved_item.id}\n')
            f.write(f'subreddit: /r/{saved_item.subreddit.display_name}\n')
            f.write(f'timestamp: {str(datetime.utcfromtimestamp(saved_item.created_utc))}\n')
            f.write(f'author: /u/{saved_item.author.name if saved_item.author else "[deleted]"}\n')
            f.write(f'permalink: https://reddit.com{saved_item.permalink}\n')
            f.write('---\n\n')
            f.write(f'# {saved_item.title}\n\n')
            f.write(f'**Upvotes:** {saved_item.score} | **Permalink:** [Link](https://reddit.com{saved_item.permalink})\n\n')
            if saved_item.is_self:
                f.write(saved_item.selftext if saved_item.selftext else '[Deleted Post]')
            else:
                f.write(saved_item.url if saved_item.url else '[Deleted Post]')
            f.write('\n\n## Comments:\n\n')
            lazy_comments = lazy_load_comments(saved_item)
            process_comments(lazy_comments, f)
        elif isinstance(saved_item, Comment):
            # Save the comment and its context
            save_comment_and_context(saved_item, f)

    processed_count += 1  # Increment the processed counter
    content_length = len(saved_item.body if isinstance(saved_item, Comment) else (saved_item.selftext or saved_item.url))
    time.sleep(dynamic_sleep(processed_count, content_length))

print("All saved items have been processed.")
```
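The two back-off tiers in `dynamic_sleep` multiply, so a long item late in the run waits up to four seconds. The same thresholds are reproduced below as a standalone sketch so the combined delays are easy to check:

```python
def dynamic_sleep(processed_count, content_length):
    """Same tiering as dynamic_sleep above, reproduced so this check runs on its own."""
    base_sleep_time = 1  # Base time to start with

    # Tier 1: how many items have been processed so far
    if processed_count > 100:
        sleep_time = base_sleep_time * 2
    elif processed_count > 50:
        sleep_time = base_sleep_time * 1.5
    else:
        sleep_time = base_sleep_time

    # Tier 2: how large the current item's content is
    if content_length > 10000:
        sleep_time *= 2
    elif content_length > 5000:
        sleep_time *= 1.5

    return sleep_time

print(dynamic_sleep(10, 100))     # base rate: 1 second
print(dynamic_sleep(60, 6000))    # 1.5 * 1.5 = 2.25 seconds
print(dynamic_sleep(120, 20000))  # 2 * 2 = 4 seconds
```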
requirements.txt
@@ -0,0 +1,3 @@
```
praw
tqdm
dropbox
```