Skip to content

Commit

Permalink
Added the LOG mechanism to accelerate the process without downloading…
Browse files Browse the repository at this point in the history
… the entire directory for processing
  • Loading branch information
rhnfzl committed Aug 31, 2024
1 parent fc4d989 commit 4ac09b4
Show file tree
Hide file tree
Showing 9 changed files with 407 additions and 171 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,6 @@ cython_debug/
archive/
reddit/
*.md
reddit_acvhive*
*.json
reddit_acvhive*
file_log.json
17 changes: 11 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ After adding all secrets: ![Repository Secrets](resources/repositiory_secrets.pn
3. **Manually Trigger the Workflow**:
- Go to the **Actions** tab > Select the **Reddit Stash Workflow** from the list on the left > Click **Run workflow** > Select the branch `main` > Click the green **Run workflow** button. The workflow will then be triggered, and you can monitor its progress in the Actions tab. Upon successful completion, you should see the Reddit folder in your Dropbox.

4. The workflow should run automatically at midnight CET.

#### Local Installation

1. **Clone this repository**:
Expand Down Expand Up @@ -131,19 +133,22 @@ The `settings.ini` file in the root directory of the project allows you to confi
save_directory = reddit/ # your system save directory
dropbox_directory = /reddit # your dropbox directory
save_type = ALL # Options: 'ALL' to save all activity, 'SAVED' to save only saved posts/comments
check_type = LOG # Options: 'LOG' to use the logging file to verify file existence, 'DIR' to verify file existence based on the downloaded directory.
[Configuration]
client_id = None # Can be set here or via environment variables
client_secret = None # Can be set here or via environment variables
username = None # Can be set here or via environment variables
password = None # Can be set here or via environment variables
```
save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in.
dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in.
save_type: Determines what user activity is saved, accepts these two values:
* `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with it's context.
* `SAVED`: Saves only the posts and comments the user has saved on Reddit with it's context.

* save_directory: Specifies the directory where the Reddit content will be saved, modify it to the location you want it to be in.
* dropbox_directory : Specifies the folder where the Reddit content will be saved on dropbox, modify it to the location you want it to be in.
* save_type: Determines what user activity is saved, accepts these two values:
    * `ALL`: Saves all posts and comments made by the user, along with the saved posts and comments with their context.
    * `SAVED`: Saves only the posts and comments the user has saved on Reddit with their context.
* check_type: Determines whether the file-existence check uses the log file only or the downloaded directory.
    * `LOG`: Uses only the log file to check file existence; faster processing. Recommended for the GitHub Actions setup.
    * `DIR`: Uses the saved/downloaded directory to check file existence; slower processing. Recommended for the local setup.
Note: You can still use environment variables as a fallback or override for the Reddit API credentials if they are not set in the settings.ini file.

#### Setting Up Reddit Environment Variables
Expand Down
171 changes: 133 additions & 38 deletions dropbox_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,68 @@
import sys
import dropbox
import requests
import hashlib
import configparser
from dropbox.exceptions import ApiError
from dropbox.files import FileMetadata

# Import the validate_and_set_directory function from utils
from utils.file_path_validate import validate_and_set_directory


class DropboxContentHasher:
    """Compute Dropbox's ``content_hash`` for a byte stream.

    The hash is the SHA-256 of the concatenated SHA-256 digests of each
    4 MiB block of the input, matching Dropbox's reference implementation.
    Feed data with :meth:`update`, then call :meth:`digest` or
    :meth:`hexdigest` exactly once; the object cannot be reused afterwards.
    """

    # Dropbox hashes the input in fixed 4 MiB blocks.
    BLOCK_SIZE = 4 * 1024 * 1024

    def __init__(self):
        # Hash over the per-block digests (the final content hash).
        self._overall_hasher = hashlib.sha256()
        # Hash of the block currently being filled.
        self._block_hasher = hashlib.sha256()
        # Number of bytes absorbed into the current block so far.
        self._block_pos = 0

        self.digest_size = self._overall_hasher.digest_size

    def update(self, new_data):
        """Absorb *new_data* (a byte string) into the running hash."""
        if self._overall_hasher is None:
            raise AssertionError(
                "can't use this object anymore; you already called digest()")

        assert isinstance(new_data, bytes), (
            "Expecting a byte string, got {!r}".format(new_data))

        offset = 0
        total = len(new_data)
        while offset < total:
            # A block just filled up: fold its digest into the overall
            # hash and start a fresh block.
            if self._block_pos == self.BLOCK_SIZE:
                self._overall_hasher.update(self._block_hasher.digest())
                self._block_hasher = hashlib.sha256()
                self._block_pos = 0

            # Absorb as much as fits in the current block.
            take = min(self.BLOCK_SIZE - self._block_pos, total - offset)
            self._block_hasher.update(new_data[offset:offset + take])
            self._block_pos += take
            offset += take

    def _finish(self):
        """Fold in the final partial block and invalidate the object."""
        if self._overall_hasher is None:
            raise AssertionError(
                "can't use this object anymore; you already called digest() or hexdigest()")

        if self._block_pos > 0:
            self._overall_hasher.update(self._block_hasher.digest())
            self._block_hasher = None
        result = self._overall_hasher
        self._overall_hasher = None  # Make sure we can't use this object anymore.
        return result

    def digest(self):
        """Return the final content hash as raw bytes (single use)."""
        return self._finish().digest()

    def hexdigest(self):
        """Return the final content hash as a hex string (single use)."""
        return self._finish().hexdigest()


def refresh_dropbox_token():
refresh_token = os.getenv('DROPBOX_REFRESH_TOKEN')
client_id = os.getenv('DROPBOX_APP_KEY')
Expand Down Expand Up @@ -41,6 +99,9 @@ def refresh_dropbox_token():
# Fetch the dropbox_folder from the settings.ini file with a fallback
dropbox_folder = config_parser.get('Settings', 'dropbox_directory', fallback='/reddit')

# Fetch the check_type from the settings.ini file with a fallback
check_type = config_parser.get('Settings', 'check_type', fallback='LOG').upper()

def sanitize_filename(filename):
"""Sanitize the filename to be Dropbox-compatible."""
sanitized_name = re.sub(r'[<>:"/\\|?*\x00-\x1F]', '_', filename) # Also remove control characters
Expand All @@ -53,28 +114,39 @@ def sanitize_filename(filename):

return sanitized_name

def list_dropbox_files(dbx, dropbox_folder):
"""List all files in the specified Dropbox folder."""
file_names = set()
def calculate_local_content_hash(file_path):
    """Return the Dropbox content hash (hex string) for a local file.

    Streams the file in 1 MiB chunks so memory use stays bounded even
    for large files.
    """
    hasher = DropboxContentHasher()
    with open(file_path, 'rb') as fh:
        # iter() with a sentinel stops cleanly at EOF (read() returns b'').
        for chunk in iter(lambda: fh.read(1024 * 1024), b''):
            hasher.update(chunk)
    return hasher.hexdigest()

def list_dropbox_files_with_hashes(dbx, dropbox_folder):
    """List all files in the specified Dropbox folder along with their content hashes.

    Returns a dict mapping each file's lower-cased Dropbox path to its
    ``content_hash``. On an API error a message is printed and whatever
    was collected so far is returned.
    """
    file_metadata = {}
    try:
        listing = dbx.files_list_folder(dropbox_folder, recursive=True)
        while True:
            # Folders and other entry types are ignored; only files carry
            # a content_hash.
            file_metadata.update(
                (entry.path_lower, entry.content_hash)
                for entry in listing.entries
                if isinstance(entry, FileMetadata)
            )
            if not listing.has_more:
                break
            # Fetch the next page of the (paginated) listing.
            listing = dbx.files_list_folder_continue(listing.cursor)
    except ApiError as err:
        print(f"Failed to list files in Dropbox folder {dropbox_folder}: {err}")
    return file_metadata

def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
"""Uploads all files in the specified local directory to Dropbox without overwriting."""
"""Uploads all files in the specified local directory to Dropbox, replacing only changed files."""
dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

# List all files currently in the Dropbox folder
existing_files = list_dropbox_files(dbx, dropbox_folder)
# List all files currently in the Dropbox folder along with their content hashes
dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder)

uploaded_count = 0
uploaded_size = 0
Expand All @@ -93,61 +165,84 @@ def upload_directory_to_dropbox(local_directory, dropbox_folder="/"):
# Adjust for sanitized name
dropbox_path = dropbox_path.replace(file_name, sanitized_name)

if dropbox_path.lower() in existing_files:
local_content_hash = calculate_local_content_hash(file_path)

# Check if the file exists and is the same on Dropbox
if dropbox_path.lower() in dropbox_files and dropbox_files[dropbox_path.lower()] == local_content_hash:
skipped_count += 1
continue

# Upload the file since it doesn't exist or has changed
try:
with open(file_path, "rb") as f:
file_size = os.path.getsize(file_path)
dbx.files_upload(f.read(), dropbox_path)
dbx.files_upload(f.read(), dropbox_path, mode=dropbox.files.WriteMode.overwrite)
uploaded_count += 1
uploaded_size += file_size
except dropbox.exceptions.ApiError as e:
except ApiError as e:
print(f"Failed to upload {file_path} to Dropbox: {e}")

print(f"Upload completed. {uploaded_count} files uploaded ({uploaded_size / (1024 * 1024):.2f} MB).")
print(f"{skipped_count} files were skipped (already existed).")
print(f"{skipped_count} files were skipped (already existed or unchanged).")

def download_directory_from_dropbox(dbx, dropbox_folder, local_directory):
    """Downloads all files in the specified Dropbox folder to the local
    directory, replacing only changed files.

    A file is skipped when it already exists locally and its Dropbox
    content hash matches the local file's hash; otherwise it is
    (re-)downloaded. Prints a summary of downloaded and skipped counts.
    """
    downloaded_count = 0
    downloaded_size = 0
    skipped_count = 0

    # List all files currently in the Dropbox folder along with their content hashes
    dropbox_files = list_dropbox_files_with_hashes(dbx, dropbox_folder)

    try:
        for dropbox_path, dropbox_hash in dropbox_files.items():
            # Mirror the Dropbox folder layout under local_directory.
            local_path = os.path.join(local_directory, dropbox_path[len(dropbox_folder):].lstrip('/'))

            # Skip files that already exist locally with identical content.
            if os.path.exists(local_path):
                local_content_hash = calculate_local_content_hash(local_path)
                if local_content_hash == dropbox_hash:
                    skipped_count += 1
                    continue

            os.makedirs(os.path.dirname(local_path), exist_ok=True)
            with open(local_path, "wb") as f:
                metadata, res = dbx.files_download(dropbox_path)
                f.write(res.content)
            downloaded_count += 1
            downloaded_size += metadata.size
    except ApiError as err:
        print(f"Failed to download files from Dropbox folder {dropbox_folder}: {err}")

    print(f"Download completed. {downloaded_count} files downloaded ({downloaded_size / (1024 * 1024):.2f} MB).")
    print(f"{skipped_count} files were skipped (already existed or unchanged).")

def download_log_file_from_dropbox(dbx, dropbox_folder, local_directory):
    """Fetch only ``file_log.json`` from the Dropbox folder.

    Writes the file into *local_directory*, creating the directory if
    needed. On an API error a message is printed instead of raising.
    """
    target_path = os.path.join(local_directory, 'file_log.json')
    remote_path = f"{dropbox_folder}/file_log.json"

    try:
        metadata, res = dbx.files_download(remote_path)
        # Ensure the destination directory exists before writing.
        os.makedirs(os.path.dirname(target_path), exist_ok=True)
        with open(target_path, "wb") as fh:
            fh.write(res.content)
        print(f"Log file downloaded successfully to {target_path}.")
    except ApiError as err:
        print(f"Failed to download the log file from Dropbox: {err}")

if __name__ == "__main__":
    # Dropbox access tokens expire, so mint a fresh one before any API call.
    refresh_dropbox_token()
    dbx = dropbox.Dropbox(os.getenv('DROPBOX_TOKEN'))

    wants_download = '--download' in sys.argv
    wants_upload = '--upload' in sys.argv

    if wants_download:
        # check_type decides how much to pull down: just the log file
        # (fast) or the whole mirrored directory (slow but exhaustive).
        if check_type == 'LOG':
            print("Downloading only the log file as check_type is LOG.")
            download_log_file_from_dropbox(dbx, dropbox_folder, local_dir)
        elif check_type == 'DIR':
            print("Downloading the entire directory as check_type is DIR.")
            download_directory_from_dropbox(dbx, dropbox_folder, local_dir)
        else:
            raise ValueError(f"Unknown check_type: {check_type}")
    elif wants_upload:
        upload_directory_to_dropbox(local_dir, dropbox_folder)
6 changes: 5 additions & 1 deletion reddit_stash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from utils.file_path_validate import validate_and_set_directory
from utils.file_operations import save_user_activity
from utils.env_config import load_config_and_env
from utils.log_utils import load_file_log, log_file, is_file_logged

# Load configuration
config_parser = configparser.ConfigParser()
Expand All @@ -27,8 +28,11 @@
)

if __name__ == "__main__":
# Load the log file from the save directory
file_log = load_file_log(save_directory)

# Process user activity (submissions, comments, and saved items) and get statistics
processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory)
processed_count, skipped_count, total_size = save_user_activity(reddit, save_directory, file_log)

# Print final statistics of processing
print(f"Processing completed. {processed_count} items processed, {skipped_count} items skipped.")
Expand Down
1 change: 1 addition & 0 deletions settings.ini
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
save_directory = reddit/
dropbox_directory = /reddit
save_type = ALL
check_type = LOG

[Configuration]
client_id = None
Expand Down
Loading

0 comments on commit 4ac09b4

Please sign in to comment.