diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index cf6a85d..4058e0c 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -1,11 +1,3 @@
-# This workflow will upload a Python Package using Twine when a release is created
-# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries
-
-# This workflow uses actions that are not certified by GitHub.
-# They are provided by a third-party and are governed by
-# separate terms of service, privacy policy, and support
-# documentation.
-
 name: Upload Python Package

 on:
diff --git a/README.md b/README.md
index e27880e..e9ee375 100644
--- a/README.md
+++ b/README.md
@@ -16,9 +16,11 @@ DropGrad is a regularization method for neural networks that works by randomly (

 - Enhanced cross-platform compatibility: The codebase now works seamlessly on macOS, Windows, and Linux
 - Improved device selection logic: Automatically detects and utilizes the available hardware (MPS, CUDA, or CPU) for training
-- Updated dependencies: Added `torchvision` and `matplotlib` as dependencies in `requirements.txt` and `pyproject.toml`
+- Updated dependencies: Added `torchvision`, `torchaudio`, `matplotlib`, and `scipy` as dependencies in `requirements.txt` and `pyproject.toml`
 - Improved visualization: Enhanced `visualize.py` with better plot layout and cross-platform file paths
 - Code cleanup and refactoring: Improved code structure and readability
+- Added mathematical analysis: Introduced `mathematical_analysis.py` to analyze the effect of DropGrad on various optimizers
+- Added benchmark visualizations: Introduced `benchmark_visualizations.py` to compare the behavior of DropGrad across optimizers and benchmarks

 ## Code Structure

@@ -26,7 +28,8 @@ DropGrad is a regularization method for neural networks that works by randomly (
 dropgrad/
 │
 ├── docs/
-│   └── analysis.md
+│   ├── analysis.md
+│   └── windows_cuda_setup.md
 │
 ├── dropgrad/
 │   ├── __init__.py
@@ -58,24 +61,29 @@ dropgrad/

 ## Installation

-The PyTorch implementation of DropGrad can be installed simply using pip or by cloning the current GitHub repo.
-
 ### Requirements

-The requirements for DropGrad are PyTorch, torchvision, and matplotlib. (Only versions of PyTorch >= 1.9.0 have been tested, although DropGrad should be compatible with any version of PyTorch)
+- Python >= 3.7
+- PyTorch >= 1.12.0
+- torchvision >= 0.13.0
+- torchaudio >= 0.12.0
+- matplotlib
+- scipy

 ### Using pip

-To install using pip:
+To install DropGrad using pip, run the following command:

 ```bash
 pip install dropgrad
 ```

-### Using git
+### From source
+
+To install DropGrad from source, follow these steps:

 ```bash
-git clone https://github.com/muditbhargava66/dropgrad.git
+git clone https://github.com/dingo-actual/dropgrad.git
 cd dropgrad
 pip install -r requirements.txt
 pip install .
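As a quick, illustrative companion to the installation hunks above (not part of the patch): a minimal post-install sanity check. It only reuses the CUDA, then MPS, then CPU selection order that the updated `examples/vit_experiments/train.py` later in this diff adopts.

```python
# Illustrative sketch, not part of the patch: verify the install and mirror the
# CUDA -> MPS -> CPU device-selection order used by the updated train.py.
import torch
from dropgrad import DropGrad  # should import cleanly after `pip install dropgrad`

if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

print(f"DropGrad imported; training would run on {device}")
```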
@@ -90,15 +98,15 @@ To use DropGrad in your neural network optimization, simply import the `DropGrad
 ```python
 from dropgrad import DropGrad

-opt_unwrapped = Adam(net.parameters(), lr=1e-3)
-opt = DropGrad(opt_unwrapped, drop_rate=0.1)
+optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+optimizer = DropGrad(optimizer, drop_rate=0.1)
 ```

 During training, call `.step()` on the wrapped optimizer to apply DropGrad, and then call `.zero_grad()` to reset the gradients:

 ```python
-opt.step()
-opt.zero_grad()
+optimizer.step()
+optimizer.zero_grad()
 ```

 ### Drop Rate Schedulers
@@ -109,7 +117,7 @@ DropGrad supports drop rate schedulers to dynamically adjust the drop rate durin
 from dropgrad import DropGrad, LinearDropRateScheduler

 scheduler = LinearDropRateScheduler(initial_drop_rate=0.1, final_drop_rate=0.0, num_steps=1000)
-opt = DropGrad(opt_unwrapped, drop_rate_scheduler=scheduler)
+optimizer = DropGrad(optimizer, drop_rate_scheduler=scheduler)
 ```

 ### Full Update Drop
@@ -117,7 +125,7 @@ opt = DropGrad(opt_unwrapped, drop_rate_scheduler=scheduler)
 DropGrad provides an option to apply "full" update drop by interrupting the `.step()` method. To enable this feature, pass `full_update_drop=True` to the `DropGrad` constructor:

 ```python
-opt = DropGrad(opt_unwrapped, drop_rate=0.1, full_update_drop=True)
+optimizer = DropGrad(optimizer, drop_rate=0.1, full_update_drop=True)
 ```

 ### Varying Drop Rates per Parameter
@@ -129,19 +137,13 @@ params = {
     'encoder': 0.1,
     'decoder': 0.2
 }
-opt = DropGrad(opt_unwrapped, params=params)
+optimizer = DropGrad(optimizer, params=params)
 ```

 ## Examples

 The `examples` directory contains sample code demonstrating various use cases of DropGrad, including basic usage, integration with learning rate schedulers, applying full update drop, and training a Vision Transformer (ViT) on the CIFAR-10 dataset under different regularization scenarios.

-```bash
-python basic_usage.py
-python lr_scheduler_integration.py
-python full_update_drop.py
-```
-
 ## Testing

 DropGrad includes a test suite to ensure the correctness of the implementation. The tests cover the functionality of the `DropGrad` optimizer and the drop rate schedulers. To run the tests, use the following command:
@@ -154,6 +156,10 @@ pytest tests/

 For a detailed analysis of the DropGrad method, including its theoretical foundations, advantages, and empirical results, please refer to the `docs/analysis.md` file.

+## Windows CUDA Setup
+
+For instructions on setting up CUDA on Windows for PyTorch and DropGrad, please refer to the `docs/windows_cuda_setup.md` file.
+
 ## Contributing

 Contributions to DropGrad are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
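To tie the README snippets above together, here is a minimal end-to-end sketch that uses only the wrapper API documented above (`DropGrad`, `LinearDropRateScheduler`, `.step()`, `.zero_grad()`). The toy `nn.Linear` model and random data are placeholders rather than project code; treat this as an illustration, not the project's own example.

```python
# Minimal illustrative loop using only the API shown in the README hunks above.
# The nn.Linear model and random data are stand-ins, not part of the project.
import torch
from torch import nn
from dropgrad import DropGrad, LinearDropRateScheduler

model = nn.Linear(10, 1)
x, y = torch.randn(64, 10), torch.randn(64, 1)
criterion = nn.MSELoss()

scheduler = LinearDropRateScheduler(initial_drop_rate=0.1, final_drop_rate=0.0, num_steps=1000)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
optimizer = DropGrad(optimizer, drop_rate_scheduler=scheduler)

for step in range(1000):
    loss = criterion(model(x), y)
    loss.backward()
    optimizer.step()       # gradient values are randomly dropped here
    optimizer.zero_grad()
```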
diff --git a/docs/analysis.md b/docs/analysis.md
index add4066..354b1ed 100644
--- a/docs/analysis.md
+++ b/docs/analysis.md
@@ -19,6 +19,14 @@
   - [Image Classification](#image-classification)
   - [Language Modeling](#language-modeling)
   - [Ablation Studies](#ablation-studies)
+- [Mathematical Analysis](#mathematical-analysis)
+  - [Effect on Stochastic Gradient Descent (SGD)](#effect-on-stochastic-gradient-descent-sgd)
+  - [Effect on Adaptive Optimizers (Adam, AdamW, Adagrad, Adadelta)](#effect-on-adaptive-optimizers-adam-adamw-adagrad-adadelta)
+  - [Effect on Lion Optimizer](#effect-on-lion-optimizer)
+- [Benchmark Visualizations](#benchmark-visualizations)
+  - [Optimization Trajectories](#optimization-trajectories)
+  - [Convergence Rates](#convergence-rates)
+  - [Sensitivity to Hyperparameters](#sensitivity-to-hyperparameters)
 - [Usage and Integration](#usage-and-integration)
   - [Installation](#installation)
   - [Basic Usage](#basic-usage)
@@ -84,6 +92,35 @@ DropGrad has also been applied to language modeling tasks using recurrent neural
 ### Ablation Studies
 Ablation studies have been conducted to investigate the impact of different hyperparameters and design choices in DropGrad. These studies have explored the effect of varying drop rates, using different drop rate schedulers, and applying DropGrad to specific layers or parameter groups. The results provide insights into the optimal configuration of DropGrad for different tasks and architectures.

+## Mathematical Analysis
+
+### Effect on Stochastic Gradient Descent (SGD)
+DropGrad has interesting effects when applied to Stochastic Gradient Descent (SGD). It causes the optimization process to do two useful things during training:
+
+1. Move through oblong, narrow regions of the parameter space by sometimes "ignoring" the directions that cause the optimization to "zig-zag" through the region.
+2. "Virtually" incorporate approximations of higher-order derivatives in the directions that were dropped during one or more consecutive previous steps.
+
+### Effect on Adaptive Optimizers (Adam, AdamW, Adagrad, Adadelta)
+When DropGrad is applied to adaptive optimizers like Adam, AdamW, Adagrad, and Adadelta, it modifies the update rules by randomly dropping gradient values. This introduces stochasticity into the optimization process and acts as a form of regularization.
+
+The mathematical analysis in `mathematical_analysis.py` investigates the properties of the optimization trajectories and convergence behavior when DropGrad is applied to these optimizers. It provides theoretical insights and approximations to explain the observed benefits of DropGrad.
+
+### Effect on Lion Optimizer
+DropGrad has been shown to work particularly well with the Lion optimizer. The mathematical analysis explores why DropGrad enhances the performance of Lion compared to other optimizers.
+
+The analysis derives theoretical justifications for the effectiveness of DropGrad in combination with Lion, considering the specific update rules and adaptive learning rate mechanisms employed by the Lion optimizer.
+
+## Benchmark Visualizations
+
+### Optimization Trajectories
+The `benchmark_visualizations.py` script visualizes the optimization trajectories of different optimizers with and without DropGrad on various optimization benchmarks. It plots the trajectories in a 2D space, allowing for a clear comparison of the behavior of DropGrad across optimizers.
+
+### Convergence Rates
+The benchmark visualizations also analyze the convergence rates of the optimizers with and without DropGrad. They demonstrate how DropGrad affects the speed and stability of convergence for different optimizers on the selected benchmarks.
+
+### Sensitivity to Hyperparameters
+The visualizations explore the sensitivity of DropGrad to different hyperparameter settings, such as the drop rate and learning rate. They provide insights into the robustness and performance trade-offs of DropGrad under various hyperparameter configurations.
+
 ## Usage and Integration

 ### Installation
diff --git a/examples/vit_experiments/train.py b/examples/vit_experiments/train.py
index 002b678..042c7e7 100644
--- a/examples/vit_experiments/train.py
+++ b/examples/vit_experiments/train.py
@@ -5,17 +5,18 @@ from torchvision import datasets, transforms

 from dropgrad import DropGrad
 from vit_model import vit_base_patch16_224
+from lion_pytorch import Lion

-# Check if MPS (Metal Performance Shaders) is available
-if torch.backends.mps.is_available():
-    device = torch.device("mps")
-    print("Using MPS (Metal Performance Shaders) device")
-elif torch.cuda.is_available():
+# Check the available device
+if torch.cuda.is_available():
     device = torch.device("cuda")
     print("Using CUDA (GPU) device")
+elif torch.backends.mps.is_available():
+    device = torch.device("mps")
+    print("Using MPS (Metal Performance Shaders) device on macOS")
 else:
     device = torch.device("cpu")
-    print("Using CPU device (MPS and CUDA not available)")
+    print("Using CPU device")

 def train(model, optimizer, criterion, train_loader, test_loader, epochs, device):
     train_losses = []
@@ -40,8 +41,12 @@ def train(model, optimizer, criterion, train_loader, test_loader, epochs, device
                 loss = criterion(outputs, labels)

             scaler.scale(loss).backward()
-            scaler.step(optimizer)
-            scaler.update()
+
+            if isinstance(optimizer, DropGrad):
+                optimizer.step()
+            else:
+                scaler.step(optimizer)
+                scaler.update()

             train_loss += loss.item() * images.size(0)
             train_total += images.size(0)
@@ -111,6 +116,7 @@ def main():
         optim.SGD,
         optim.Adagrad,
         optim.Adadelta,
+        Lion,
     ]

     # Hyperparameter grid search
diff --git a/examples/vit_experiments/visualize.py b/examples/vit_experiments/visualize.py
index b1dc79f..a88c475 100644
--- a/examples/vit_experiments/visualize.py
+++ b/examples/vit_experiments/visualize.py
@@ -1,3 +1,4 @@
+import os
 import torch
 import matplotlib.pyplot as plt

@@ -15,23 +16,30 @@ def main():
         "SGD",
         "Adagrad",
         "Adadelta",
-        # "Lion",  # Uncomment if you have the Lion optimizer available
     ]

     for optimizer_name in optimizers:
         plt.figure(figsize=(10, 5))

         for scenario in scenarios:
-            losses = torch.load(f"losses_{scenario['name']}_{optimizer_name}.pth")
-            train_losses = losses["train_losses"]
-            test_losses = losses["test_losses"]
-            plt.plot(train_losses, label=f"{scenario['name']} - Train Loss")
-            plt.plot(test_losses, label=f"{scenario['name']} - Test Loss")
+            file_path = os.path.join(".", f"losses_{scenario['name']}_{optimizer_name}.pth")
+            if os.path.exists(file_path):
+                losses = torch.load(file_path)
+                train_losses = losses["train_losses"]
+                test_losses = losses["test_losses"]
+                plt.plot(train_losses, label=f"{scenario['name']} - Train Loss")
+                plt.plot(test_losses, label=f"{scenario['name']} - Test Loss")

         plt.xlabel("Epoch")
         plt.ylabel("Loss")
         plt.legend()
         plt.title(f"Train and Test Losses - {optimizer_name}")
-        plt.savefig(f"loss_plot_{optimizer_name}.png")
+        plt.tight_layout()
+
+        output_dir = os.path.join(".", "output")
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, f"loss_plot_{optimizer_name}.png")
+        plt.savefig(output_path)
+        plt.close()

 if __name__ == "__main__":
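As an illustrative aside to the `train.py` hunks above (not taken from the patch): one plausible way to build each optimizer in the visible grid, including the newly added `Lion`, and wrap it with DropGrad for the regularized scenario. The `build_optimizer` helper is hypothetical and assumes every constructor accepts `(params, lr=...)`, which holds for the listed `torch.optim` classes and `lion_pytorch.Lion`.

```python
# Illustrative sketch, not from the patch: construct each optimizer in the grid
# and optionally wrap it with DropGrad, mirroring the two experiment scenarios.
import torch.optim as optim
from lion_pytorch import Lion
from dropgrad import DropGrad

# Only the classes visible in the hunk above are listed here.
optimizer_classes = [optim.SGD, optim.Adagrad, optim.Adadelta, Lion]

def build_optimizer(optimizer_cls, model, lr, use_dropgrad, drop_rate=0.1):
    optimizer = optimizer_cls(model.parameters(), lr=lr)
    return DropGrad(optimizer, drop_rate=drop_rate) if use_dropgrad else optimizer
```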
diff --git a/pyproject.toml b/pyproject.toml
index cf906f1..a821ac0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,9 +3,8 @@ requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
 [project]
-name = "dropgrad_dingo_actual"
+name = "dropgrad"
 version = "0.3.0"
-dependencies = ["torch", "torchvision", "matplotlib"]
 authors = [
     {name = "Ryan Taylor", email = "ryan@beta-reduce.net"},
     {name = "Mudit Bhargava", email = "muditbhargava666@gmail.com"}
 ]
@@ -14,22 +13,45 @@ maintainers = [
     {name = "Ryan Taylor", email = "ryan@beta-reduce.net"},
     {name = "Mudit Bhargava", email = "muditbhargava666@gmail.com"}
 ]
-description = "A Torch implementation of DropGrad regularization for Federated Learning."
+description = "A PyTorch implementation of DropGrad regularization for Federated Learning."
 readme = "README.md"
 requires-python = ">=3.7"
 keywords = ["neural networks", "machine learning", "federated learning", "torch", "dropgrad"]
 license = {file = "LICENSE"}
 classifiers = [
-    "Programming Language :: Python :: 3",
     "Development Status :: 4 - Beta",
+    "Intended Audience :: Developers",
     "Intended Audience :: Science/Research",
     "License :: OSI Approved :: MIT License",
-    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Topic :: Scientific/Engineering",
     "Topic :: Scientific/Engineering :: Artificial Intelligence",
-    "Operating System :: OS Independent"
+    "Topic :: Software Development",
+    "Topic :: Software Development :: Libraries",
+    "Topic :: Software Development :: Libraries :: Python Modules",
+]
+dependencies = [
+    "numpy",
+    "torch>=1.12.0",
+    "torchvision>=0.13.0",
+    "torchaudio>=0.12.0",
+    "matplotlib",
+    "scipy",
 ]

 [project.urls]
 Repository = "https://github.com/muditbhargava66/dropgrad"
 Homepage = "https://github.com/muditbhargava66/dropgrad"
-Issues = "https://github.com/muditbhargava66/dropgrad/issues"
\ No newline at end of file
+Issues = "https://github.com/muditbhargava66/dropgrad/issues"
+
+[tool.setuptools]
+packages = ["dropgrad"]
+
+[tool.setuptools.package-data]
+dropgrad = ["*.py"]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 326b7d6..c05c13c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,7 @@
-torch>=1.9.0
-torchvision
+numpy
+torch>=1.12.0
+torchvision>=0.13.0
+torchaudio>=0.12.0
 matplotlib
 scipy
 lion-pytorch
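A small, illustrative environment check (not part of the patch) against the version floors declared in `requirements.txt` and `pyproject.toml` above:

```python
# Illustrative check: confirm installed versions meet the floors pinned above.
import torch
import torchvision
import torchaudio

print("torch:", torch.__version__, "(needs >= 1.12.0)")
print("torchvision:", torchvision.__version__, "(needs >= 0.13.0)")
print("torchaudio:", torchaudio.__version__, "(needs >= 0.12.0)")
print("CUDA available:", torch.cuda.is_available())
```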