diff --git a/README.md b/README.md
index b5920e63..8f4a5f3b 100644
--- a/README.md
+++ b/README.md
@@ -190,12 +190,13 @@ arguments:
   --rng {std_default, cuda}          RNG (default: cuda)
   -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)
   -b, --batch-count COUNT            number of images to generate.
-  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)
+  --schedule {discrete, karras, ays} Denoiser sigma schedule (default: discrete)
   --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)
                                      <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x
   --vae-tiling                       process vae in tiles to reduce memory usage
   --control-net-cpu                  keep controlnet in cpu (for low vram)
   --canny                            apply canny preprocessor (edge detection)
+  --color                            colors the logging tags according to level
   -v, --verbose                      print extra info
 ```
 
diff --git a/denoiser.hpp b/denoiser.hpp
index fd934540..255167c2 100644
--- a/denoiser.hpp
+++ b/denoiser.hpp
@@ -13,6 +13,7 @@ struct SigmaSchedule {
     float alphas_cumprod[TIMESTEPS];
     float sigmas[TIMESTEPS];
     float log_sigmas[TIMESTEPS];
+    int version = 0;
 
     virtual std::vector<float> get_sigmas(uint32_t n) = 0;
 
@@ -75,6 +76,144 @@ struct DiscreteSchedule : SigmaSchedule {
     }
 };
 
+/*
+Align Your Steps (AYS) sigma schedule, see https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/howto.html
+*/
+struct AYSSchedule : SigmaSchedule {
+    /* Linear blend: yields left at perc == 0 and right at perc == 1. interp and linear_interp are adapted
+     * from dpilger26's NumCpp: https://github.com/dpilger26/NumCpp/tree/5e40aab74d14e257d65d3dc385c9ff9e2120c60e */
+    constexpr double interp(double left, double right, double perc) noexcept {
+        return (left * (1. - perc)) + (right * perc);
+    }
+
+    /* Piecewise-linear interpolation of the samples (ref_x, ref_y) at each
+     * new_x. Both x arrays must be sorted ascending. new_x is clamped to the
+     * ref_x range; all zeros are returned if the reference data is unusable. */
+    std::vector<double> linear_interp(const std::vector<float>& new_x,
+                                      const std::vector<float>& ref_x,
+                                      const std::vector<float>& ref_y) {
+        const size_t len_x = new_x.size();
+        size_t j           = 0;
+        std::vector<double> new_y(len_x);
+
+        if (ref_x.size() != ref_y.size()) {
+            LOG_ERROR("Linear Interpolation Failed: length mismatch");
+            return new_y;
+        }
+
+        if (ref_x.size() < 2) {
+            LOG_ERROR("Linear Interpolation Failed: bad bounds");
+            return new_y;
+        }
+
+        for (size_t i = 0; i < len_x; i++) {
+            /* float rounding can leave an end point just outside the range */
+            float x = new_x[i] < ref_x.front() ? ref_x.front() : new_x[i];
+            x       = x > ref_x.back() ? ref_x.back() : x;
+
+            /* move to the segment holding x, never past the last segment */
+            while ((j + 2 < ref_x.size()) && (x > ref_x[j + 1])) {
+                j++;
+            }
+            const float span  = ref_x[j + 1] - ref_x[j];
+            const double perc = (span > 0.f) ? static_cast<double>(x - ref_x[j]) / static_cast<double>(span) : 0.;
+            new_y[i]          = interp(ref_y[j], ref_y[j + 1], perc);
+        }
+
+        return new_y;
+    }
+
+    /* num_points evenly spaced values over [start, end]. Values come from the
+     * index and the last is pinned to end, so rounding cannot overshoot it. */
+    std::vector<float> linear_space(const float start, const float end, const size_t num_points) {
+        std::vector<float> result(num_points, start);
+        if (num_points < 2) {
+            return result; /* empty or just {start}; avoids dividing by zero */
+        }
+        const float inc = (end - start) / static_cast<float>(num_points - 1);
+        for (size_t i = 1; i + 1 < num_points; i++) {
+            result[i] = start + inc * static_cast<float>(i);
+        }
+        result[num_points - 1] = end;
+        return result;
+    }
+
+    /* Resamples sigma_in (expected to be descending) to new_len entries by
+     * interpolating linearly in log(sigma) over a uniform [0, 1] grid. */
+    std::vector<float> log_linear_interpolation(const std::vector<float>& sigma_in,
+                                                const size_t new_len) {
+        const size_t s_len        = sigma_in.size();
+        std::vector<float> x_vals = linear_space(0.f, 1.f, s_len);
+        std::vector<float> y_vals(s_len);
+        /* Reverse the input so it runs the other way round, taking the log
+         * on the way: the interpolation happens in log space */
+        for (size_t i = 0; i < s_len; i++) {
+            y_vals[i] = std::log(sigma_in[s_len - i - 1]);
+        }
+
+        std::vector<float> new_x_vals  = linear_space(0.f, 1.f, new_len);
+        std::vector<double> new_y_vals = linear_interp(new_x_vals, x_vals, y_vals);
+        std::vector<float> results(new_len);
+        /* undo both the log and the reversal */
+        for (size_t i = 0; i < new_len; i++) {
+            results[i] = static_cast<float>(std::exp(new_y_vals[new_len - i - 1]));
+        }
+        return results;
+    }
+
+    /* Returns len + 1 sigmas: the reference noise levels selected by `version`,
+     * resampled to len + 1 entries when the sizes differ, with the last entry
+     * forced to zero. Returns len + 1 zeros for an unsupported version. */
+    std::vector<float> get_sigmas(uint32_t len) {
+        /* static: the reference tables are built once, not on every call */
+        static const std::vector<float> noise_levels[] = {
+            /* SD1.5 */
+            {14.6146412293f, 6.4745760956f, 3.8636745985f, 2.6946151520f,
+             1.8841921177f, 1.3943805092f, 0.9642583904f, 0.6523686016f,
+             0.3977456272f, 0.1515232662f, 0.0291671582f},
+            /* SDXL */
+            {14.6146412293f, 6.3184485287f, 3.7681790315f, 2.1811480769f,
+             1.3405244945f, 0.8620721141f, 0.5550693289f, 0.3798540708f,
+             0.2332364134f, 0.1114188177f, 0.0291671582f},
+            /* SVD */
+            {700.00f, 54.5f, 15.886f, 7.977f, 4.248f, 1.789f, 0.981f, 0.403f,
+             0.173f, 0.034f, 0.002f},
+        };
+
+        const std::vector<float>* inputs = nullptr;
+        std::vector<float> results(len + 1);
+        switch (version) {
+            case VERSION_2_x:
+                LOG_WARN("AYS not designed for SD2.X models");
+                /* fallthrough */
+            case VERSION_1_x:
+                LOG_INFO("AYS using SD1.5 noise levels");
+                inputs = &noise_levels[0];
+                break;
+            case VERSION_XL:
+                LOG_INFO("AYS using SDXL noise levels");
+                inputs = &noise_levels[1];
+                break;
+            case VERSION_SVD:
+                LOG_INFO("AYS using SVD noise levels");
+                inputs = &noise_levels[2];
+                break;
+            default:
+                LOG_ERROR("Version not compatible with AYS scheduler");
+                return results;
+        }
+
+        /* Resample the reference levels to len + 1 points (log-linear) */
+        if ((len + 1) != inputs->size()) {
+            results = log_linear_interpolation(*inputs, len + 1);
+        } else {
+            results = *inputs;
+        }
+        results[len] = 0.0f; /* Not sure if this is strictly necessary */
+        return results;
+    }
+};
+
 struct KarrasSchedule : SigmaSchedule {
     std::vector<float> get_sigmas(uint32_t n) {
         // These *COULD* be function arguments here,
@@ -122,4 +261,4 @@ struct CompVisVDenoiser : public Denoiser {
     }
 };
 
-#endif  // __DENOISER_HPP__
\ No newline at end of file
+#endif  // __DENOISER_HPP__
diff --git a/examples/cli/main.cpp b/examples/cli/main.cpp
index 0f26644b..565af74a 100644
--- a/examples/cli/main.cpp
+++ b/examples/cli/main.cpp
@@ -43,6 +43,7 @@ const char* schedule_str[] = {
     "default",
     "discrete",
     "karras",
+    "ays",
 };
 
 const char* modes_str[] = {
@@ -190,12 +191,13 @@ void print_usage(int argc, const char* argv[]) {
     printf("  --rng {std_default, cuda}          RNG (default: cuda)\n");
     printf("  -s SEED, --seed SEED               RNG seed (default: 42, use random seed for < 0)\n");
     printf("  -b, --batch-count COUNT            number of images to generate.\n");
-    printf("  --schedule {discrete, karras}      Denoiser sigma schedule (default: discrete)\n");
+    printf("  --schedule {discrete, karras, ays} Denoiser sigma schedule (default: discrete)\n");
     printf("  --clip-skip N                      ignore last layers of CLIP network; 1 ignores none, 2 ignores one layer (default: -1)\n");
     printf("                                     <= 0 represents unspecified, will be 1 for SD1.x, 2 for SD2.x\n");
     printf("  --vae-tiling                       process vae in tiles to reduce memory usage\n");
     printf("  --control-net-cpu                  keep controlnet in cpu (for low vram)\n");
     printf("  --canny                            apply canny preprocessor (edge detection)\n");
+    printf("  --color                            Colors the logging tags according to level\n");
     printf("  -v, --verbose                      print extra info\n");
 }
 
diff --git a/model.cpp b/model.cpp
index 3db919be..684317d2 100644
--- a/model.cpp
+++ b/model.cpp
@@ -890,6 +890,7 @@ bool ModelLoader::init_from_safetensors_file(const std::string& file_path, const
 
         // ggml/src/ggml.c:2745
         if (n_dims < 1 || n_dims > GGML_MAX_DIMS) {
+            LOG_ERROR("skip tensor '%s' with n_dims %d", name.c_str(), n_dims);
             continue;
         }
 
diff --git a/stable-diffusion.cpp b/stable-diffusion.cpp
index abaae693..e4eb56e7 100644
--- a/stable-diffusion.cpp
+++ b/stable-diffusion.cpp
@@ -450,6 +450,11 @@ class StableDiffusionGGML {
                     LOG_INFO("running with Karras schedule");
                     denoiser->schedule = std::make_shared<KarrasSchedule>();
                     break;
+                case AYS:
+                    LOG_INFO("Running with Align-Your-Steps schedule");
+                    denoiser->schedule          = std::make_shared<AYSSchedule>();
+                    denoiser->schedule->version = version;
+                    break;
                 case DEFAULT:
                     // Don't touch anything.
                     break;
diff --git a/stable-diffusion.h b/stable-diffusion.h
index 0de17ae2..4031a093 100644
--- a/stable-diffusion.h
+++ b/stable-diffusion.h
@@ -49,6 +49,7 @@ enum schedule_t {
     DEFAULT,
     DISCRETE,
     KARRAS,
+    AYS,
     N_SCHEDULES
 };