Merge LoCo with Zero++ (#6730)

### Integration of LoCo Method into ZeRO++ #### Overview This PR introduces the integration of the **LoCo** method, as outlined in [this paper](https://arxiv.org/abs/2407.04480), into the ZeRO++ framework of DeepSpeed. The key enhancement involves applying error feedback compensation to 4-bit gradients before communication. This approach ***improves pre-training loss outcomes without additional time overhead***, though it requires extra GPU memory. The extent of this memory increase depends on model size and training configuration. #### Experimental Results We conducted pre-training experiments using the Llama2 architecture, adjusting the number of layers and hidden size. The experiments included: - **A smaller-scale model with 0.8B parameters trained on 30B tokens**. - **A larger-scale model with 8B parameters trained on 5B tokens**. The training data was sampled from **Redpajama-V2**. <p align="center"> <img src="https://github.com/user-attachments/assets/e7db9487-728c-4a17-9806-c15afa12f62e" width="49%" /> <img src="https://github.com/user-attachments/assets/3efec895-b71d-43ab-b5ce-65468ba8b9f1" width="49%" /> </p> **Findings**: - **Smaller Models (0.8B parameters)**: Significant gains were observed when applying the LoCo method. - **Larger Models (8B parameters)**: The gains were present but less pronounced. This could be due to: 1. Relatively smaller data volume. 2. Lower pre-training loss for larger models, making significant improvements harder to achieve. However, even a smaller pre-training loss gap in larger models can translate to meaningful gains in downstream tasks. #### Example Script For reference, the [run.sh](https://github.com/user-attachments/files/17679552/zeroplus-7b3.zip) script used for the 8B parameter, 5B tokens experiment is attached. The experiment was conducted using the **DeepSpeed-Megatron** platform. #### Acknowledgments Special thanks to cc @GuanhuaWang for ongoing communication and guidance throughout this work. --- We appreciate your consideration of this PR and welcome any feedback or questions! --------- Co-authored-by: ChuanxinTang <[email protected]> Co-authored-by: root <[email protected]> Co-authored-by: Logan Adams <[email protected]> Co-authored-by: Logan Adams <[email protected]> Co-authored-by: Hongwei Chen <[email protected]>
microsoft · Dec 10, 2024 · 1b58ba5 · 1b58ba5
1 parent 06f1d36
commit 1b58ba5
Show file tree

Hide file tree

Showing 10 changed files with 851 additions and 5 deletions.
diff --git a/csrc/includes/quantization.h b/csrc/includes/quantization.h
@@ -52,6 +52,36 @@ void launch_swizzled_quant(int8_t* q_data,
                            int devices_per_node,
                            cudaStream_t stream);
 
+void launch_loco_swizzled_quant(int8_t* quantized_data,
+                                float* quantized_scales,
+                                const __half* uncompressed_data,
+                                __half* error_feedback,
+                                const float err_beta,
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int groups,
+                                int elems_per_group,
+                                int pipelining,
+                                int nodes,
+                                int devices_per_node,
+                                cudaStream_t stream);
+
+void launch_loco_dequant_reduce(int8_t* reduced_data,
+                                float* reduced_scales,
+                                const int8_t* input_data,
+                                const float* input_scales,
+                                int num_gpus,
+                                int num_bits,
+                                quantize::Type quant_type,
+                                int out_groups,
+                                int elems_per_out_group,
+                                int elems_per_in_tensor,
+                                int groups_per_in_tensor,
+                                int elems_per_in_group,
+                                __half2* error_feedback,
+                                const float err_beta,
+                                cudaStream_t stream);
+
 void launch_dequant_reduce(int8_t* reduced_data,
                            float* reduced_scales,
                            const int8_t* input_data,

diff --git a/csrc/includes/quantization_utils.h b/csrc/includes/quantization_utils.h
@@ -24,6 +24,7 @@ constexpr int max_threads = 1024;
 Class to hold the quantization parameters for a given tensor.
 Holds the implementation of the quantization operation.
 */
+
 template <Type qType, int numBits>
 class Params {
 public:

diff --git a/csrc/quantization/pt_binding.cpp b/csrc/quantization/pt_binding.cpp
@@ -176,6 +176,53 @@ at::Tensor dequantize_int8_to_half_experimental(at::Tensor& data_in,
     return output;
 }
 
+std::vector<at::Tensor> ds_loco_swizzle_quant(at::Tensor& input_vals,
+                                              at::Tensor& error_feedback,
+                                              float err_beta,
+                                              int groups,
+                                              int num_bits,
+                                              quantize::Type quant_type,
+                                              int pipeline_size,
+                                              int nodes,
+                                              int devices_per_node)
+{
+    auto scales_options = at::TensorOptions()
+                              .dtype(at::kFloat)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+    const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1;
+    auto scales = torch::empty({groups, scales_elems}, scales_options);
+
+    auto output_options = at::TensorOptions()
+                              .dtype(at::kChar)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    const int quantization_scalar = 8 / num_bits;
+    const int compressed_vals = at::numel(input_vals) / quantization_scalar;
+
+    auto output = torch::empty({compressed_vals}, output_options);
+    const int elems_per_group = at::numel(input_vals) / groups;
+
+    launch_loco_swizzled_quant(reinterpret_cast<int8_t*>(output.data_ptr()),
+                               reinterpret_cast<float*>(scales.data_ptr()),
+                               reinterpret_cast<const __half*>(input_vals.data_ptr()),
+                               reinterpret_cast<__half*>(error_feedback.data_ptr()),
+                               err_beta,
+                               num_bits,
+                               quant_type,
+                               groups,
+                               elems_per_group,
+                               pipeline_size,
+                               nodes,
+                               devices_per_node,
+                               at::cuda::getCurrentCUDAStream());
+
+    return {output, scales};
+}
+
 std::vector<at::Tensor> ds_swizzle_quant(at::Tensor& input_vals,
                                          int groups,
                                          int num_bits,
@@ -265,6 +312,61 @@ std::vector<at::Tensor> quantized_reduction(at::Tensor& input_vals,
     return {output, scales};
 }
 
+std::vector<at::Tensor> loco_quantized_reduction(at::Tensor& input_vals,
+                                                 at::Tensor& input_scales,
+                                                 at::Tensor& error_feedback,
+                                                 float err_beta,
+                                                 int in_groups,
+                                                 int out_groups,
+                                                 int num_bits,
+                                                 quantize::Type quant_type,
+                                                 int devices_per_node)
+{
+    auto scales_options = at::TensorOptions()
+                              .dtype(at::kFloat)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    const int scales_elems = (quantize::requires_offset(quant_type)) ? 2 : 1;
+
+    auto scales = torch::empty({out_groups, scales_elems}, scales_options);
+
+    auto output_options = at::TensorOptions()
+                              .dtype(at::kChar)
+                              .layout(at::kStrided)
+                              .device(at::kCUDA)
+                              .requires_grad(false);
+
+    std::vector<int64_t> sz(input_vals.sizes().begin(), input_vals.sizes().end());
+    sz[sz.size() - 1] = sz.back() / devices_per_node;
+
+    const int elems_per_in_tensor = at::numel(input_vals) / devices_per_node;
+
+    auto output = torch::empty(sz, output_options);
+
+    const int elems_per_in_group = elems_per_in_tensor / (in_groups / devices_per_node);
+    const int elems_per_out_group = elems_per_in_tensor / out_groups;
+
+    launch_loco_dequant_reduce((int8_t*)output.data_ptr(),
+                               (float*)scales.data_ptr(),
+                               (const int8_t*)input_vals.data_ptr(),
+                               (const float*)input_scales.data_ptr(),
+                               devices_per_node,
+                               num_bits,
+                               quant_type,
+                               out_groups,
+                               elems_per_out_group,
+                               elems_per_in_tensor,
+                               in_groups / devices_per_node,
+                               elems_per_in_group,
+                               (__half2*)error_feedback.data_ptr(),
+                               err_beta,
+                               at::cuda::getCurrentCUDAStream());
+
+    return {output, scales};
+}
+
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
 {
     m.def("ds_quantize_fp32", &ds_quantize<float>, "DeepSpeed Quantize with fp32 (CUDA)");
@@ -295,4 +397,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
           "Dequantize int8 to half (experimental)");
     m.def("swizzle_quant", &ds_swizzle_quant);
     m.def("quantized_reduction", &quantized_reduction);
+    m.def("loco_swizzle_quant", &ds_loco_swizzle_quant, "LoCo Swizzled Quantization Kernel");
+    m.def("loco_quantized_reduction",
+          &loco_quantized_reduction,
+          "LoCo Quantization and Reduction Kernel");
 }