diff --git a/experiments/demo_dgx2.sh b/experiments/demo_dgx2.sh
index a0a8aa1e..6e5d8fa4 100644
--- a/experiments/demo_dgx2.sh
+++ b/experiments/demo_dgx2.sh
@@ -7,8 +7,13 @@ echo activate virtual ENV
 PYTHON_ENV=${ROOT}/project/scripts/v2306.sh
 source $PYTHON_ENV
 
+
+# CUDA
+export CUDA_VISIBLE_DEVICES=0,1
+export CUDA_LAUNCH_BLOCKING="1"
+
 # number of GPUs; here we use all GPUs for demo
-WORLD_SIZE=3
+WORLD_SIZE=2
 
 # HF cache
 export TMPDIR="${ROOT}/project/.cache/"
@@ -32,9 +37,17 @@ export ACCELERATE_LOG_LEVEL=debug
 export ACCELERATE_DEBUG_MODE="1"
 export DEEPSPEED_TIMEOUT=120
 
+# accelerate launch
 accelerate launch \
     --config_file ${ROOT}/project/alignment_handbook/recipes/accelerate_configs/deepspeed_zero2.yaml \
     --num_processes $WORLD_SIZE \
     --tee 3 \
     ${ROOT}/project/alignment_handbook/scripts/run_sft.py \
     ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_qlora.yaml
+    # ${ROOT}/project/alignment_handbook/recipes/llama3-8b/sft/config_full.yaml
+
+
+# deepspeed launch
+
+
+# torch launch
\ No newline at end of file
diff --git a/experiments/demo_dgx2_launch.sh b/experiments/demo_dgx2_launch.sh
index 637d4bdd..f552b5b3 100644
--- a/experiments/demo_dgx2_launch.sh
+++ b/experiments/demo_dgx2_launch.sh
@@ -5,9 +5,6 @@ ROOT=$(realpath ~)
 
 # singularity container
 CONTAINER=${ROOT}/project/singularity_containers/py2402.sig
 
-# CUDA
-export CUDA_VISIBLE_DEVICES=0,1
-
 # PATH
 DEMO_PATH=${ROOT}/project/alignment_handbook/experiments
diff --git a/experiments/demo_magtrain_slurm.sh b/experiments/demo_magtrain_slurm.sh
new file mode 100644
index 00000000..e69de29b
diff --git a/recipes/accelerate_configs/deepspeed_zero2.yaml b/recipes/accelerate_configs/deepspeed_zero2.yaml
index d6c76abf..af509c7d 100644
--- a/recipes/accelerate_configs/deepspeed_zero2.yaml
+++ b/recipes/accelerate_configs/deepspeed_zero2.yaml
@@ -1,15 +1,15 @@
 compute_environment: LOCAL_MACHINE
 debug: true
 deepspeed_config:
-  deepspeed_config_file: /home/l069561/project/alignment-handbook/recipes/accelerate_configs/deepspeed_zs2.json
-  zero3_init_flag: true
-  # deepspeed_multinode_launcher: standard
-  # offload_optimizer_device: none
-  # offload_param_device: none
+  # deepspeed_config_file: '/home/l069561/project/alignment_handbook/recipes/accelerate_configs/ds_acc_conf.json'
   # zero3_init_flag: true
-  # zero3_save_16bit_model: false
-  # zero_stage: 2
-  # mixed_precision: bf16
+  deepspeed_multinode_launcher: standard
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: false
+  zero_stage: 2
+  mixed_precision: bf16
 distributed_type: DEEPSPEED
 downcast_bf16: 'no'
 machine_rank: 0
diff --git a/recipes/accelerate_configs/deepspeed_zs2.json b/recipes/accelerate_configs/deepspeed_zs2.json
index dfa80708..b3347327 100644
--- a/recipes/accelerate_configs/deepspeed_zs2.json
+++ b/recipes/accelerate_configs/deepspeed_zs2.json
@@ -41,7 +41,7 @@
         "reduce_bucket_size": "auto",
         "contiguous_gradients": true
     },
-    "gradient_accumulation_steps": 1,
+    "gradient_accumulation_steps": "auto",
     "gradient_clipping": "auto",
     "steps_per_print": 2000,
     "train_batch_size": "auto",
diff --git a/recipes/accelerate_configs/ds_acc_conf.json b/recipes/accelerate_configs/ds_acc_conf.json
new file mode 100644
index 00000000..cea626d6
--- /dev/null
+++ b/recipes/accelerate_configs/ds_acc_conf.json
@@ -0,0 +1,23 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 2,
+        "stage3_gather_16bit_weights_on_model_save": true,
+        "offload_optimizer": {
+            "device": "none"
+        },
+        "offload_param": {
+            "device": "none"
+        }
+    },
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "gradient_accumulation_steps": "auto",
+    "steps_per_print": 200000,
+    "fp16": {
+        "enabled": false
+    }
+}
\ No newline at end of file
diff --git a/recipes/accelerate_configs/readme.md b/recipes/accelerate_configs/readme.md
index 3134d397..083cc2d0 100644
--- a/recipes/accelerate_configs/readme.md
+++ b/recipes/accelerate_configs/readme.md
@@ -1,7 +1,6 @@
 ## deepspeed optimizers
 - DeepSpeed natively supports Adam, AdamW, OneBitAdam, Lamb, OneBitLamb, FusedLamb, FusedAdam
 - see for details on how to config https://deepspeed.readthedocs.io/en/latest/optimizers.html
--
 ```json
 {
     "optimizer": {
@@ -20,7 +19,7 @@
             "factor_min": 0.5,
             "factor_threshold": 0.1
         }
-    },
+    }
 }
 
 {
@@ -33,6 +32,25 @@
             "max_coeff": 0.3,
             "min_coeff": 0.01
         }
-    },
+    }
+}
+```
+
+- fp16 vs bf16
+```json
+{
+    "fp16": {
+        "enabled": false,
+        "loss_scale": 0,
+        "auto_cast": false,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "consecutive_hysteresis": false,
+        "min_loss_scale": 1
+    },
+    "bf16": {
+        "enabled": true
+    }
 }
 ```
\ No newline at end of file
diff --git a/recipes/llama3-8b/sft/config_full.yaml b/recipes/llama3-8b/sft/config_full.yaml
index 1cafb813..80dbcdbe 100644
--- a/recipes/llama3-8b/sft/config_full.yaml
+++ b/recipes/llama3-8b/sft/config_full.yaml
@@ -11,7 +11,7 @@ dataset_mixer:
 dataset_splits:
 - train_sft
 - test_sft
-preprocessing_num_workers: 8
+preprocessing_num_workers: 32
 
 # SFT trainer config
 bf16: true
@@ -27,6 +27,7 @@ logging_steps: 5
 logging_strategy: steps
 learning_rate: 2.0e-05
 optim: galore_adamw # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+optim_target_modules: all-linear
 weight_decay: 0.01
 lr_scheduler_type: cosine
 max_seq_length: 4096
diff --git a/recipes/llama3-8b/sft/config_qlora.yaml b/recipes/llama3-8b/sft/config_qlora.yaml
index d2e19745..c1465d93 100644
--- a/recipes/llama3-8b/sft/config_qlora.yaml
+++ b/recipes/llama3-8b/sft/config_qlora.yaml
@@ -5,8 +5,9 @@ torch_dtype: bfloat16
 use_flash_attention_2: true
 
 # LoRA arguments
-load_in_4bit: true
+use_unsloth: false # unsloth not support deepspeed yet
 use_peft: true
+load_in_4bit: true
 lora_r: 32
 lora_alpha: 32
 lora_dropout: 0.05
@@ -26,14 +27,13 @@ dataset_mixer:
 dataset_splits:
 - train_sft
 - test_sft
-preprocessing_num_workers: 16
+preprocessing_num_workers: 32
 auto_insert_empty_system_msg: true
 
 # SFT trainer config
 bf16: true
 do_eval: true
 evaluation_strategy: epoch
-gradient_accumulation_steps: 16
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
   use_reentrant: false
@@ -41,18 +41,22 @@ learning_rate: 1.0e-04
 log_level: info
 logging_steps: 5
 logging_strategy: steps
+optim: adamw_torch # adamw_torch paged_adamw_32bit galore_adamw lion_32bit
+# optim_target_modules: all-linear
+weight_decay: 0.01
 lr_scheduler_type: cosine
-max_seq_length: 4096
+max_seq_length: 2048
 max_steps: -1
 num_train_epochs: 1
 output_dir: /home/l069561/project/models/fine-tuned/demo-llama-3-8b-lora-ultrachat
 overwrite_output_dir: true
-per_device_eval_batch_size: 8
+per_device_eval_batch_size: 2
+gradient_accumulation_steps: 32
 per_device_train_batch_size: 4
 push_to_hub: false
 report_to:
 - tensorboard
-# - wandb
+- wandb
 save_strategy: "steps"
 save_steps: 100
 save_total_limit: 1
diff --git a/requirements.txt b/requirements.txt
index f66031f2..a83745a3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -19,4 +19,7 @@ flash-attn>=2.1.0
 pynvml>=11.4.0
 
 # optional
-galore-torch
\ No newline at end of file
+galore-torch
+
+# unsloth
+# with NV pytorch container install -> pip install git+https://github.com/unslothai/unsloth.git --no-deps
\ No newline at end of file
diff --git a/scripts/run_sft.py b/scripts/run_sft.py
index d7bf4b4d..3a7879b1 100644
--- a/scripts/run_sft.py
+++ b/scripts/run_sft.py
@@ -141,6 +141,7 @@ def main():
     if (
         "<|im_start|>" in tokenizer.chat_template
         and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
+        and not model_args.use_unsloth
    ):
         model = AutoModelForCausalLM.from_pretrained(
             model_args.model_name_or_path, **model_kwargs
         )
@@ -151,6 +152,7 @@
     #####################
     # Apply chat template
     #####################
+    logger.info("*** apply chat template ***")
     raw_datasets = raw_datasets.map(
         apply_chat_template,
         fn_kwargs={
@@ -192,15 +194,23 @@
     ########################
     # Initialize the Trainer
     ########################
+
     if model_args.use_unsloth:
+        logger.info("*** use unsloth ***")
         from alignment.unsloth import get_unsloth_peft_model
 
         peft_config = get_peft_config(model_args)
-        model = AutoModelForCausalLM.from_pretrained(
-            model_args.model_name_or_path, **model_kwargs
+        model, tokenizer = get_unsloth_peft_model(
+            model_args.model_name_or_path,
+            training_args.max_seq_length,
+            peft_config.to_dict(),
         )
-        model, tokenizer = setup_chat_format(model, tokenizer)
-        model = get_unsloth_peft_model(model, training_args.max_seq_length, peft_config)
+
+        if (
+            "<|im_start|>" in tokenizer.chat_template
+            and "gemma-tokenizer-chatml" not in tokenizer.name_or_path
+        ):
+            model, tokenizer = setup_chat_format(model, tokenizer)
 
     trainer = SFTTrainer(
@@ -208,6 +218,7 @@
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         dataset_text_field="text",
+        dataset_num_proc=data_args.preprocessing_num_workers,
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
         packing=True,
@@ -222,6 +233,7 @@
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         dataset_text_field="text",
+        dataset_num_proc=data_args.preprocessing_num_workers,
         max_seq_length=training_args.max_seq_length,
         tokenizer=tokenizer,
         packing=True,
diff --git a/src/alignment/unsloth.py b/src/alignment/unsloth.py
index 5d1f7369..44bc42e9 100644
--- a/src/alignment/unsloth.py
+++ b/src/alignment/unsloth.py
@@ -66,18 +66,32 @@ def load_unsloth_pretrained_model(config, model_args):
     return model
 
 
-def get_unsloth_peft_model(model, max_seq_length, peft_kwargs: Dict[str, Any]):
+def get_unsloth_peft_model(model_name, max_seq_length, peft_kwargs: Dict[str, Any]):
     r"""
     Gets the peft model for the pretrained model with unsloth. Used in training.
     """
     from unsloth import FastLanguageModel
 
+    model, tokenizer = FastLanguageModel.from_pretrained(
+        model_name=model_name,
+        max_seq_length=max_seq_length,
+        dtype=None,
+        load_in_4bit=True,
+    )
+
     unsloth_peft_kwargs = {
         "model": model,
         "max_seq_length": max_seq_length,
         "use_gradient_checkpointing": "unsloth",
     }
-    return FastLanguageModel.get_peft_model(**peft_kwargs, **unsloth_peft_kwargs)
+
+    peft_kwargs["lora_dropout"] = 0.0
+    peft_kwargs.pop("task_type", None)
+
+    return (
+        FastLanguageModel.get_peft_model(**unsloth_peft_kwargs, **peft_kwargs),
+        tokenizer,
+    )
 
 
 def load_unsloth_peft_model(config, model_args, is_trainable: bool):