diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py
index 661729ff8a..fce694f160 100644
--- a/llmfoundry/data/finetuning/dataloader.py
+++ b/llmfoundry/data/finetuning/dataloader.py
@@ -336,6 +336,7 @@ def build_finetuning_dataloader(
             replication_factor if replication_factor > 1 else None,
             rank=dist.get_global_rank() // replication_factor
             if replication_factor > 1 else None,
+            seed=dataset_cfg.get('shuffle_seed', 0),
         )
 
         assert streaming_dataset is not None  # for pyright
diff --git a/llmfoundry/data/packing.py b/llmfoundry/data/packing.py
index 5eacced549..6fed96a13d 100644
--- a/llmfoundry/data/packing.py
+++ b/llmfoundry/data/packing.py
@@ -474,7 +474,9 @@ def profile_packing(
 
     # If streaming dataset, use a temporary local folder for profiling
    local_rank_zero = dist.get_global_rank() - dist.get_local_rank()
-    if dataset_cfg.get('remote') is not None:
+    if dataset_cfg.get(
+        'remote',
+    ) is not None and dataset_cfg.get('local') is None:
         tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
         gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
         tmp_path = gathered_paths[local_rank_zero]
@@ -485,7 +487,8 @@ def profile_packing(
             tmp_path_to_broadcast = tempfile.TemporaryDirectory().name
             gathered_paths = dist.all_gather_object(tmp_path_to_broadcast)
             tmp_path = gathered_paths[local_rank_zero]
-            stream_config['local'] = tmp_path
+            if stream_config.get('local') is None:
+                stream_config['local'] = tmp_path
 
     # Determine the packing_ratio values we'll try
     packing_ratios, raw_batch_sizes = [], []
diff --git a/llmfoundry/utils/config_utils.py b/llmfoundry/utils/config_utils.py
index 997273de7f..252841cb50 100644
--- a/llmfoundry/utils/config_utils.py
+++ b/llmfoundry/utils/config_utils.py
@@ -8,6 +8,7 @@
 import os
 import warnings
 from dataclasses import dataclass, fields
+from pathlib import Path
 from typing import (
     Any,
     Callable,
@@ -703,6 +704,8 @@ def _process_data_source(
         true_split (str): The split of the dataset to be added (i.e. train or eval)
         data_paths (List[Tuple[str, str, str]]): A list of tuples formatted as (data type, path, split)
     """
+    if source_dataset_path:
+        source_dataset_path = str(Path(source_dataset_path))
     # Check for Delta table
     if source_dataset_path and len(source_dataset_path.split('.')) == 3:
         data_paths.append(('delta_table', source_dataset_path, true_split))
@@ -788,7 +791,6 @@ def log_dataset_uri(cfg: dict[str, Any]) -> None:
     # Map data source types to their respective MLFlow DataSource.
     for dataset_type, path, split in data_paths:
-
         if dataset_type in dataset_source_mapping:
             source_class = dataset_source_mapping[dataset_type]
             if dataset_type == 'delta_table':
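Aside (illustrative, not part of the diff): a minimal sketch of what the new `str(Path(...))` normalization in `_process_data_source` buys — a trailing slash or redundant separator is stripped before the Delta-table and file checks run, so the recorded path is canonical. The paths below are hypothetical examples.

```python
from pathlib import Path

# Illustrative only: mirrors the normalization added in _process_data_source.
for raw in ('catalog.schema.table/', '/data/finetune/train/', 'catalog.schema.table'):
    norm = str(Path(raw))
    is_delta = len(norm.split('.')) == 3  # the Delta-table check shown in the diff
    print(f'{raw!r} -> {norm!r} (delta_table={is_delta})')
```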
diff --git a/mcli/mcli-1b-eval.yaml b/mcli/mcli-1b-eval.yaml
index 35a1165ae0..b4f7d7ae42 100644
--- a/mcli/mcli-1b-eval.yaml
+++ b/mcli/mcli-1b-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-1b-max-seq-len-8k.yaml b/mcli/mcli-1b-max-seq-len-8k.yaml
index fb83165b75..dc0c80f488 100644
--- a/mcli/mcli-1b-max-seq-len-8k.yaml
+++ b/mcli/mcli-1b-max-seq-len-8k.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-1b.yaml b/mcli/mcli-1b.yaml
index f88d6cbac2..37df667ce0 100644
--- a/mcli/mcli-1b.yaml
+++ b/mcli/mcli-1b.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-benchmark-mpt.yaml b/mcli/mcli-benchmark-mpt.yaml
index 916337cb7b..27c96c466f 100644
--- a/mcli/mcli-benchmark-mpt.yaml
+++ b/mcli/mcli-benchmark-mpt.yaml
@@ -11,7 +11,7 @@ image: mosaicml/llm-foundry:2.5.1_cu124-latest
 
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
diff --git a/mcli/mcli-convert-composer-to-hf.yaml b/mcli/mcli-convert-composer-to-hf.yaml
index ab02024280..90029c8d56 100644
--- a/mcli/mcli-convert-composer-to-hf.yaml
+++ b/mcli/mcli-convert-composer-to-hf.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-hf-eval.yaml b/mcli/mcli-hf-eval.yaml
index ab72e99f97..146848555f 100644
--- a/mcli/mcli-hf-eval.yaml
+++ b/mcli/mcli-hf-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-hf-generate.yaml b/mcli/mcli-hf-generate.yaml
index b885568f66..1dbf6afdd6 100644
--- a/mcli/mcli-hf-generate.yaml
+++ b/mcli/mcli-hf-generate.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-llama2-finetune.yaml b/mcli/mcli-llama2-finetune.yaml
index 8f3b6bac4e..1b52826e6f 100644
--- a/mcli/mcli-llama2-finetune.yaml
+++ b/mcli/mcli-llama2-finetune.yaml
@@ -1,13 +1,14 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu]
   ssh_clone: false # Should be true if using a private repo
 
 command: |
   cd llm-foundry/scripts
+  export HF_HUB_ENABLE_HF_TRANSFER=1
   composer train/train.py /mnt/config/parameters.yaml
 image: mosaicml/llm-foundry:2.5.1_cu124-latest
 name: llama2-finetune
@@ -21,9 +22,12 @@ compute:
 
 # The below is injected as a YAML file: /mnt/config/parameters.yaml
 parameters:
-  tokenizer_name: meta-llama/Llama-2-7b-hf
-  max_seq_len: 4096
-  global_seed: 17
+  variables:
+    tokenizer_name: meta-llama/Llama-2-7b-hf
+    global_seed: 17
+    max_seq_len: 4096
+
+  max_seq_len: ${variables.max_seq_len}
 
   # Run Name
   run_name:  # If left blank, will be read from env var $RUN_NAME
@@ -42,9 +46,9 @@ parameters:
 
   # Tokenizer
   tokenizer:
-    name: ${tokenizer_name}
+    name: ${variables.tokenizer_name}
     kwargs:
-      model_max_length: ${max_seq_len}
+      model_max_length: ${variables.max_seq_len}
 
   # Dataloaders
   train_loader:
@@ -52,7 +56,7 @@ parameters:
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: train
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       shuffle: true
@@ -75,7 +79,7 @@ parameters:
     dataset:
       hf_name: mosaicml/dolly_hhrlhf
       split: test
-      max_seq_len: ${max_seq_len}
+      max_seq_len: ${variables.max_seq_len}
       allow_pad_trimming: false
       decoder_only_format: true
       # packing_ratio:
@@ -114,7 +118,7 @@ parameters:
   global_train_batch_size: 64
 
   # System
-  seed: ${global_seed}
+  seed: ${variables.global_seed}
   device_eval_batch_size: 8
   device_train_microbatch_size: auto
   precision: amp_bf16
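Aside (illustrative, not part of the diff): the `variables:` block introduced above relies on OmegaConf-style `${variables.*}` interpolation, which llm-foundry uses when resolving these YAML parameters. A minimal sketch of how the indirection resolves:

```python
from omegaconf import OmegaConf

# Hedged sketch of the variables indirection added to mcli-llama2-finetune.yaml.
cfg = OmegaConf.create("""
variables:
  tokenizer_name: meta-llama/Llama-2-7b-hf
  global_seed: 17
  max_seq_len: 4096
max_seq_len: ${variables.max_seq_len}
tokenizer:
  name: ${variables.tokenizer_name}
  kwargs:
    model_max_length: ${variables.max_seq_len}
seed: ${variables.global_seed}
""")

resolved = OmegaConf.to_container(cfg, resolve=True)
print(resolved['tokenizer'])  # {'name': 'meta-llama/Llama-2-7b-hf', 'kwargs': {'model_max_length': 4096}}
print(resolved['seed'])       # 17
```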
diff --git a/mcli/mcli-llama3-70b-instruct-finetune.yaml b/mcli/mcli-llama3-70b-instruct-finetune.yaml
new file mode 100644
index 0000000000..c84f20473b
--- /dev/null
+++ b/mcli/mcli-llama3-70b-instruct-finetune.yaml
@@ -0,0 +1,160 @@
+integrations:
+- integration_type: git_repo
+  git_repo: mosaicml/llm-foundry
+  git_branch: v0.15.1
+  # git_commit: # OR use your commit hash
+  pip_install: .[gpu]
+  ssh_clone: false # Should be true if using a private repo
+
+command: |
+  cd llm-foundry/scripts
+  export HF_HUB_ENABLE_HF_TRANSFER=1
+  composer train/train.py /mnt/config/parameters.yaml
+image: mosaicml/llm-foundry:2.5.1_cu124-latest
+name: llama3.1-70b-finetune
+
+compute:
+  # Note: Finetuning the 70b model requires at least 16x80GB GPUs
+  gpus: 16  # Number of GPUs to use
+  ## These configurations are optional
+  # cluster: TODO # Name of the cluster to use for this run
+  # gpu_type: h100_80gb # Type of GPU to use. We use h100_80gb in our experiments
+
+# The below is injected as a YAML file: /mnt/config/parameters.yaml
+parameters:
+  variables:
+    tokenizer_name: meta-llama/Llama-3.1-70B-Instruct
+    global_seed: 17
+    max_seq_len: 4096
+
+  max_seq_len: ${variables.max_seq_len}
+  # Run Name
+  run_name:  # If left blank, will be read from env var $RUN_NAME
+
+  max_split_size_mb: 512
+  dist_timeout: 3600 # set to avoid NCCL timeouts
+
+  # Model
+  model:
+    name: hf_causal_lm
+    init_device: mixed
+    pretrained_model_name_or_path: meta-llama/Llama-3.1-70B-Instruct
+    pretrained: true
+    # Note: you must have set the HF_TOKEN environment variable and have access to the llama3 models
+    use_auth_token: true
+    use_flash_attention_2: true
+
+  # Tokenizer
+  tokenizer:
+    name: ${variables.tokenizer_name}
+    kwargs:
+      model_max_length: ${variables.max_seq_len}
+  # Dataloaders
+  train_loader:
+    name: finetuning
+    dataset:
+      hf_name: mosaicml/dolly_hhrlhf
+      split: train
+      max_seq_len: ${variables.max_seq_len}
+      allow_pad_trimming: false
+      decoder_only_format: true
+      shuffle: true
+      # # Use packing_ratio: 'auto' to automatically profile and select the highest observed packing ratio with
+      # # zero waste. In practice, this may result in > 0 waste because profiling is done on only a portion
+      # # of the dataset.
+      # # Or use `python llmfoundry/scripts/misc/profile_packing.py --yaml-path /path/to/this/yaml/ ...`
+      # # to profile this run's optimal packing_ratio as it depends on GPU count,
+      # # batch size, sequence length
+      # packing_ratio: auto
+    drop_last: true
+    num_workers: 8
+    pin_memory: false
+    prefetch_factor: 2
+    persistent_workers: true
+    timeout: 0
+
+  eval_loader:
+    name: finetuning
+    dataset:
+      hf_name: mosaicml/dolly_hhrlhf
+      split: test
+      max_seq_len: ${variables.max_seq_len}
+      allow_pad_trimming: false
+      decoder_only_format: true
+      # packing_ratio:
+      shuffle: false
+    drop_last: true
+    num_workers: 8
+    pin_memory: false
+    prefetch_factor: 2
+    persistent_workers: true
+    timeout: 0
+
+  # Optimization
+  scheduler:
+    name: cosine_with_warmup
+    t_warmup: 100ba
+    alpha_f: 0.1
+
+  # Note: You may want to change learning rate, betas, weight decay
+  optimizer:
+    name: decoupled_lionw
+    lr: 5.0e-7
+    betas:
+    - 0.9
+    - 0.95
+    weight_decay: 0.0
+
+  algorithms:
+    gradient_clipping:
+      clipping_type: norm
+      clipping_threshold: 1.0
+
+  max_duration: 1ep
+  eval_first: false
+  eval_interval: 1ep
+  eval_subset_num_batches: -1
+  global_train_batch_size: 16
+
+  # System
+  seed: ${variables.global_seed}
+  device_eval_batch_size: 1
+  device_train_microbatch_size: 1
+  precision: amp_bf16
+
+  # FSDP
+  fsdp_config:
+    state_dict_type: sharded # Note: we enable sharded checkpointing to avoid GPU OOM
+    sharding_strategy: FULL_SHARD
+    mixed_precision: PURE
+    activation_checkpointing: true
+    activation_checkpointing_reentrant: false
+    activation_cpu_offload: false
+    limit_all_gathers: true
+
+  # Logging
+  progress_bar: false
+  log_to_console: true
+  console_log_interval: 1ba
+
+  callbacks:
+    speed_monitor:
+      window_size: 10
+    lr_monitor: {}
+    memory_monitor: {}
+    runtime_estimator: {}
+
+  load_weights_only: true # Only load the weights, not the optimizer state, LR schedule, etc
+
+# loggers:
+#   wandb: {}
+
+# Checkpoint to local filesystem or remote object store
+# save_interval: 2000ba
+# save_num_checkpoints_to_keep: 1 # Important, this cleans up checkpoints saved to DISK
+# save_folder: ./{run_name}/checkpoints
+# save_folder: s3://my-bucket/my-folder/{run_name}/checkpoints
+
+# Load from local filesystem or remote object store
+# load_path: ./gpt-1b/checkpoints/latest-rank{rank}.pt
+# load_path: s3://my-bucket/my-folder/gpt-1b/checkpoints/latest-rank{rank}.pt
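Aside (illustrative): the compute and batch settings in the new 70B YAML are internally consistent — 16 GPUs with a global train batch size of 16 and a device microbatch size of 1 means one sample per GPU per step and no gradient accumulation. A quick arithmetic check, assuming the usual global = per-device × GPU-count relationship:

```python
# Hedged arithmetic check for the values in mcli-llama3-70b-instruct-finetune.yaml.
gpus = 16
global_train_batch_size = 16
device_train_microbatch_size = 1

assert global_train_batch_size % gpus == 0
per_device_batch = global_train_batch_size // gpus              # 1 sample per GPU per step
grad_accum = per_device_batch // device_train_microbatch_size   # 1 -> no gradient accumulation
print(per_device_batch, grad_accum)
```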
diff --git a/mcli/mcli-openai-eval.yaml b/mcli/mcli-openai-eval.yaml
index 563eb6b8c2..818afff545 100644
--- a/mcli/mcli-openai-eval.yaml
+++ b/mcli/mcli-openai-eval.yaml
@@ -1,7 +1,7 @@
 integrations:
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .[gpu,openai]
   ssh_clone: false # Should be true if using a private repo
diff --git a/mcli/mcli-pretokenize-oci-upload.yaml b/mcli/mcli-pretokenize-oci-upload.yaml
index f344531049..7991bf5e13 100644
--- a/mcli/mcli-pretokenize-oci-upload.yaml
+++ b/mcli/mcli-pretokenize-oci-upload.yaml
@@ -14,7 +14,7 @@ integrations:
   - oci-cli==3.23.2
 - integration_type: git_repo
   git_repo: mosaicml/llm-foundry
-  git_branch: v0.15.0
+  git_branch: v0.15.1
   # git_commit: # OR use your commit hash
   pip_install: .
   ssh_clone: false # Should be true if using a private repo
diff --git a/setup.py b/setup.py
index 6aa1553b5d..311d3da736 100644
--- a/setup.py
+++ b/setup.py
@@ -52,13 +52,13 @@
 ]
 
 install_requires = [
-    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.27.0,<0.28',
+    'mosaicml[libcloud,wandb,oci,gcs,mlflow]>=0.28.0,<0.29',
     'mlflow>=2.14.1,<2.19',
     'accelerate>=0.25,<1.2',  # for HF inference `device_map`
     'transformers>=4.43.2,<4.47',
-    'mosaicml-streaming>=0.9.0,<0.10',
+    'mosaicml-streaming>=0.10.0,<0.11',
     'torch>=2.5.1,<2.5.2',
-    'datasets>=2.20.0,<2.21',
+    'datasets>=2.20.0,<3.2',
     'fsspec==2023.6.0',  # newer version results in a bug in datasets that duplicates data
     'sentencepiece==0.2.0',
     'einops==0.8.0',
@@ -91,7 +91,7 @@
 ]
 
 extra_deps['databricks'] = [
-    'mosaicml[databricks]>=0.27.0,<0.28',
+    'mosaicml[databricks]>=0.28.0,<0.29',
     'numpy<2',
     'databricks-sql-connector>=3,<4',
     'databricks-connect==14.1.0',
@@ -99,7 +99,7 @@
 ]
 
 extra_deps['tensorboard'] = [
-    'mosaicml[tensorboard]>=0.27.0,<0.28',
+    'mosaicml[tensorboard]>=0.28.0,<0.29',
 ]
 
 # Flash 2 group kept for backwards compatibility
@@ -110,11 +110,11 @@
 extra_deps['gpu'] = copy.deepcopy(extra_deps['gpu-flash2'])
 
 extra_deps['peft'] = [
-    'mosaicml[peft]>=0.27.0,<0.28',
+    'mosaicml[peft]>=0.28.0,<0.29',
 ]
 
 extra_deps['openai'] = [
-    'openai==1.3.8',
+    'openai>=1.56.0,<2.0',
     'tiktoken>=0.4,<0.8.1',
 ]
diff --git a/tests/data/test_packing.py b/tests/data/test_packing.py
index 48713f8a19..8402694672 100644
--- a/tests/data/test_packing.py
+++ b/tests/data/test_packing.py
@@ -2,7 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from pathlib import Path
-from typing import Any
+from typing import Any, Callable
 from unittest.mock import Mock, patch
 
 import pytest
@@ -161,27 +161,73 @@ def test_dist_auto_packing(profile_packing: Mock):
     assert packing_ratio == 2
 
 
+def get_remote_config(
+    base_cfg: dict,
+    remote_dir: str,
+    local_dir: str,
+) -> DictConfig:
+    return DictConfig({
+        **base_cfg,
+        'dataset': {
+            **base_cfg['dataset'],
+            'remote': remote_dir,
+            'local': local_dir,
+        },
+    })
+
+
+def get_streams_config(
+    base_cfg: dict,
+    remote_dir: str,
+    local_dir: str,
+) -> DictConfig:
+    return DictConfig({
+        **base_cfg,
+        'dataset': {
+            **base_cfg['dataset'],
+            'streams': {
+                'stream_with_remote': {
+                    'remote': remote_dir,
+                    'local': local_dir,
+                },
+                'stream_without_remote': {
+                    'local': remote_dir,
+                },
+            },
+        },
+    })
+
+
 def patched_packing_ratio(*args: Any, **kwargs: Any):
     from llmfoundry.data.packing import auto_packing_ratio
 
     return auto_packing_ratio(*args, **kwargs, num_packing_ratios=4)
 
 
+@pytest.mark.parametrize(
+    'get_config',
+    [
+        get_remote_config,
+        get_streams_config,
+    ],
+)
 @patch(
     'llmfoundry.data.finetuning.dataloader.auto_packing_ratio',
     patched_packing_ratio,
 )
-def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
+def test_auto_packing_with_streaming_dataloader(
+    get_config: Callable[[dict, str, str], DictConfig],
+    tmp_path: Path,
+):
     columns = {'prompt': 'str', 'response': 'str'}
     tokenizer = build_tokenizer('gpt2', {})
     remote_dir = str(tmp_path / 'remote')
     local_dir = str(tmp_path / 'local')
     with MDSWriter(out=remote_dir, columns=columns, compression=None) as out:
         out.write({'prompt': 'HELLO', 'response': 'WORLD'})
-    cfg = DictConfig({
+
+    base_cfg = {
         'dataset': {
-            'remote': remote_dir,
-            'local': local_dir,
             'packing_ratio': 'auto',
             'max_seq_len': 200,
             'decoder_only_format': True,
@@ -194,7 +240,9 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
         'prefetch_factor': None,
         'persistent_workers': False,
         'timeout': 0,
-    })
+    }
+
+    cfg = get_config(base_cfg, remote_dir, local_dir)
 
     loader = build_finetuning_dataloader(
         **cfg,
@@ -214,7 +262,10 @@ def test_auto_packing_with_streaming_dataloader(tmp_path: Path):
     assert isinstance(loader.batch_size, int)
     assert loader.dataset.packing_ratio == int(loader.batch_size / 6)
 
-    state_dict = loader.dataset.state_dict(num_samples=2, from_beginning=False)
+    state_dict = loader.dataset.state_dict(
+        num_samples=2,
+        from_beginning=False,
+    )
     assert state_dict['sample_in_epoch'] == 2 * loader.dataset.packing_ratio
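Aside (illustrative): the `stream_without_remote` parametrization above exercises the guarded assignment added to `profile_packing` — a stream that already declares a `local` directory keeps it, and only streams without one receive a temporary profiling directory. A self-contained sketch of that behavior (stream names and paths here are hypothetical):

```python
import tempfile

# Hedged sketch mirroring the guarded `stream_config['local']` assignment in packing.py.
streams = {
    'stream_with_local': {'remote': 's3://bucket/data', 'local': '/tmp/stream_local'},
    'stream_remote_only': {'remote': 's3://bucket/other'},  # hypothetical stream with no local dir
}
for stream_config in streams.values():
    if stream_config.get('local') is None:
        stream_config['local'] = tempfile.TemporaryDirectory().name

print(streams['stream_with_local']['local'])   # '/tmp/stream_local' (unchanged)
print(streams['stream_remote_only']['local'])  # freshly generated temp path
```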