diff --git a/llmfoundry/command_utils/train.py b/llmfoundry/command_utils/train.py
index 5cca197415..8e05bb390b 100644
--- a/llmfoundry/command_utils/train.py
+++ b/llmfoundry/command_utils/train.py
@@ -69,6 +69,8 @@
 ic.configureOutput(includeContext=True)
 install()
 
+ic.disable()
+
 def validate_config(train_config: TrainConfig):
     """Validates compatible model and dataloader selection."""
     # Validate the rest of the config
diff --git a/llmfoundry/models/layers/blocks.py b/llmfoundry/models/layers/blocks.py
index d191b1e277..cf07e453fa 100644
--- a/llmfoundry/models/layers/blocks.py
+++ b/llmfoundry/models/layers/blocks.py
@@ -206,7 +206,7 @@ def forward(
         m = self.norm_2(x)
 
         n = self.apply_ffn(attention_mask, m)
-        ic(x.shape, x.device, n.shape, n.device)
+        ic(x.shape, x.device, m.shape, m.device, n.shape, n.device)
         # In the following line we move the `x` tensor to the same devices as the output of ffn layer. This operation should be a no-op during training.
         # This is done to fix pipeline parallel generation using hf.generate. Please see this comment for details: https://github.com/mosaicml/llm-foundry/pull/1332#issue-2386827204
         x = x.to(device=n.device,
diff --git a/llmfoundry/models/utils/tp_strategy.py b/llmfoundry/models/utils/tp_strategy.py
index c6af5d0a37..2e2b253c87 100644
--- a/llmfoundry/models/utils/tp_strategy.py
+++ b/llmfoundry/models/utils/tp_strategy.py
@@ -26,7 +26,8 @@ def ffn_tp_strategy(model: ComposerModel) -> dict[str, ParallelStyle]:
         elif name.split('.')[-2:] == ['ffn', 'down_proj']:
             layer_plan[name] = RowwiseParallel(
                 input_layouts = Shard(-1),
-                output_layouts = Replicate(),
+                # output_layouts = Replicate(),
+                output_layouts = Shard(0),
             )
         elif name.split('.')[-1] == 'ffn':
             layer_plan[name] = PrepareModuleInput(