diff --git a/device/cluster.cpp b/device/cluster.cpp index 33c89668..fcd9c609 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1415,6 +1415,14 @@ void Cluster::init_pcie_iatus() { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); + // TODO: with the IOMMU case, I think we can get away with using just + // one iATU region for WH. (On BH, we don't need iATU). We can only + // cover slightly less than 4GB with WH, and the iATU can cover 4GB. + // Splitting it into multiple regions is fine, but it's not necessary. + // + // ... something to consider when this code is refactored into PCIDevice + // where it belongs. + // Device to Host (multiple channels) for (int channel_id = 0; channel_id < src_pci_device->get_num_host_mem_channels(); channel_id++) { hugepage_mapping hugepage_map = src_pci_device->get_hugepage_mapping(channel_id); diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index bc2fd5f6..0c62d7f3 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -88,6 +88,15 @@ T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name } } +static bool detect_iommu(const PciDeviceInfo &device_info) { + try { + auto iommu_type = read_sysfs<std::string>(device_info, "iommu_group/type"); + return iommu_type.substr(0, 3) == "DMA"; // DMA or DMA-FQ + } catch (...) 
{ + return false; + } +} + static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -258,6 +267,8 @@ tt::ARCH PciDeviceInfo::get_arch() const { return infos; } +static const semver_t kmd_ver_for_iommu = semver_t(1, 29, 0); + PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), pci_device_num(pci_device_number), @@ -267,9 +278,19 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : numa_node(read_sysfs(info, "numa_node", -1)), // default to -1 if not found revision(read_sysfs(info, "revision")), arch(detect_arch(info.device_id, revision)), - architecture_implementation(tt::umd::architecture_implementation::create(arch)), - kmd_version(read_kmd_version()) { - log_info(LogSiliconDriver, "Opened PCI device {}; KMD version: {}", pci_device_num, kmd_version.to_string()); + kmd_version(read_kmd_version()), + behind_iommu(detect_iommu(info)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { + if (behind_iommu && kmd_version < kmd_ver_for_iommu) { + TT_THROW("Running with IOMMU support requires KMD version {} or newer", kmd_ver_for_iommu.to_string()); + } + + log_info( + LogSiliconDriver, + "Opened PCI device {}; KMD version: {}, IOMMU: {}", + pci_device_num, + kmd_version.to_string(), + behind_iommu ? 
"enabled" : "disabled"); struct { tenstorrent_query_mappings query_mappings; @@ -688,6 +709,11 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { const size_t hugepage_size = HUGEPAGE_REGION_SIZE; + if (is_behind_iommu()) { + size_t size = hugepage_size * num_host_mem_channels; + return init_nohugepage(size); + } + // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization) auto physical_device_id = get_device_num(); @@ -802,6 +828,37 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { return success; } +bool PCIDevice::init_nohugepage(size_t size) { + const size_t num_fake_mem_channels = size / HUGEPAGE_REGION_SIZE; + + if (!is_behind_iommu()) { + TT_THROW("IOMMU is required for sysmem without hugepages."); + } + + log_info(LogSiliconDriver, "Allocating sysmem without hugepages (size: {:#x}).", size); + void *mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0); + + if (mapping == MAP_FAILED) { + TT_THROW( + "UMD: Failed to allocate memory for device/host shared buffer (size: {} errno: {}).", + size, + strerror(errno)); + } + + uint64_t iova = map_for_dma(mapping, size); + log_info(LogSiliconDriver, "Mapped sysmem without hugepages to IOVA {:#x}.", iova); + + hugepage_mapping_per_channel.resize(num_fake_mem_channels); + + // Support for more than 1GB host memory accessible per device, via channels. 
+ for (size_t ch = 0; ch < num_fake_mem_channels; ch++) { + uint8_t *base = static_cast<uint8_t *>(mapping) + ch * HUGEPAGE_REGION_SIZE; + hugepage_mapping_per_channel[ch] = {base, HUGEPAGE_REGION_SIZE, iova + ch * HUGEPAGE_REGION_SIZE}; + } + + return true; +} + int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { @@ -812,6 +869,55 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { } } +uint64_t PCIDevice::map_for_dma(void *buffer, size_t size) { + static const auto page_size = sysconf(_SC_PAGESIZE); + + const uint64_t vaddr = reinterpret_cast<uint64_t>(buffer); + const uint32_t flags = is_behind_iommu() ? 0 : TENSTORRENT_PIN_PAGES_CONTIGUOUS; + + if (vaddr % page_size != 0 || size % page_size != 0) { + TT_THROW("Buffer must be page-aligned with a size that is a multiple of the page size"); + } + + tenstorrent_pin_pages pin_pages{}; + pin_pages.in.output_size_bytes = sizeof(pin_pages.out); + pin_pages.in.flags = flags; + pin_pages.in.virtual_address = vaddr; + pin_pages.in.size = size; + + // With IOMMU, this will probably fail on you if you're mapping something + // large. The situation today is that the kernel driver uses a 32-bit DMA + // address mask, so all DMA allocations and mappings show up in the IOVA + // range of 0x0 to 0xffff'ffff. According to syseng, we can get up to 3GB + // on Intel, 3.75GB on AMD, but this requires multiple mappings with small + // chunks, down to 2MB. It's possible to make such non-contiguous mappings + // appear both virtually contiguous (to the application) and physically + // contiguous (to the NOC, using iATU), but it's not clear that this is + // worth the effort... the scheme this is intended to replace supports up + // to 4GB which is what application developers want. + // + // What can we do here? + // 1. Use hugepages (part of what we are trying to avoid here). + // 2. 
Use a larger value for the driver's dma_address_bits (currently 32; + // has implications for non-UMD based applications -- basically that any + // DMA buffer mapped beyond the 4GB boundary requires iATU configuration + // for the hardware to be able to reach it). + // 3. Use multiple mappings with small chunks (won't get us to 4GB; adds + // complexity). + // 4. Modify the driver so that DMA allocations are in the low 4GB IOVA + // range but mappings from userspace can be further up (requires driver + // changes). + // 5. ??? + // + // If you need a quick workaround here, I suggest: + // sudo insmod ./tenstorrent.ko dma_address_bits=48 + if (ioctl(pci_device_file_desc, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { + TT_THROW("Failed to pin pages for DMA: {}", strerror(errno)); + } + + return pin_pages.out.physical_address; +} + void PCIDevice::print_file_contents(std::string filename, std::string hint) { if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index 62ffb4c2..b9f121b1 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -41,10 +41,11 @@ struct dynamic_tlb { uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. }; +// These are not necessarily hugepages if IOMMU is enabled. struct hugepage_mapping { void *mapping = nullptr; size_t mapping_size = 0; - uint64_t physical_address = 0; + uint64_t physical_address = 0; // or IOVA, if IOMMU is enabled }; struct PciDeviceInfo { @@ -73,6 +74,7 @@ class PCIDevice { const int revision; // PCI revision value from sysfs const tt::ARCH arch; // e.g. 
Grayskull, Wormhole, Blackhole const semver_t kmd_version; // KMD version + const bool behind_iommu; // Whether the system is protected from this device by an IOMMU std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation; public: @@ -152,6 +154,11 @@ class PCIDevice { */ tt::ARCH get_arch() const { return arch; } + /** + * @return whether the system is protected from this device by an IOMMU + */ + bool is_behind_iommu() const { return behind_iommu; } + // Note: byte_addr is (mostly but not always) offset into BAR0. This // interface assumes the caller knows what they are doing - but it's unclear // how to use this interface correctly without knowing details of the chip @@ -200,9 +207,31 @@ class PCIDevice { // TODO: this also probably has more sense to live in the future TTDevice class. bool init_hugepage(uint32_t num_host_mem_channels); + + /** + * Allocate sysmem without hugepages and map it through IOMMU. + * This is used when the system is protected by an IOMMU. The mappings will + * still appear as hugepages to the caller. + * @param size sysmem size in bytes; size % (1UL << 30) == 0 + * @return whether allocation/mapping succeeded. + */ + bool init_nohugepage(size_t size); + int get_num_host_mem_channels() const; hugepage_mapping get_hugepage_mapping(int channel) const; + /** + * Map a buffer for DMA access by the device. + * + * Supports mapping physically-contiguous buffers (e.g. hugepages) for the + * no-IOMMU case. + * + * @param buffer must be page-aligned + * @param size must be a multiple of the page size + * @return uint64_t PA (no IOMMU) or IOVA (with IOMMU) for use by the device + */ + uint64_t map_for_dma(void *buffer, size_t size); + public: // TODO: we can and should make all of these private. void *bar0_uc = nullptr;