diff --git a/device/cluster.cpp b/device/cluster.cpp index 33c89668..fcd9c609 100644 --- a/device/cluster.cpp +++ b/device/cluster.cpp @@ -1415,6 +1415,14 @@ void Cluster::init_pcie_iatus() { int logical_id = src_device_it.first; PCIDevice* src_pci_device = src_device_it.second.get(); + // TODO: with the IOMMU case, I think we can get away with using just + // one iATU region for WH. (On BH, we don't need iATU). We can only + // cover slightly less than 4GB with WH, and the iATU can cover 4GB. + // Splitting it into multiple regions is fine, but it's not necessary. + // + // ... something to consider when this code is refactored into PCIDevice + // where it belongs. + // Device to Host (multiple channels) for (int channel_id = 0; channel_id < src_pci_device->get_num_host_mem_channels(); channel_id++) { hugepage_mapping hugepage_map = src_pci_device->get_hugepage_mapping(channel_id); diff --git a/device/pcie/pci_device.cpp b/device/pcie/pci_device.cpp index bc2fd5f6..0c62d7f3 100644 --- a/device/pcie/pci_device.cpp +++ b/device/pcie/pci_device.cpp @@ -88,6 +88,15 @@ T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name } } +static bool detect_iommu(const PciDeviceInfo &device_info) { + try { + auto iommu_type = read_sysfs<std::string>(device_info, "iommu_group/type"); + return iommu_type.substr(0, 3) == "DMA"; // DMA or DMA-FQ + } catch (...) 
{ + return false; + } +} + static PciDeviceInfo read_device_info(int fd) { tenstorrent_get_device_info info{}; info.in.output_size_bytes = sizeof(info.out); @@ -258,6 +267,8 @@ tt::ARCH PciDeviceInfo::get_arch() const { return infos; } +static const semver_t kmd_ver_for_iommu = semver_t(1, 29, 0); + PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)), pci_device_num(pci_device_number), @@ -267,9 +278,19 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) : numa_node(read_sysfs(info, "numa_node", -1)), // default to -1 if not found revision(read_sysfs(info, "revision")), arch(detect_arch(info.device_id, revision)), - architecture_implementation(tt::umd::architecture_implementation::create(arch)), - kmd_version(read_kmd_version()) { - log_info(LogSiliconDriver, "Opened PCI device {}; KMD version: {}", pci_device_num, kmd_version.to_string()); + kmd_version(read_kmd_version()), + behind_iommu(detect_iommu(info)), + architecture_implementation(tt::umd::architecture_implementation::create(arch)) { + if (behind_iommu && kmd_version < kmd_ver_for_iommu) { + TT_THROW("Running with IOMMU support requires KMD version {} or newer", kmd_ver_for_iommu.to_string()); + } + + log_info( + LogSiliconDriver, + "Opened PCI device {}; KMD version: {}, IOMMU: {}", + pci_device_num, + kmd_version.to_string(), + behind_iommu ? 
"enabled" : "disabled"); struct { tenstorrent_query_mappings query_mappings; @@ -688,6 +709,11 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { const size_t hugepage_size = HUGEPAGE_REGION_SIZE; + if (is_behind_iommu()) { + size_t size = hugepage_size * num_host_mem_channels; + return init_nohugepage(size); + } + // Convert from logical (device_id in netlist) to physical device_id (in case of virtualization) auto physical_device_id = get_device_num(); @@ -802,6 +828,37 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) { return success; } +bool PCIDevice::init_nohugepage(size_t size) { + const size_t num_fake_mem_channels = size / HUGEPAGE_REGION_SIZE; + + if (!is_behind_iommu()) { + TT_THROW("IOMMU is required for sysmem without hugepages."); + } + + log_info(LogSiliconDriver, "Allocating sysmem without hugepages (size: {:#x}).", size); + void *mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0); + + if (mapping == MAP_FAILED) { + TT_THROW( + "UMD: Failed to allocate memory for device/host shared buffer (size: {} errno: {}).", + size, + strerror(errno)); + } + + uint64_t iova = map_for_dma(mapping, size); + log_info(LogSiliconDriver, "Mapped sysmem without hugepages to IOVA {:#x}.", iova); + + hugepage_mapping_per_channel.resize(num_fake_mem_channels); + + // Support for more than 1GB host memory accessible per device, via channels. 
+ for (size_t ch = 0; ch < num_fake_mem_channels; ch++) { + uint8_t *base = static_cast<uint8_t *>(mapping) + ch * HUGEPAGE_REGION_SIZE; + hugepage_mapping_per_channel[ch] = {base, HUGEPAGE_REGION_SIZE, iova + ch * HUGEPAGE_REGION_SIZE}; + } + + return true; +} + int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); } hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { @@ -812,6 +869,55 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const { } } +uint64_t PCIDevice::map_for_dma(void *buffer, size_t size) { + static const auto page_size = sysconf(_SC_PAGESIZE); + + const uint64_t vaddr = reinterpret_cast<uint64_t>(buffer); + const uint32_t flags = is_behind_iommu() ? 0 : TENSTORRENT_PIN_PAGES_CONTIGUOUS; + + if (vaddr % page_size != 0 || size % page_size != 0) { + TT_THROW("Buffer must be page-aligned with a size that is a multiple of the page size"); + } + + tenstorrent_pin_pages pin_pages{}; + pin_pages.in.output_size_bytes = sizeof(pin_pages.out); + pin_pages.in.flags = flags; + pin_pages.in.virtual_address = vaddr; + pin_pages.in.size = size; + + // With IOMMU, this will probably fail on you if you're mapping something + // large. The situation today is that the kernel driver uses a 32-bit DMA + // address mask, so all DMA allocations and mappings show up in the IOVA + // range of 0x0 to 0xffff'ffff. According to syseng, we can get up to 3GB + // on Intel, 3.75GB on AMD, but this requires multiple mappings with small + // chunks, down to 2MB. It's possible to make such non-contiguous mappings + // appear both virtually contiguous (to the application) and physically + // contiguous (to the NOC, using iATU), but it's not clear that this is + // worth the effort... the scheme this is intended to replace supports up + // to 4GB which is what application developers want. + // + // What can we do here? + // 1. Use hugepages (part of what we are trying to avoid here). + // 2. 
Use a larger value for the driver's dma_address_bits (currently 32; + // has implications for non-UMD based applications -- basically that any + // DMA buffer mapped beyond the 4GB boundary requires iATU configuration + // for the hardware to be able to reach it). + // 3. Use multiple mappings with small chunks (won't get us to 4GB; adds + // complexity). + // 4. Modify the driver so that DMA allocations are in the low 4GB IOVA + // range but mappings from userspace can be further up (requires driver + // changes). + // 5. ??? + // + // If you need a quick workaround here, I suggest: + // sudo insmod ./tenstorrent.ko dma_address_bits=48 + if (ioctl(pci_device_file_desc, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) { + TT_THROW("Failed to pin pages for DMA: {}", strerror(errno)); + } + + return pin_pages.out.physical_address; +} + void PCIDevice::print_file_contents(std::string filename, std::string hint) { if (std::filesystem::exists(filename)) { std::ifstream meminfo(filename); diff --git a/device/pcie/pci_device.hpp b/device/pcie/pci_device.hpp index 62ffb4c2..b9f121b1 100644 --- a/device/pcie/pci_device.hpp +++ b/device/pcie/pci_device.hpp @@ -41,10 +41,11 @@ struct dynamic_tlb { uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB. }; +// These are not necessarily hugepages if IOMMU is enabled. struct hugepage_mapping { void *mapping = nullptr; size_t mapping_size = 0; - uint64_t physical_address = 0; + uint64_t physical_address = 0; // or IOVA, if IOMMU is enabled }; struct PciDeviceInfo { @@ -73,6 +74,7 @@ class PCIDevice { const int revision; // PCI revision value from sysfs const tt::ARCH arch; // e.g. 
Grayskull, Wormhole, Blackhole const semver_t kmd_version; // KMD version + const bool behind_iommu; // Whether the system is protected from this device by an IOMMU std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation; public: @@ -152,6 +154,11 @@ class PCIDevice { */ tt::ARCH get_arch() const { return arch; } + /** + * @return whether the system is protected from this device by an IOMMU + */ + bool is_behind_iommu() const { return behind_iommu; } + // Note: byte_addr is (mostly but not always) offset into BAR0. This // interface assumes the caller knows what they are doing - but it's unclear // how to use this interface correctly without knowing details of the chip @@ -200,9 +207,31 @@ class PCIDevice { // TODO: this also probably has more sense to live in the future TTDevice class. bool init_hugepage(uint32_t num_host_mem_channels); + + /** + * Allocate sysmem without hugepages and map it through IOMMU. + * This is used when the system is protected by an IOMMU. The mappings will + * still appear as hugepages to the caller. + * @param size sysmem size in bytes; size % (1UL << 30) == 0 + * @return whether allocation/mapping succeeded. + */ + bool init_nohugepage(size_t size); + int get_num_host_mem_channels() const; hugepage_mapping get_hugepage_mapping(int channel) const; + /** + * Map a buffer for DMA access by the device. + * + * Supports mapping physically-contiguous buffers (e.g. hugepages) for the + * no-IOMMU case. + * + * @param buffer must be page-aligned + * @param size must be a multiple of the page size + * @return uint64_t PA (no IOMMU) or IOVA (with IOMMU) for use by the device + */ + uint64_t map_for_dma(void *buffer, size_t size); + public: // TODO: we can and should make all of these private. void *bar0_uc = nullptr;