Add IOMMU support
This implementation is not going to work very well for large
allocations.  The problem is detailed in a comment in the code.
joelsmithTT committed Nov 26, 2024
1 parent db8a0d5 commit 5832c26
Showing 3 changed files with 147 additions and 4 deletions.
8 changes: 8 additions & 0 deletions device/cluster.cpp
@@ -1415,6 +1415,14 @@ void Cluster::init_pcie_iatus() {
int logical_id = src_device_it.first;
PCIDevice* src_pci_device = src_device_it.second.get();

// TODO: in the IOMMU case, I think we can get away with using just
// one iATU region for WH. (On BH, we don't need the iATU.) We can only
// cover slightly less than 4GB with WH, and a single iATU region can
// cover 4GB. Splitting it into multiple regions is fine, but it's not
// necessary.
//
// ... something to consider when this code is refactored into PCIDevice,
// where it belongs.

// Device to Host (multiple channels)
for (int channel_id = 0; channel_id < src_pci_device->get_num_host_mem_channels(); channel_id++) {
hugepage_mapping hugepage_map = src_pci_device->get_hugepage_mapping(channel_id);
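A minimal sketch of the single-region idea from the TODO above; program_iatu_region, its signature, and base_iova are illustrative placeholders, not the real iATU interface:

    // Hypothetical: program one iATU region spanning every host-memory channel
    // at once, instead of one region per channel. Assumes the channels occupy
    // a contiguous IOVA range, as they do in the IOMMU path added below.
    constexpr uint64_t kIatuRegionMax = 1ULL << 32;  // a WH iATU region can cover 4GB
    const uint64_t total_size = uint64_t(num_host_mem_channels) * HUGEPAGE_REGION_SIZE;
    if (total_size <= kIatuRegionMax) {
        program_iatu_region(/*region=*/0, /*device_addr=*/0, /*host_addr=*/base_iova, total_size);
    }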
112 changes: 109 additions & 3 deletions device/pcie/pci_device.cpp
@@ -88,6 +88,15 @@ T read_sysfs(const PciDeviceInfo &device_info, const std::string &attribute_name
}
}

static bool detect_iommu(const PciDeviceInfo &device_info) {
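// The kernel reports the IOMMU group's default domain type via sysfs:
// "DMA" or "DMA-FQ" means DMA translation is active for this device,
// while e.g. "identity" means the device bypasses translation.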
try {
auto iommu_type = read_sysfs<std::string>(device_info, "iommu_group/type");
return iommu_type.substr(0, 3) == "DMA"; // DMA or DMA-FQ
} catch (...) {
return false;
}
}

static PciDeviceInfo read_device_info(int fd) {
tenstorrent_get_device_info info{};
info.in.output_size_bytes = sizeof(info.out);
@@ -258,6 +267,8 @@ tt::ARCH PciDeviceInfo::get_arch() const {
return infos;
}

static const semver_t kmd_ver_for_iommu = semver_t(1, 29, 0);

PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) :
device_path(fmt::format("/dev/tenstorrent/{}", pci_device_number)),
pci_device_num(pci_device_number),
@@ -267,9 +278,19 @@ PCIDevice::PCIDevice(int pci_device_number, int logical_device_id) :
numa_node(read_sysfs<int>(info, "numa_node", -1)), // default to -1 if not found
revision(read_sysfs<int>(info, "revision")),
arch(detect_arch(info.device_id, revision)),
- architecture_implementation(tt::umd::architecture_implementation::create(arch)),
- kmd_version(read_kmd_version()) {
- log_info(LogSiliconDriver, "Opened PCI device {}; KMD version: {}", pci_device_num, kmd_version.to_string());
+ kmd_version(read_kmd_version()),
+ behind_iommu(detect_iommu(info)),
+ architecture_implementation(tt::umd::architecture_implementation::create(arch)) {
if (behind_iommu && kmd_version < kmd_ver_for_iommu) {
TT_THROW("Running with IOMMU support requires KMD version {} or newer", kmd_ver_for_iommu.to_string());
}

log_info(
LogSiliconDriver,
"Opened PCI device {}; KMD version: {}, IOMMU: {}",
pci_device_num,
kmd_version.to_string(),
behind_iommu ? "enabled" : "disabled");

struct {
tenstorrent_query_mappings query_mappings;
@@ -688,6 +709,11 @@ tt::umd::architecture_implementation *PCIDevice::get_architecture_implementation
bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
const size_t hugepage_size = HUGEPAGE_REGION_SIZE;

if (is_behind_iommu()) {
size_t size = hugepage_size * num_host_mem_channels;
return init_nohugepage(size);
}

// Convert from logical (device_id in netlist) to physical device_id (in case of virtualization)
auto physical_device_id = get_device_num();

@@ -802,6 +828,37 @@ bool PCIDevice::init_hugepage(uint32_t num_host_mem_channels) {
return success;
}

bool PCIDevice::init_nohugepage(size_t size) {
const size_t num_fake_mem_channels = size / HUGEPAGE_REGION_SIZE;

if (!is_behind_iommu()) {
TT_THROW("IOMMU is required for sysmem without hugepages.");
}

log_info(LogSiliconDriver, "Allocating sysmem without hugepages (size: {:#x}).", size);
void *mapping = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE | MAP_POPULATE, -1, 0);

if (mapping == MAP_FAILED) {
TT_THROW(
"UMD: Failed to allocate memory for device/host shared buffer (size: {} errno: {}).",
size,
strerror(errno));
}

uint64_t iova = map_for_dma(mapping, size);
log_info(LogSiliconDriver, "Mapped sysmem without hugepages to IOVA {:#x}.", iova);

hugepage_mapping_per_channel.resize(num_fake_mem_channels);

// Carve the allocation into 1GB channels so that more than 1GB of host memory is accessible per device.
for (size_t ch = 0; ch < num_fake_mem_channels; ch++) {
uint8_t *base = static_cast<uint8_t *>(mapping) + ch * HUGEPAGE_REGION_SIZE;
hugepage_mapping_per_channel[ch] = {base, HUGEPAGE_REGION_SIZE, iova + ch * HUGEPAGE_REGION_SIZE};
}

return true;
}
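Because the carving above mirrors the hugepage layout, callers resolve a (channel, offset) pair the same way in both paths; a sketch (helper name hypothetical):

    // Hypothetical helper: host virtual address and device-visible IOVA for a
    // given channel and offset, using the per-channel mappings built above.
    std::pair<void *, uint64_t> resolve_sysmem(const PCIDevice &dev, int channel, size_t offset) {
        hugepage_mapping m = dev.get_hugepage_mapping(channel);
        return {static_cast<uint8_t *>(m.mapping) + offset, m.physical_address + offset};
    }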

int PCIDevice::get_num_host_mem_channels() const { return hugepage_mapping_per_channel.size(); }

hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const {
@@ -812,6 +869,55 @@ hugepage_mapping PCIDevice::get_hugepage_mapping(int channel) const {
}
}

uint64_t PCIDevice::map_for_dma(void *buffer, size_t size) {
static const auto page_size = sysconf(_SC_PAGESIZE);

const uint64_t vaddr = reinterpret_cast<uint64_t>(buffer);
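// Without an IOMMU, the buffer must be physically contiguous (e.g. a
// hugepage), so tell the driver to treat it as such. With an IOMMU, the
// backing pages need not be physically contiguous.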
const uint32_t flags = is_behind_iommu() ? 0 : TENSTORRENT_PIN_PAGES_CONTIGUOUS;

if (vaddr % page_size != 0 || size % page_size != 0) {
TT_THROW("Buffer must be page-aligned with a size that is a multiple of the page size");
}

tenstorrent_pin_pages pin_pages{};
pin_pages.in.output_size_bytes = sizeof(pin_pages.out);
pin_pages.in.flags = flags;
pin_pages.in.virtual_address = vaddr;
pin_pages.in.size = size;

// With IOMMU, this will probably fail on you if you're mapping something
// large. The situation today is that the kernel driver uses a 32-bit DMA
// address mask, so all DMA allocations and mappings show up in the IOVA
// range of 0x0 to 0xffff'ffff. According to syseng, we can get up to 3GB
// on Intel, 3.75GB on AMD, but this requires multiple mappings with small
// chunks, down to 2MB. It's possible to make such non-contiguous mappings
// appear both virtually contiguous (to the application) and physically
// contiguous (to the NOC, using iATU), but it's not clear that this is
// worth the effort... the scheme this is intended to replace supports up
// to 4GB which is what application developers want.
//
// What can we do here?
// 1. Use hugepages (part of what we are trying to avoid here).
// 2. Use a larger value for the driver's dma_address_bits (currently 32;
// has implications for non-UMD based applications -- basically that any
// DMA buffer mapped beyond the 4GB boundary requires iATU configuration
// for the hardware to be able to reach it).
// 3. Use multiple mappings with small chunks (won't get us to 4GB; adds
// complexity).
// 4. Modify the driver so that DMA allocations are in the low 4GB IOVA
// range but mappings from userspace can be further up (requires driver
// changes).
// 5. ???
//
// If you need a quick workaround here, I suggest:
// sudo insmod ./tenstorrent.ko dma_address_bits=48
if (ioctl(pci_device_file_desc, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) {
TT_THROW("Failed to pin pages for DMA: {}", strerror(errno));
}

return pin_pages.out.physical_address;
}
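A rough sketch of option 3 from the comment above: pin a large buffer in 2MB pieces, one TENSTORRENT_IOCTL_PIN_PAGES call per piece (illustrative only; cleanup on failure is omitted, and the resulting IOVAs are not guaranteed to be contiguous, which is exactly the complication the comment describes):

    // Hypothetical: map `size` bytes in 2MB chunks, collecting one IOVA per
    // chunk. flags stays 0 because this path only makes sense with an IOMMU.
    std::vector<uint64_t> map_for_dma_chunked(int fd, uint8_t *buffer, size_t size) {
        constexpr size_t kChunkSize = 1 << 21;  // 2MB
        std::vector<uint64_t> iovas;
        for (size_t offset = 0; offset < size; offset += kChunkSize) {
            tenstorrent_pin_pages pin_pages{};
            pin_pages.in.output_size_bytes = sizeof(pin_pages.out);
            pin_pages.in.virtual_address = reinterpret_cast<uint64_t>(buffer + offset);
            pin_pages.in.size = std::min(kChunkSize, size - offset);
            if (ioctl(fd, TENSTORRENT_IOCTL_PIN_PAGES, &pin_pages) == -1) {
                TT_THROW("Failed to pin chunk at offset {:#x}: {}", offset, strerror(errno));
            }
            iovas.push_back(pin_pages.out.physical_address);
        }
        return iovas;
    }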

void PCIDevice::print_file_contents(std::string filename, std::string hint) {
if (std::filesystem::exists(filename)) {
std::ifstream meminfo(filename);
31 changes: 30 additions & 1 deletion device/pcie/pci_device.hpp
@@ -41,10 +41,11 @@ struct dynamic_tlb {
uint64_t remaining_size; // Bytes remaining between bar_offset and end of the TLB.
};

// These are not necessarily hugepages if IOMMU is enabled.
struct hugepage_mapping {
void *mapping = nullptr;
size_t mapping_size = 0;
- uint64_t physical_address = 0;
+ uint64_t physical_address = 0; // or IOVA, if IOMMU is enabled
};

struct PciDeviceInfo {
@@ -73,6 +74,7 @@ class PCIDevice {
const int revision; // PCI revision value from sysfs
const tt::ARCH arch; // e.g. Grayskull, Wormhole, Blackhole
const semver_t kmd_version; // KMD version
const bool behind_iommu; // Whether the system is protected from this device by an IOMMU
std::unique_ptr<tt::umd::architecture_implementation> architecture_implementation;

public:
@@ -152,6 +154,11 @@
*/
tt::ARCH get_arch() const { return arch; }

/**
* @return whether the system is protected from this device by an IOMMU
*/
bool is_behind_iommu() const { return behind_iommu; }

// Note: byte_addr is (mostly but not always) offset into BAR0. This
// interface assumes the caller knows what they are doing - but it's unclear
// how to use this interface correctly without knowing details of the chip
@@ -200,9 +207,31 @@

// TODO: this also probably makes more sense living in the future TTDevice class.
bool init_hugepage(uint32_t num_host_mem_channels);

/**
* Allocate sysmem without hugepages and map it through the IOMMU.
* This is used when the system is protected by an IOMMU. The mappings will
* still appear as hugepages to the caller.
* @param size sysmem size in bytes; must satisfy size % (1UL << 30) == 0
* @return whether allocation/mapping succeeded.
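*
* A hypothetical call site: init_nohugepage(4UL << 30) maps 4GB of sysmem
* and presents it to callers as four 1GB channels.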
*/
bool init_nohugepage(size_t size);

int get_num_host_mem_channels() const;
hugepage_mapping get_hugepage_mapping(int channel) const;

/**
* Map a buffer for DMA access by the device.
*
* Supports mapping physically-contiguous buffers (e.g. hugepages) for the
* no-IOMMU case.
*
* @param buffer must be page-aligned
* @param size must be a multiple of the page size
* @return uint64_t PA (no IOMMU) or IOVA (with IOMMU) for use by the device
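*
* Usage sketch (illustrative; hugepage_ptr stands for a page-aligned,
* hugepage-backed buffer):
*   uint64_t bus_addr = map_for_dma(hugepage_ptr, HUGEPAGE_REGION_SIZE);
* The returned address is what the device uses to reach the buffer.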
*/
uint64_t map_for_dma(void *buffer, size_t size);

public:
// TODO: we can and should make all of these private.
void *bar0_uc = nullptr;
