From 8d0f0d7f869faf0aab3e521c227db81fa191cd75 Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Wed, 6 Dec 2023 15:05:35 -0500 Subject: [PATCH 01/10] Change version to 1.30-bh --- README.md | 5 ++--- dkms.conf | 2 +- module.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 83afee6..04ea22a 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The driver registers device files named `/dev/tenstorrent/%d`, one for each enum * `dnf install epel-release && dnf install dkms` (Enterprise Linux based) ``` sudo dkms add . -sudo dkms install tenstorrent/1.30 +sudo dkms install tenstorrent/1.30-bh sudo modprobe tenstorrent ``` (or reboot, driver will auto-load next boot) @@ -26,6 +26,5 @@ sudo modprobe tenstorrent ### To uninstall: ``` sudo modprobe -r tenstorrent -sudo dkms remove tenstorrent/1.30 --all +sudo dkms remove tenstorrent/1.30-bh --all ``` - diff --git a/dkms.conf b/dkms.conf index 73616bc..635f506 100644 --- a/dkms.conf +++ b/dkms.conf @@ -1,5 +1,5 @@ PACKAGE_NAME="tenstorrent" -PACKAGE_VERSION="1.30" +PACKAGE_VERSION="1.30-bh" BUILT_MODULE_NAME="tenstorrent" DEST_MODULE_LOCATION="/kernel/extra" diff --git a/module.c b/module.c index 4ffba29..bb66383 100644 --- a/module.c +++ b/module.c @@ -9,7 +9,7 @@ #include "chardev.h" #include "enumerate.h" -#define TTDRIVER_VER "1.30" +#define TTDRIVER_VER "1.30-bh" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Tenstorrent AI kernel driver"); From fd1194267d8336d604f070b7b1812cb1467578aa Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Wed, 6 Dec 2023 15:03:53 -0500 Subject: [PATCH 02/10] Minimal Black Hole support No HW programming or ARC messages. 
--- Makefile | 2 +- README.md | 1 + blackhole.c | 37 +++++++++++++++++++++++++++++++++++++ blackhole.h | 17 +++++++++++++++++ enumerate.h | 1 + module.c | 3 +++ 6 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 blackhole.c create mode 100644 blackhole.h diff --git a/Makefile b/Makefile index 29993f9..bf48fa0 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-m += tenstorrent.o -tenstorrent-y := module.o chardev.o enumerate.o interrupt.o grayskull.o wormhole.o pcie.o hwmon.o sg_helpers.o memory.o +tenstorrent-y := module.o chardev.o enumerate.o interrupt.o grayskull.o wormhole.o blackhole.o pcie.o hwmon.o sg_helpers.o memory.o KDIR := /lib/modules/$(shell uname -r)/build KMAKE := $(MAKE) -C $(KDIR) M=$(CURDIR) diff --git a/README.md b/README.md index 04ea22a..999d68f 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ ## Supported hardware: * Grayskull * Wormhole +* Blackhole The driver registers device files named `/dev/tenstorrent/%d`, one for each enumerated device. diff --git a/blackhole.c b/blackhole.c new file mode 100644 index 0000000..f70c65a --- /dev/null +++ b/blackhole.c @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. 
+// SPDX-License-Identifier: GPL-2.0-only + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include + +#include "blackhole.h" +#include "pcie.h" +#include "module.h" + +static bool blackhole_init(struct tenstorrent_device *tt_dev) { + return true; +} + +static bool blackhole_init_hardware(struct tenstorrent_device *tt_dev) { + return true; +} + +static bool blackhole_post_hardware_init(struct tenstorrent_device *tt_dev) { + return true; +} + +static void blackhole_cleanup_hardware(struct tenstorrent_device *tt_dev) { +} + +static void blackhole_cleanup(struct tenstorrent_device *tt_dev) { +} + +struct tenstorrent_device_class blackhole_class = { + .name = "Blackhole", + .instance_size = sizeof(struct blackhole_device), + .init_device = blackhole_init, + .init_hardware = blackhole_init_hardware, + .post_hardware_init = blackhole_post_hardware_init, + .cleanup_hardware = blackhole_cleanup_hardware, + .cleanup_device = blackhole_cleanup, +}; diff --git a/blackhole.h b/blackhole.h new file mode 100644 index 0000000..2c9d561 --- /dev/null +++ b/blackhole.h @@ -0,0 +1,17 @@ +// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. 
+// SPDX-License-Identifier: GPL-2.0-only + +#ifndef TTDRIVER_BLACKHOLE_H_INCLUDED +#define TTDRIVER_BLACKHOLE_H_INCLUDED + +#include +#include "device.h" + +struct blackhole_device { + struct tenstorrent_device tt; +}; + +#define tt_dev_to_wh_dev(ttdev) \ + container_of((ttdev), struct blackhole_device, tt) /* maps the embedded 'tt' member back to its blackhole_device; NOTE(review): "wh" in the name looks like a Wormhole leftover - confirm */ + +#endif diff --git a/enumerate.h b/enumerate.h index 0d58dc9..d485531 100644 --- a/enumerate.h +++ b/enumerate.h @@ -14,6 +14,7 @@ #define PCI_VENDOR_ID_TENSTORRENT 0x1E52 #define PCI_DEVICE_ID_GRAYSKULL 0xFACA #define PCI_DEVICE_ID_WORMHOLE 0x401E +#define PCI_DEVICE_ID_BLACKHOLE 0xB140 struct pci_dev; struct cdev; diff --git a/module.c b/module.c index bb66383..eed4f4e 100644 --- a/module.c +++ b/module.c @@ -94,12 +94,15 @@ MODULE_PARM_DESC(auto_reset_timeout, "Timeout duration in seconds for M3 auto re struct tenstorrent_device_class; extern struct tenstorrent_device_class grayskull_class; extern struct tenstorrent_device_class wormhole_class; +extern struct tenstorrent_device_class blackhole_class; const struct pci_device_id tenstorrent_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_TENSTORRENT, PCI_DEVICE_ID_GRAYSKULL), .driver_data=(kernel_ulong_t)&grayskull_class }, { PCI_DEVICE(PCI_VENDOR_ID_TENSTORRENT, PCI_DEVICE_ID_WORMHOLE), .driver_data=(kernel_ulong_t)&wormhole_class }, + { PCI_DEVICE(PCI_VENDOR_ID_TENSTORRENT, PCI_DEVICE_ID_BLACKHOLE), + .driver_data=(kernel_ulong_t)&blackhole_class }, { 0 }, }; From 59804d53b23cecff282fbf5a1753b62528bb1672 Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Thu, 23 May 2024 11:51:22 -0400 Subject: [PATCH 03/10] Directly test for mapping regions so large that they wrap around VerifyNoOverlap will catch this too, but this is more obvious. 
--- test/query_mappings.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/test/query_mappings.cpp b/test/query_mappings.cpp index df95ca4..bb30294 100644 --- a/test/query_mappings.cpp +++ b/test/query_mappings.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. +// SPDX-FileCopyrightText: © 2024 Tenstorrent Inc. // SPDX-License-Identifier: GPL-2.0-only // Verify that all resource IDs are known to us. @@ -143,6 +143,10 @@ void VerifySizes(const std::vector &mappings) if (std::any_of(mappings.begin(), mappings.end(), [=](const auto &m) { return m.mapping_id != TENSTORRENT_MAPPING_UNUSED && m.mapping_base % pagesize != 0; })) THROW_TEST_FAILURE("Mapping base is not a multiple of page size in QUERY_MAPPINGS results."); + + if (std::any_of(mappings.begin(), mappings.end(), + [](const auto &m) { return m.mapping_size > std::numeric_limits::max() - m.mapping_base; })) + THROW_TEST_FAILURE("Mapping region wraps around."); } void PrintMappings(const std::vector& mappings) From 199831c4b3d2bce18fea8bf43f9419a92e3883f0 Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Thu, 23 May 2024 12:05:52 -0400 Subject: [PATCH 04/10] Adjust MMAP_OFFSET_DMA_BUF to fit all DMA buffers in 32-bit mmap --- memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memory.c b/memory.c index c3c7719..391eccf 100644 --- a/memory.c +++ b/memory.c @@ -132,7 +132,7 @@ static void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npage // tenstorrent_allocate_dma_buf_in.buf_index is u8 so that sets a limit of // U8_MAX DMA buffers per fd. 32-bit mmap offsets are divided by PAGE_SIZE, // so PAGE_SIZE << 32 is the largest possible offset. 
-#define MMAP_OFFSET_DMA_BUF ((u64)(PAGE_SIZE-U8_MAX) << 32) +#define MMAP_OFFSET_DMA_BUF ((u64)(PAGE_SIZE-U8_MAX-1) << 32) #define MMAP_SIZE_DMA_BUF (U64_C(1) << 32) From f2c88fd68867e1fcd78ab515b6161b109b120f61 Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Thu, 23 May 2024 12:12:11 -0400 Subject: [PATCH 05/10] Test for 32-bit mmap offset limit --- test/query_mappings.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/query_mappings.cpp b/test/query_mappings.cpp index bb30294..0721485 100644 --- a/test/query_mappings.cpp +++ b/test/query_mappings.cpp @@ -128,6 +128,7 @@ void VerifyNoOverlap(const std::vector &mappings) } // Verify that size > 0. Verify that base & size are multiples of the page size. +// Verify that the size is not too large and that mapping_base is not too high. void VerifySizes(const std::vector &mappings) { if (std::any_of(mappings.begin(), mappings.end(), @@ -147,6 +148,12 @@ void VerifySizes(const std::vector &mappings) if (std::any_of(mappings.begin(), mappings.end(), [](const auto &m) { return m.mapping_size > std::numeric_limits::max() - m.mapping_base; })) THROW_TEST_FAILURE("Mapping region wraps around."); + + std::uint64_t mmap_offset_limit_for_32b = (std::uint64_t)1 << 44; // 32 + log(PAGE_SIZE) + + if (std::any_of(mappings.begin(), mappings.end(), + [=](const auto &m) { return m.mapping_size + m.mapping_base >= mmap_offset_limit_for_32b; })) + THROW_TEST_FAILURE("Mapping base/size do not fit into 32-bit mmap offset."); } void PrintMappings(const std::vector& mappings) From 552e1fc07a5925129a1af766dbfcc0c410c9508e Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Mon, 27 May 2024 12:25:39 -0400 Subject: [PATCH 06/10] Adjust mmap offsets to allow for 32GB between each This allows for a single resource mapping to be 32GB. These values are reported through query mapping so the ABI has not changed. 
--- memory.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/memory.c b/memory.c index 391eccf..a9d7c22 100644 --- a/memory.c +++ b/memory.c @@ -122,12 +122,12 @@ static void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npage // These are the mmap offsets for various resources. In the user-kernel // interface they are dynamic (TENSTORRENT_IOCTL_QUERY_MAPPINGS and // TENSTORRENT_IOCTL_ALLOCATE_DMA_BUF), but they are actually hard-coded. -#define MMAP_OFFSET_RESOURCE0_UC (U64_C(0) << 32) -#define MMAP_OFFSET_RESOURCE0_WC (U64_C(1) << 32) -#define MMAP_OFFSET_RESOURCE1_UC (U64_C(2) << 32) -#define MMAP_OFFSET_RESOURCE1_WC (U64_C(3) << 32) -#define MMAP_OFFSET_RESOURCE2_UC (U64_C(4) << 32) -#define MMAP_OFFSET_RESOURCE2_WC (U64_C(5) << 32) +#define MMAP_OFFSET_RESOURCE0_UC (U64_C(0) << 35) +#define MMAP_OFFSET_RESOURCE0_WC (U64_C(1) << 35) +#define MMAP_OFFSET_RESOURCE1_UC (U64_C(2) << 35) +#define MMAP_OFFSET_RESOURCE1_WC (U64_C(3) << 35) +#define MMAP_OFFSET_RESOURCE2_UC (U64_C(4) << 35) +#define MMAP_OFFSET_RESOURCE2_WC (U64_C(5) << 35) // tenstorrent_allocate_dma_buf_in.buf_index is u8 so that sets a limit of // U8_MAX DMA buffers per fd. 
32-bit mmap offsets are divided by PAGE_SIZE, From 9e58aed95297556a88c6bb2ed14e2f6c73e36566 Mon Sep 17 00:00:00 2001 From: nxu Date: Wed, 10 Jul 2024 16:01:19 +0000 Subject: [PATCH 07/10] added pcie timer interface interrupt sequence --- chardev.c | 4 ++-- ioctl.h | 1 + pcie.c | 14 ++++++++++++++ pcie.h | 1 + 4 files changed, 18 insertions(+), 2 deletions(-) diff --git a/chardev.c b/chardev.c index 1329fae..cd7e860 100644 --- a/chardev.c +++ b/chardev.c @@ -186,8 +186,8 @@ static long ioctl_reset_device(struct chardev_private *priv, else ok = false; - } else if (in.flags == TENSTORRENT_RESET_DEVICE_RESET_PCIE_LINK) { - ok = pcie_hot_reset_and_restore_state(pdev); + } else if (in.flags == TENSTORRENT_RESET_DEVICE_CONFIG_WRITE) { + ok = pcie_timer_interrupt(pdev); } else { return -EINVAL; } diff --git a/ioctl.h b/ioctl.h index e33890f..d56e1f6 100644 --- a/ioctl.h +++ b/ioctl.h @@ -125,6 +125,7 @@ struct tenstorrent_get_driver_info { // tenstorrent_reset_device_in.flags #define TENSTORRENT_RESET_DEVICE_RESTORE_STATE 0 #define TENSTORRENT_RESET_DEVICE_RESET_PCIE_LINK 1 +#define TENSTORRENT_RESET_DEVICE_CONFIG_WRITE 2 struct tenstorrent_reset_device_in { __u32 output_size_bytes; diff --git a/pcie.c b/pcie.c index 84e1db0..95a5dce 100644 --- a/pcie.c +++ b/pcie.c @@ -13,6 +13,12 @@ #include "grayskull.h" #define FW_MSG_PCIE_RETRAIN 0xB6 +#define INTERFACE_TIMER_CONTROL_OFF 0x930 +#define INTERFACE_TIMER_TARGET_OFF 0x934 + +#define INTERFACE_TIMER_TARGET 0x1 +#define INTERFACE_TIMER_EN 0x1 +#define INTERFACE_FORCE_PENDING 0x10 static bool poll_pcie_link_up(struct pci_dev *pdev, u32 timeout_ms) { u16 tt_vendor_id; @@ -116,3 +122,11 @@ bool complete_pcie_init(struct tenstorrent_device *tt_dev, u8 __iomem* reset_uni return false; } + +/* Write the interface timer target, then enable the timer with force-pending set; returns false if either config write fails. */ +bool pcie_timer_interrupt(struct pci_dev *pdev) +{ + if (pci_write_config_dword(pdev, INTERFACE_TIMER_TARGET_OFF, INTERFACE_TIMER_TARGET)) + return false; /* nonzero return is a PCIBIOS_* error code */ + 
return pci_write_config_dword(pdev, INTERFACE_TIMER_CONTROL_OFF, INTERFACE_TIMER_EN | INTERFACE_FORCE_PENDING) == 0; +} diff --git a/pcie.h b/pcie.h index bede693..7f6e707 100644 --- a/pcie.h +++ b/pcie.h @@ -9,5 +9,6 @@ bool safe_pci_restore_state(struct pci_dev *pdev); bool complete_pcie_init(struct tenstorrent_device *tt_dev, u8 __iomem* reset_unit_regs); bool pcie_hot_reset_and_restore_state(struct pci_dev *pdev); +bool pcie_timer_interrupt(struct pci_dev *pdev); #endif From 99e3b3538cc3da4dcb09f85efd437048e304271d Mon Sep 17 00:00:00 2001 From: Andrew Lewycky Date: Thu, 27 Jul 2023 11:22:54 -0400 Subject: [PATCH 08/10] Get DMA address limit from device class --- blackhole.c | 1 + device.h | 1 + enumerate.c | 3 ++- grayskull.c | 1 + wormhole.c | 1 + 5 files changed, 6 insertions(+), 1 deletion(-) diff --git a/blackhole.c b/blackhole.c index f70c65a..44811f3 100644 --- a/blackhole.c +++ b/blackhole.c @@ -29,6 +29,7 @@ static void blackhole_cleanup(struct tenstorrent_device *tt_dev) { struct tenstorrent_device_class blackhole_class = { .name = "Blackhole", .instance_size = sizeof(struct blackhole_device), + .dma_address_bits = 58, .init_device = blackhole_init, .init_hardware = blackhole_init_hardware, .post_hardware_init = blackhole_post_hardware_init, diff --git a/device.h b/device.h index f09bcce..437a014 100644 --- a/device.h +++ b/device.h @@ -42,6 +42,7 @@ struct tenstorrent_device { struct tenstorrent_device_class { const char *name; u32 instance_size; + u32 dma_address_bits; bool (*init_device)(struct tenstorrent_device *ttdev); bool (*init_hardware)(struct tenstorrent_device *ttdev); bool (*post_hardware_init)(struct tenstorrent_device *ttdev); diff --git a/enumerate.c b/enumerate.c index 131bb9d..487c553 100644 --- a/enumerate.c +++ b/enumerate.c @@ -77,7 +77,8 @@ static int tenstorrent_pci_probe(struct pci_dev *dev, const struct pci_device_id mutex_init(&tt_dev->chardev_mutex); - tt_dev->dma_capable = (dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(dma_address_bits ?: 32)) == 0); + tt_dev->dma_capable = (dma_set_mask_and_coherent(&dev->dev, 
DMA_BIT_MASK(dma_address_bits ?: device_class->dma_address_bits)) == 0); + // Max these to ensure the IOVA allocator will not split large pinned regions. dma_set_max_seg_size(&dev->dev, UINT_MAX); dma_set_seg_boundary(&dev->dev, ULONG_MAX); diff --git a/grayskull.c b/grayskull.c index 78244ed..08e4b9c 100644 --- a/grayskull.c +++ b/grayskull.c @@ -875,6 +875,7 @@ static void grayskull_last_release_handler(struct tenstorrent_device *tt_dev) { struct tenstorrent_device_class grayskull_class = { .name = "Grayskull", .instance_size = sizeof(struct grayskull_device), + .dma_address_bits = 32, .init_device = grayskull_init, .init_hardware = grayskull_init_hardware, .post_hardware_init = grayskull_post_hardware_init, diff --git a/wormhole.c b/wormhole.c index 0c1d1cf..496c132 100644 --- a/wormhole.c +++ b/wormhole.c @@ -211,6 +211,7 @@ static void wormhole_cleanup(struct tenstorrent_device *tt_dev) { struct tenstorrent_device_class wormhole_class = { .name = "Wormhole", .instance_size = sizeof(struct wormhole_device), + .dma_address_bits = 32, .init_device = wormhole_init, .init_hardware = wormhole_init_hardware, .post_hardware_init = wormhole_post_hardware_init, From fe38bbc35b9173e79a58dd3815504c1ce4116e41 Mon Sep 17 00:00:00 2001 From: nxu Date: Thu, 8 Aug 2024 23:03:04 +0000 Subject: [PATCH 09/10] increase MRRS to 4096B --- blackhole.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/blackhole.c b/blackhole.c index 44811f3..cc4f651 100644 --- a/blackhole.c +++ b/blackhole.c @@ -8,11 +8,15 @@ #include "pcie.h" #include "module.h" +#define MAX_MRRS 4096 + static bool blackhole_init(struct tenstorrent_device *tt_dev) { return true; } static bool blackhole_init_hardware(struct tenstorrent_device *tt_dev) { + struct pci_dev *pdev = tt_dev->pdev; + pcie_set_readrq(pdev, MAX_MRRS); return true; } From ddcf4f2b8c7e50210af3456aab090153cd833148 Mon Sep 17 00:00:00 2001 From: Daniel Rosen Date: Tue, 24 Sep 2024 18:38:26 -0400 Subject: [PATCH 10/10] Readded support for WH 
reset The condition for handling the RESET_PCIE_LINK flag was removed. This commit readds it. This is required for handling the reset of WH chips. Signed-off-by: Daniel Rosen --- chardev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/chardev.c b/chardev.c index cd7e860..22e2ece 100644 --- a/chardev.c +++ b/chardev.c @@ -185,7 +185,8 @@ static long ioctl_reset_device(struct chardev_private *priv, ok = priv->device->dev_class->init_hardware(priv->device); else ok = false; - + } else if (in.flags == TENSTORRENT_RESET_DEVICE_RESET_PCIE_LINK) { + ok = pcie_hot_reset_and_restore_state(pdev); } else if (in.flags == TENSTORRENT_RESET_DEVICE_CONFIG_WRITE) { ok = pcie_timer_interrupt(pdev); } else {