diff --git a/.editorconfig b/.editorconfig index c8d58f27b2fbf..f5607c058203a 100644 --- a/.editorconfig +++ b/.editorconfig @@ -1,16 +1,4 @@ -# Top-most EditorConfig file -root = true - -# Unix-style newlines with a newline ending every file, utf-8 charset -[*] -end_of_line = lf -trim_trailing_whitespace = true -charset = utf-8 - +# 2 space indentation [*.py] indent_style = space - -# ignore binary files -[{*.hwx,*.mlmodel,*.weights,*.golden}] -end_of_line = unset -trim_trailing_whitespace = unset +indent_size = 2 diff --git a/.github/workflows/editorconfig.yml b/.github/workflows/editorconfig.yml deleted file mode 100644 index 1e7095ecc20fb..0000000000000 --- a/.github/workflows/editorconfig.yml +++ /dev/null @@ -1,13 +0,0 @@ -name: EditorConfig Checker - -on: - push: - pull_request: - -jobs: - editorconfig: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: editorconfig-checker/action-editorconfig-checker@main - - run: editorconfig-checker diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 00886fe4a758c..fb91c5e6b651f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -249,7 +249,7 @@ jobs: wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list sudo apt update && \ - sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl + sudo apt install -y intel-oneapi-runtime-compilers intel-oneapi-runtime-opencl - name: Install packages (cuda) if: matrix.backend == 'cuda' run: | diff --git a/examples/webgpu/compile_webgpu.py b/examples/webgpu/compile_webgpu.py index 2c29451bcc364..0fd00a5aa0ad6 100644 --- a/examples/webgpu/compile_webgpu.py +++ b/examples/webgpu/compile_webgpu.py @@ -9,7 +9,7 @@ model.load_from_pretrained() run, special_names = jit_model(model, Tensor.randn(1,3,224,224)) functions, statements, bufs, _bufs_to_save = compile_net(run, special_names) - + state = get_state_dict(model) weights = {id(x.lazydata.realized): name for name, x in state.items()} safe_save(state, path.join(path.dirname(__file__), "net.safetensors")) @@ -28,7 +28,7 @@ const getTensorBuffer = (safetensorBuffer, tensorMetadata) => {{ return safetensorBuffer.subarray(...tensorMetadata.data_offsets); }} - + const createEmptyBuf = (device, size) => {{ return device.createBuffer({{size, usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC | GPUBufferUsage.COPY_DST }}); }}; @@ -50,7 +50,7 @@ }}; {kernel_code} - + const setupNet = async (device, safetensor) => {{ const metadata = getTensorMetadata(safetensor); diff --git a/examples/webgpu/index.html b/examples/webgpu/index.html index d662116e9e3ea..a57fc435fca0d 100644 --- a/examples/webgpu/index.html +++ b/examples/webgpu/index.html @@ -56,7 +56,7 @@

WebGPU tinygrad EfficientNe console.log(`${delta} ms ${label}`); document.getElementById('time').innerHTML = `${delta} ms ${label}`; return out; - } + } const getLabels = async () => (await fetch("https://raw.githubusercontent.com/anishathalye/imagenet-simple-labels/master/imagenet-simple-labels.json")).json(); @@ -80,7 +80,7 @@

WebGPU tinygrad EfficientNe const response = await fetch(resource) if (!response.ok) error("sir. that is not a good URL. try a new one"); document.getElementById("imagebox").src = resource - + const img = new Image(); img.crossOrigin = "Anonymous"; img.onload = () => { diff --git a/extra/accel/ane/2_compile/compile.mm b/extra/accel/ane/2_compile/compile.mm index 3940bf095ec99..2ccdae8a53182 100644 --- a/extra/accel/ane/2_compile/compile.mm +++ b/extra/accel/ane/2_compile/compile.mm @@ -36,7 +36,7 @@ int main(int argc, char* argv[]) { CFTypeRef ivalues[2]; ivalues[0] = CFStringCreateWithCString(kCFAllocatorDefault, argv[1], kCFStringEncodingUTF8); ivalues[1] = CFSTR("./"); - + CFDictionaryRef iDictionary = CFDictionaryCreate(kCFAllocatorDefault, ikeys, ivalues, 2, &kCFTypeDictionaryKeyCallBacks, &kCFTypeDictionaryValueCallBacks); CFArrayRef array = CFArrayCreate(kCFAllocatorDefault, (const void**)&iDictionary, 1, &kCFTypeArrayCallBacks); @@ -69,5 +69,6 @@ int main(int argc, char* argv[]) { int ret = ANECCompile(optionsDictionary, flagsDictionary, 0); printf("compile: %d\n", ret); + return ret; } diff --git a/extra/accel/ane/3_run/h11ane.h b/extra/accel/ane/3_run/h11ane.h index 84e4f84730b8d..3cc1e5442b367 100644 --- a/extra/accel/ane/3_run/h11ane.h +++ b/extra/accel/ane/3_run/h11ane.h @@ -142,7 +142,7 @@ namespace H11ANE { int ANE_ForgetFirmware(); - private: // size is 0x88 + private: // size is 0x88 unsigned char unknown[0x88]; }; diff --git a/extra/accel/ane/README.md b/extra/accel/ane/README.md index ba6238f4687db..289cff0006f3f 100644 --- a/extra/accel/ane/README.md +++ b/extra/accel/ane/README.md @@ -52,13 +52,13 @@ Sadly disabling amfi breaks things like vscode. You can runtime patch ``` # MacOS 12.4 -smol :: ~/fun/tinygrad » sha1sum /usr/libexec/amfid +smol :: ~/fun/tinygrad » sha1sum /usr/libexec/amfid 0f7e7f7e41408f83d7ebc7564a3828f41cb2ab58 /usr/libexec/amfid # with patching +0x8e38 (lldb) image list -[ 0] 04B6DF6C-6068-3F18-81A7-978985574387 0x0000000102ad0000 /usr/libexec/amfid +[ 0] 04B6DF6C-6068-3F18-81A7-978985574387 0x0000000102ad0000 /usr/libexec/amfid (lldb) p *(unsigned int *)0x102ad8e38=0xd2800000 ``` @@ -67,7 +67,7 @@ This disables the entitlement check, then you don't need a bootarg. I wish Apple ## Extracting ANEServices.framework ``` -# install xcode and +# install xcode and sudo xcode-select --switch /Applications/Xcode.app # xcode also contains ANEServices.tbd brew install keith/formulae/dyld-shared-cache-extractor @@ -87,7 +87,7 @@ https://github.com/antgroup-arclab/ANETools.git * frame #0: 0x00000001c250fecc AppleNeuralEngine`-[_ANEDaemonConnection loadModel:sandboxExtension:options:qos:withReply:] (lldb) po $x2 _ANEModel: { modelURL=file:///var/folders/l8/38vj8bm52_gfgsqgdn__sh2w0000gn/T/test_F48D9B88-A68D-476F-ADC8-32BDAF9A2498.mlmodelc/ : key={"isegment":0,"inputs":{"image":{"shape":[1,1,1,64,1]},"image2":{"shape":[1,1,1,64,1]}},"outputs":{"probs":{"shape":[1,1,1,64,1]}}} : string_id=0x00000000 : program=(null) : state=1 : programHandle=0 : intermediateBufferHandle=0 : queueDepth=0 : attr={ -} : perfStatsMask=0} +} : perfStatsMask=0} ``` ## Choices diff --git a/extra/accel/ane/README.old b/extra/accel/ane/README.old index a3b512e2e16ea..83b97cb424864 100644 --- a/extra/accel/ane/README.old +++ b/extra/accel/ane/README.old @@ -79,7 +79,7 @@ rbreak ^ZinAneInstruction* weeee ZinIrRegBitPrintOutDebug_7u_ print (void)debugregs(0, 0x0000000100211030+8, 3) -== min.plist == +== min.plist == Types: GOC, Conv, Broadcast, ScaledElementWise, Reshape, InputView, Neuron, Concat @@ -323,7 +323,7 @@ zin_ane_compiler v4.2.1 ANECCompile(__CFDictionary *param_1, __CFDictionary *param_2, unsigned long param_3) param_1: -{ +{ InputNetworks = ( { NetworkPlistName = "net.plist"; diff --git a/extra/accel/ane/aneregs b/extra/accel/ane/aneregs index ea3f8d0b0e559..fef779165a2cb 100644 --- a/extra/accel/ane/aneregs +++ b/extra/accel/ane/aneregs @@ -1,99 +1,99 @@ // ZinIrRegBitPrintOutDebug_7u_ -Task_ID: 0 +Task_ID: 0 header = 10*4 = 0x28 - -aneTD.Header[0].TID = 0 -aneTD.Header[0].NID = 0 -aneTD.Header[0].LNID = 1 -aneTD.Header[0].EON = 1 -aneTD.Header[1].ExeCycles = 0 -aneTD.Header[1].NextSize = 0 -aneTD.Header[2].LogEvents = 1058 -aneTD.Header[3].Exceptions = 0 + +aneTD.Header[0].TID = 0 +aneTD.Header[0].NID = 0 +aneTD.Header[0].LNID = 1 +aneTD.Header[0].EON = 1 +aneTD.Header[1].ExeCycles = 0 +aneTD.Header[1].NextSize = 0 +aneTD.Header[2].LogEvents = 1058 +aneTD.Header[3].Exceptions = 0 aneTD.Header[4].DebugLogEvents = 16775274 aneTD.Header[5].DebugExceptions = 0 -aneTD.Header[6].DisallowAbort = 0 -aneTD.Header[6].TDSkip = 0 -aneTD.Header[6].KPC = 0 -aneTD.Header[6].SPL = 0 -aneTD.Header[6].TSR = 0 -aneTD.Header[6].SPC = 0 -aneTD.Header[6].DPC = 0 -aneTD.Header[6].TSE = 0 +aneTD.Header[6].DisallowAbort = 0 +aneTD.Header[6].TDSkip = 0 +aneTD.Header[6].KPC = 0 +aneTD.Header[6].SPL = 0 +aneTD.Header[6].TSR = 0 +aneTD.Header[6].SPC = 0 +aneTD.Header[6].DPC = 0 +aneTD.Header[6].TSE = 0 aneTD.Header[6].NextPriority = 0 -aneTD.Header[6].TDE = 0 -aneTD.Header[6].SrcLoc = 1 -aneTD.Header[6].DstLoc = 1 -aneTD.Header[6].TQDis = 0 -aneTD.Header[7].NextPointer = 0 -aneTD.Header[8].RBase0 = 5 -aneTD.Header[8].RBE0 = 1 -aneTD.Header[8].RBase1 = 0 -aneTD.Header[8].RBE1 = 0 -aneTD.Header[8].WBase = 4 -aneTD.Header[8].WBE = 1 -aneTD.Header[8].TBase = 0 -aneTD.Header[8].TBE = 0 -aneTD.Header[8].ENE = 1 -aneTD.Header[9].KBase0 = 1 -aneTD.Header[9].KBE0 = 1 -aneTD.Header[9].KBase1 = 0 -aneTD.Header[9].KBE1 = 0 -aneTD.Header[9].KBase2 = 0 -aneTD.Header[9].KBE2 = 0 -aneTD.Header[9].KBase3 = 0 -aneTD.Header[9].KBE3 = 0 - +aneTD.Header[6].TDE = 0 +aneTD.Header[6].SrcLoc = 1 +aneTD.Header[6].DstLoc = 1 +aneTD.Header[6].TQDis = 0 +aneTD.Header[7].NextPointer = 0 +aneTD.Header[8].RBase0 = 5 +aneTD.Header[8].RBE0 = 1 +aneTD.Header[8].RBase1 = 0 +aneTD.Header[8].RBE1 = 0 +aneTD.Header[8].WBase = 4 +aneTD.Header[8].WBE = 1 +aneTD.Header[8].TBase = 0 +aneTD.Header[8].TBE = 0 +aneTD.Header[8].ENE = 1 +aneTD.Header[9].KBase0 = 1 +aneTD.Header[9].KBE0 = 1 +aneTD.Header[9].KBase1 = 0 +aneTD.Header[9].KBE1 = 0 +aneTD.Header[9].KBase2 = 0 +aneTD.Header[9].KBE2 = 0 +aneTD.Header[9].KBase3 = 0 +aneTD.Header[9].KBE3 = 0 + 0x28 = 00 F8 01 F4 = 0x1F800 +0x30 aneRegs.KernelDMASrc.CoeffBaseAddr[0].Addr = 0 aneRegs.KernelDMASrc.CoeffBfrSize[0].MemBfrSize = 2 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint = 2 +aneRegs.KernelDMASrc.CoeffDMAConfig[0].CacheHint = 2 aneRegs.KernelDMASrc.CoeffDMAConfig[0].CrH = 0 aneRegs.KernelDMASrc.CoeffDMAConfig[0].En = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn = 0 +aneRegs.KernelDMASrc.CoeffDMAConfig[0].PrefetchParticipateEn = 0 aneRegs.KernelDMASrc.CoeffBaseAddr[1].Addr = 0 aneRegs.KernelDMASrc.CoeffBfrSize[1].MemBfrSize = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint = 2 +aneRegs.KernelDMASrc.CoeffDMAConfig[1].CacheHint = 2 aneRegs.KernelDMASrc.CoeffDMAConfig[1].CrH = 0 aneRegs.KernelDMASrc.CoeffDMAConfig[1].En = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn = 0 +aneRegs.KernelDMASrc.CoeffDMAConfig[1].PrefetchParticipateEn = 0 aneRegs.KernelDMASrc.CoeffBaseAddr[2].Addr = 0 aneRegs.KernelDMASrc.CoeffBfrSize[2].MemBfrSize = 1 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint = 2 +aneRegs.KernelDMASrc.CoeffDMAConfig[2].CacheHint = 2 aneRegs.KernelDMASrc.CoeffDMAConfig[2].CrH = 0 aneRegs.KernelDMASrc.CoeffDMAConfig[2].En = 0 -aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn = 0 +aneRegs.KernelDMASrc.CoeffDMAConfig[2].PrefetchParticipateEn = 0 # there's 13 more of these aneRegs.KernelDMASrc.Spare0.Spare = 0 -aneRegs.KernelDMASrc.Spare1.Spare = 0 +aneRegs.KernelDMASrc.Spare1.Spare = 0 0x124 = 00 00 00 3C = 0 +0x1d4 -aneRegs.Common.Cfg.AccDoubleBufEn = 1 -aneRegs.Common.Cfg.ActiveNE = 0 +aneRegs.Common.Cfg.AccDoubleBufEn = 1 +aneRegs.Common.Cfg.ActiveNE = 0 aneRegs.Common.Cfg.ContextSwitchIn = 0 aneRegs.Common.Cfg.ContextSwitchOut = 0 -aneRegs.Common.Cfg.ShMax = 1 -aneRegs.Common.Cfg.ShMin = 0 -aneRegs.Common.Cfg.ShPref = 1 +aneRegs.Common.Cfg.ShMax = 1 +aneRegs.Common.Cfg.ShMin = 0 +aneRegs.Common.Cfg.ShPref = 1 aneRegs.Common.Cfg.SmallSourceMode = 0 -aneRegs.Common.ChCfg.InFmt = 2 -aneRegs.Common.ChCfg.OutFmt = 2 -aneRegs.Common.Cin.Cin = 1 -aneRegs.Common.ConvCfg.Kh = 1 -aneRegs.Common.ConvCfg.Kw = 1 -aneRegs.Common.ConvCfg.OCGSize = 0 -aneRegs.Common.ConvCfg.Ox = 1 -aneRegs.Common.ConvCfg.Oy = 1 -aneRegs.Common.ConvCfg.Px = 0 -aneRegs.Common.ConvCfg.Py = 0 -aneRegs.Common.ConvCfg.Sx = 1 -aneRegs.Common.ConvCfg.Sy = 1 -aneRegs.Common.Cout.Cout = 1 -aneRegs.Common.DPE.Category = 0 +aneRegs.Common.ChCfg.InFmt = 2 +aneRegs.Common.ChCfg.OutFmt = 2 +aneRegs.Common.Cin.Cin = 1 +aneRegs.Common.ConvCfg.Kh = 1 +aneRegs.Common.ConvCfg.Kw = 1 +aneRegs.Common.ConvCfg.OCGSize = 0 +aneRegs.Common.ConvCfg.Ox = 1 +aneRegs.Common.ConvCfg.Oy = 1 +aneRegs.Common.ConvCfg.Px = 0 +aneRegs.Common.ConvCfg.Py = 0 +aneRegs.Common.ConvCfg.Sx = 1 +aneRegs.Common.ConvCfg.Sy = 1 +aneRegs.Common.Cout.Cout = 1 +aneRegs.Common.DPE.Category = 0 aneRegs.Common.GroupConvCfg.ElemMultMode = 0 aneRegs.Common.GroupConvCfg.NumGroups = 1 aneRegs.Common.GroupConvCfg.UnicastCin = 1 @@ -132,7 +132,7 @@ aneRegs.TileDMASrc.PixelOffset[1].Offset = 0 aneRegs.TileDMASrc.PixelOffset[2].Offset = 0 aneRegs.TileDMASrc.PixelOffset[3].Offset = 0 aneRegs.TileDMASrc.PlaneStride.PlaneStride = 3 -aneRegs.TileDMASrc.RowStride.Stride = 3 +aneRegs.TileDMASrc.RowStride.Stride = 3 aneRegs.TileDMASrc.Spare0.Spare = 0 aneRegs.TileDMASrc.Spare1.Spare = 0 @@ -145,7 +145,7 @@ aneRegs.L2.ResultCfg.AliasPlanarRslt = 0 aneRegs.L2.ResultCfg.AliasPlanarSrc = 0 aneRegs.L2.ResultCfg.ResultType = 2 aneRegs.L2.ResultCfg.DMACmpVec = 0 -aneRegs.L2.ResultCfg.DMAFmt = 1 +aneRegs.L2.ResultCfg.DMAFmt = 1 aneRegs.L2.ResultCfg.DMAInterleave = 1 aneRegs.L2.ResultCfg.DMAOffsetCh = 0 aneRegs.L2.ResultCfg.L2BfrMode = 1 @@ -173,38 +173,38 @@ aneRegs.L2.SourceRowStride.Stride = 10 +0x2f0 0x23C = 00 C8 00 10 = 0xC800 +0x30c -aneRegs.NE.AccBias.AccBias = 0 -aneRegs.NE.AccBias.AccBiasShift = 0 -aneRegs.NE.KernelCfg.GroupKernelReuse = 0 -aneRegs.NE.KernelCfg.KernelFmt = 0 -aneRegs.NE.KernelCfg.PalettizedBits = 8 -aneRegs.NE.KernelCfg.PalettizedEn = 0 -aneRegs.NE.KernelCfg.SparseFmt = 0 -aneRegs.NE.MACCfg.BiasMode = 0 -aneRegs.NE.MACCfg.BinaryPoint = 0 -aneRegs.NE.MACCfg.KernelMode = 1 -aneRegs.NE.MACCfg.MatrixBiasEn = 0 -aneRegs.NE.MACCfg.NonlinearMode = 2 -aneRegs.NE.MACCfg.OpMode = 4 -aneRegs.NE.MACCfg.PostScaleMode = 0 -aneRegs.NE.MatrixVectorBias.MatrixVectorBias = 0 -aneRegs.NE.PostScale.PostRightShift = 0 -aneRegs.NE.PostScale.PostScale = 15360 -aneRegs.NE.Spare0.Spare = 0 -aneRegs.NE.Spare1.Spare = 0 +aneRegs.NE.AccBias.AccBias = 0 +aneRegs.NE.AccBias.AccBiasShift = 0 +aneRegs.NE.KernelCfg.GroupKernelReuse = 0 +aneRegs.NE.KernelCfg.KernelFmt = 0 +aneRegs.NE.KernelCfg.PalettizedBits = 8 +aneRegs.NE.KernelCfg.PalettizedEn = 0 +aneRegs.NE.KernelCfg.SparseFmt = 0 +aneRegs.NE.MACCfg.BiasMode = 0 +aneRegs.NE.MACCfg.BinaryPoint = 0 +aneRegs.NE.MACCfg.KernelMode = 1 +aneRegs.NE.MACCfg.MatrixBiasEn = 0 +aneRegs.NE.MACCfg.NonlinearMode = 2 +aneRegs.NE.MACCfg.OpMode = 4 +aneRegs.NE.MACCfg.PostScaleMode = 0 +aneRegs.NE.MatrixVectorBias.MatrixVectorBias = 0 +aneRegs.NE.PostScale.PostRightShift = 0 +aneRegs.NE.PostScale.PostScale = 15360 +aneRegs.NE.Spare0.Spare = 0 +aneRegs.NE.Spare1.Spare = 0 0x254 = 00 78 01 18 = 0x17800 +0x32c -aneRegs.TileDMADst.BaseAddr.Addr = 0 +aneRegs.TileDMADst.BaseAddr.Addr = 0 aneRegs.TileDMADst.DepthStride.DepthStride = 3 aneRegs.TileDMADst.DMAConfig.BypassEOW = 0 aneRegs.TileDMADst.DMAConfig.CacheHint = 3 -aneRegs.TileDMADst.DMAConfig.CrH = 0 -aneRegs.TileDMADst.DMAConfig.En = 1 +aneRegs.TileDMADst.DMAConfig.CrH = 0 +aneRegs.TileDMADst.DMAConfig.En = 1 aneRegs.TileDMADst.DMAConfig.L2BfrMode = 1 aneRegs.TileDMADst.Fmt.CmpVec = 0 aneRegs.TileDMADst.Fmt.CmpVecFill = 0 -aneRegs.TileDMADst.Fmt.FmtMode = 1 +aneRegs.TileDMADst.Fmt.FmtMode = 1 aneRegs.TileDMADst.Fmt.Interleave = 1 aneRegs.TileDMADst.Fmt.MemFmt = 2 aneRegs.TileDMADst.Fmt.OffsetCh = 0 diff --git a/extra/accel/ane/lib/ane.py b/extra/accel/ane/lib/ane.py index 3d193123cf09c..f7f710a69e11a 100755 --- a/extra/accel/ane/lib/ane.py +++ b/extra/accel/ane/lib/ane.py @@ -214,7 +214,7 @@ def fill(self, dat, addrs, type, val, base=0x4000): mdf = ane.pack(dd, md) assert(md == mdf) - comp = ane.compile(dat) + comp = ane.compile(dat) ret = ane.run(comp, tin, tout) print("** after **") print(tind) diff --git a/extra/datasets/coco.py b/extra/datasets/coco.py index 5b359745877d3..008efae4fa412 100644 --- a/extra/datasets/coco.py +++ b/extra/datasets/coco.py @@ -24,7 +24,7 @@ def create_dict(key_row, val_row, rows): return {row[key_row]:row[val_row] for r with zipfile.ZipFile(fn, 'r') as zip_ref: zip_ref.extractall(BASEDIR) fn.unlink() - + if not pathlib.Path(BASEDIR/'annotations').is_dir(): fn = BASEDIR/'annotations_trainval2017.zip' @@ -178,7 +178,7 @@ def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"): with open(json_result_file, "r") as f: for line in f: coco_results.append(json.loads(line)) - + coco_gt = COCO(str(BASEDIR/'annotations/instances_val2017.json')) set_of_json = remove_dup([json.dumps(d, cls=NpEncoder) for d in coco_results]) unique_list = [json.loads(s) for s in set_of_json] @@ -186,7 +186,7 @@ def evaluate_predictions_on_coco(json_result_file, iou_type="bbox"): with open(f'{json_result_file}.flattend', "w") as f: json.dump(unique_list, f) - coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend')) + coco_dt = coco_gt.loadRes(str(f'{json_result_file}.flattend')) coco_eval = COCOeval(coco_gt, coco_dt, iou_type) coco_eval.evaluate() coco_eval.accumulate() diff --git a/extra/gemm/amx.py b/extra/gemm/amx.py index 1ad0cd7db2ff0..5901a8a633b5a 100755 --- a/extra/gemm/amx.py +++ b/extra/gemm/amx.py @@ -56,7 +56,7 @@ def op_gpr(op, builder, gpr): builder.asm(ir.FunctionType(ir.VoidType(), [ir.Int module = ir.Module(name=__file__) func = ir.Function(module, ir.FunctionType(ir.IntType(64), [ir.FloatType().as_pointer()]*3), name='exec') -# load all +# load all entry = ir.IRBuilder(func.append_basic_block(name="entry")) zm, xm, ym = [entry.ptrtoint(func.args[i], ir.IntType(64)) for i in range(3)] diff --git a/extra/gemm/gemm.c b/extra/gemm/gemm.c index ecc4038bc3aa7..fb77c7b264285 100644 --- a/extra/gemm/gemm.c +++ b/extra/gemm/gemm.c @@ -135,7 +135,7 @@ int main() { for (int i = 0; i < 4000; i++) { memset(C, 0, N*N*sizeof(float)); -#if NTHREADS != 1 +#if NTHREADS != 1 nready = 0; ndone = 0; pthread_mutex_lock(&lock); @@ -147,7 +147,7 @@ int main() { #endif uint64_t start = nanos(); -#if NTHREADS == 1 +#if NTHREADS == 1 matmul(0, N); #else // unlocking mutex starts threads @@ -156,7 +156,7 @@ int main() { #endif uint64_t end = nanos(); -#if NTHREADS != 1 +#if NTHREADS != 1 for (int j = 0; j < NTHREADS; j++) { pthread_join(threads[j], NULL); } diff --git a/extra/gemm/gemm.py b/extra/gemm/gemm.py index f6f13d8f99fd1..5073cc05cbfed 100755 --- a/extra/gemm/gemm.py +++ b/extra/gemm/gemm.py @@ -21,7 +21,7 @@ et = time.monotonic() s = et-st print(f"{flop/s * 1e-9:.2f} GFLOP/S, {s*1e3:.2f} ms") - + with open("/tmp/matmul", "wb") as f: f.write(A.data) f.write(B.data) diff --git a/extra/intel/go.sh b/extra/intel/go.sh index a84f19d0ef1e1..8c67088c0575e 100755 --- a/extra/intel/go.sh +++ b/extra/intel/go.sh @@ -1,3 +1,3 @@ #!/bin/bash -e /opt/intel/oneapi/compiler/latest/linux/bin-llvm/clang++ joint_matrix_bfloat16.cpp -fsycl -SYCL_PI_TRACE=1 ./a.out +SYCL_PI_TRACE=1 ./a.out diff --git a/test/external/external_test_image.py b/test/external/external_test_image.py index 3e246eef70bff..68d0aec3481dd 100644 --- a/test/external/external_test_image.py +++ b/test/external/external_test_image.py @@ -23,7 +23,7 @@ def test_sum_image(self): t1 = t1.sum() t1.realize() assert t1.numpy() == 16*4*4*4, f"got {t1.numpy()}" - + def test_add_image(self): t1 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 3 t2 = Tensor.ones(16, 16, 1).reshape(16, 4, 4) + 4 @@ -38,7 +38,7 @@ def test_padded_conv(self): tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1) tiny_dat = Tensor.ones(bs, 12, 64, 128) tiny_conv(tiny_dat).realize() - + def test_op_conv(self): bs, in_chans, out_chans = 1,12,32 tiny_conv = Conv2d(in_chans, out_chans, 3, bias=None, padding=1) diff --git a/test/external/external_test_opt.py b/test/external/external_test_opt.py index 9dc08dbd4f681..dd5e14ff285cf 100644 --- a/test/external/external_test_opt.py +++ b/test/external/external_test_opt.py @@ -366,7 +366,7 @@ def _test_fold_expand_reduce_helper(self, n, m, axis, allowed): a = Tensor.ones(n, m).sum(axis).reshape(n, 1).expand(n, m).sum(axis) a.realize() cache_len = len(GlobalCounters.cache) - np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) + np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) return cache_len def test_expand_reduce_is_folded_on_same_axis(self): @@ -377,9 +377,9 @@ def test_expand_reduce_is_folded_on_same_axis(self): a = Tensor.ones(n, n).sum(axis).reshape(n, 1).expand(n, n).sum(axis) a.realize() cache_len = len(GlobalCounters.cache) - np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) + np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) return cache_len - + def test_expand_reduce_is_not_folded_on_different_axes(self): axis1, axis2 = 0, 1 for n in [4, 8, 16]: @@ -388,7 +388,7 @@ def test_expand_reduce_is_not_folded_on_different_axes(self): a = Tensor.ones(n, n).sum(axis1).reshape(n, 1).expand(n, n).sum(axis2) a.realize() cache_len = len(GlobalCounters.cache) - np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) + np.testing.assert_allclose(a.numpy(), b.numpy(), rtol=1e-3, atol=1e-5) return cache_len if __name__ == '__main__': diff --git a/test/external/external_test_yolov8.py b/test/external/external_test_yolov8.py index 4555c455513af..e250c59c2e641 100644 --- a/test/external/external_test_yolov8.py +++ b/test/external/external_test_yolov8.py @@ -13,22 +13,22 @@ def test_all_load_weights(self): for variant in ['n', 's', 'm', 'l', 'x']: weights_location = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.safetensors' download_file(f'https://gitlab.com/r3sist/yolov8_weights/-/raw/master/yolov8{variant}.safetensors', weights_location) - - depth, width, ratio = get_variant_multiples(variant) - TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) + + depth, width, ratio = get_variant_multiples(variant) + TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) state_dict = safe_load(weights_location) load_state_dict(TinyYolov8, state_dict) print(f'successfully loaded weights for yolov{variant}') - + def test_predictions(self): test_image_urls = ['https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg', 'https://www.aljazeera.com/wp-content/uploads/2022/10/2022-04-28T192650Z_1186456067_UP1EI4S1I0P14_RTRMADP_3_SOCCER-ENGLAND-MUN-CHE-REPORT.jpg'] variant = 'n' weights_location = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.safetensors' - depth, width, ratio = get_variant_multiples(variant) - TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) + depth, width, ratio = get_variant_multiples(variant) + TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) state_dict = safe_load(weights_location) load_state_dict(TinyYolov8, state_dict) - + for i in range(len(test_image_urls)): img_stream = io.BytesIO(fetch(test_image_urls[i])) img = cv2.imdecode(np.frombuffer(img_stream.read(), np.uint8), 1) @@ -37,40 +37,41 @@ def test_predictions(self): post_predictions = postprocess(preds=predictions, img=test_image, orig_imgs=[img]) labels = label_predictions(post_predictions) assert labels == {5: 1, 0: 4, 11: 1} if i == 0 else labels == {0: 13, 29: 1, 32: 1} - + def test_forward_pass_torch_onnx(self): variant = 'n' - weights_location_onnx = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.onnx' - weights_location_pt = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.pt' - weights_location = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.safetensors' + weights_location_onnx = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.onnx' + weights_location_pt = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.pt' + weights_location = Path(__file__).parent.parent.parent / "weights" / f'yolov8{variant}.safetensors' download_file(f'https://github.com/ultralytics/assets/releases/download/v0.0.0/yolov8{variant}.pt', weights_location_pt) # the ultralytics export prints a lot of unneccesary things if not os.path.isfile(weights_location_onnx): - model = ultralytics.YOLO(model=weights_location_pt, task='Detect') - model.export(format="onnx",imgsz=[640, 480]) + model = ultralytics.YOLO(model=weights_location_pt, task='Detect') + model.export(format="onnx",imgsz=[640, 480]) - depth, width, ratio = get_variant_multiples(variant) - TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) + depth, width, ratio = get_variant_multiples(variant) + TinyYolov8 = YOLOv8(w=width, r=ratio, d=depth, num_classes=80) state_dict = safe_load(weights_location) load_state_dict(TinyYolov8, state_dict) - + image_location = [np.frombuffer(io.BytesIO(fetch('https://raw.githubusercontent.com/ultralytics/yolov5/master/data/images/bus.jpg')).read(), np.uint8)] orig_image = [cv2.imdecode(image_location[0], 1)] - + input_image = preprocess(orig_image) - + onnx_session = ort.InferenceSession(weights_location_onnx) onnx_input_name = onnx_session.get_inputs()[0].name onnx_output_name = onnx_session.get_outputs()[0].name onnx_output = onnx_session.run([onnx_output_name], {onnx_input_name: input_image.cpu().numpy()}) tiny_output = TinyYolov8(input_image) - - # currently rtol is 0.025 because there is a 1-2% difference in our predictions - # because of the zero padding in SPPF module (line 280) maxpooling layers rather than the -infinity in torch. - # This difference does not make a difference "visually". + + # currently rtol is 0.025 because there is a 1-2% difference in our predictions + # because of the zero padding in SPPF module (line 280) maxpooling layers rather than the -infinity in torch. + # This difference does not make a difference "visually". np.testing.assert_allclose(onnx_output[0], tiny_output.cpu().numpy(), atol=5e-4, rtol=0.025) - + if __name__ == '__main__': unittest.main() + \ No newline at end of file diff --git a/test/external/graph_batchnorm.py b/test/external/graph_batchnorm.py index 08db2f81d8dac..2fa98b05c80b9 100644 --- a/test/external/graph_batchnorm.py +++ b/test/external/graph_batchnorm.py @@ -33,7 +33,7 @@ def forward(self, x): return self.c2(self.c(x)).relu() lm = LilModel() model_step(lm) - + def test_two_conv_bn(self): class LilModel: def __init__(self): diff --git a/test/extra/test_utils.py b/test/extra/test_utils.py index c72b4eb7aedca..6350403ade335 100644 --- a/test/extra/test_utils.py +++ b/test/extra/test_utils.py @@ -6,7 +6,7 @@ import torch import numpy as np -from tinygrad.helpers import getenv +from tinygrad.helpers import getenv from extra.utils import fetch, temp, download_file from tinygrad.state import torch_load from PIL import Image @@ -33,7 +33,7 @@ def setUp(self): os.chdir(self.tempdir.name) with open('test_file.txt', 'x') as f: f.write("12345") - + def tearDown(self): os.chdir(self.working_dir) self.tempdir.cleanup() @@ -41,7 +41,7 @@ def tearDown(self): #test ./ def test_fetch_relative_dotslash(self): self.assertEqual(b'12345', fetch("./test_file.txt")) - + #test ../ def test_fetch_relative_dotdotslash(self): os.mkdir('test_file_path') @@ -92,7 +92,7 @@ def __init__(self): ) if isfloat16: model = model.half() - path = temp(f"test_load_{isfloat16}.pt") + path = temp(f"test_load_{isfloat16}.pt") torch.save(model.state_dict(), path) model2 = torch_load(path) @@ -102,6 +102,5 @@ def __init__(self): assert a.shape == b.shape assert a.dtype == b.dtype assert np.array_equal(a, b) - if __name__ == '__main__': - unittest.main() + unittest.main() \ No newline at end of file diff --git a/test/test_webgpu.js b/test/test_webgpu.js index 53c9204b531b8..ce12330f665a3 100644 --- a/test/test_webgpu.js +++ b/test/test_webgpu.js @@ -10,7 +10,7 @@ function cleanup(err) { res.kill(); if(err != null) { console.error(err); - process.exit(1); + process.exit(1); } } @@ -24,7 +24,7 @@ async function waitForText(selector, text) { ready = true; break } - await timeout(2000); + await timeout(2000); n += 1 } return ready; diff --git a/tinygrad/runtime/ops_metal.py b/tinygrad/runtime/ops_metal.py index 566e6c7c75bb9..7b364dd35d909 100644 --- a/tinygrad/runtime/ops_metal.py +++ b/tinygrad/runtime/ops_metal.py @@ -11,7 +11,7 @@ class _METAL: def __init__(self): self.device = Metal.MTLCreateSystemDefaultDevice() - self.dispatch_group = libdispatch.dispatch_group_create() + self.dispatch_group = libdispatch.dispatch_group_create() self.mtl_queue = self.device.newCommandQueue() def command_buffer(self): command_buffer = self.mtl_queue.commandBuffer()