We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
======================================================== AN INTERNAL KERNEL BUILD ERROR OCCURRED! device name = gfx1010 error = -11 memory pattern = Cached global memory based trmm, computing kernel generator Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 8, dims[0].y = 32, dims[0].x = 8, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64 Kernel extra flags: 942700465 Source: typedef union GPtr { __global float *f; __global float2 *f2v; __global float4 *f4v; __global float8 *f8v; __global float16 *f16v; } GPtr; typedef union LPtr { __local float *f; __local float2 *f2v; __local float4 *f4v; __local float8 *f8v; __local float16 *f16v; } LPtr; typedef union PPtr { float *f; float2 *f2v; float4 *f4v; float8 *f8v; float16 *f16v; } PPtr; __attribute__((reqd_work_group_size(64, 1, 1))) void __kernel strmmSubgroup( uint M, uint N, float alpha, const __global float *restrict A, uint lda, const __global float *restrict B, __global float *C, uint ldb) { float8 a0, a1, a2, a3; float8 b0, b1, b2, b3; float4 c0, c1, c2, c3; uint currM, currN; uint4 coord = 0; /* contains coordB, coordA, k */ const int lid = get_local_id(0); const int gid = get_global_id(0) / 64; int2 itemId; int2 subgCoord; itemId.x = get_local_id(0)%4; itemId.y = get_local_id(0)/4; subgCoord.x = itemId.y/8; subgCoord.y = itemId.y%8; currN = gid * 8; currM = (M - 1) / 32 * 32; GPtr Ag = {A}; GPtr Bg = {B}; coord.x = currN + subgCoord.x*4; for (uint m0 = 0; m0 < M; m0 += 32) { uint kBegin = 0; coord.z = kBegin; coord.y = currM + subgCoord.y*4; c0 = 0; c1 = 0; c2 = 0; c3 = 0; if ((coord.x < N) && (coord.y < M)) { if (coord.y + 4 > M) { coord.y -= 4 - M % 4; } if (coord.x + 4 > N) { coord.x -= 4 - N % 4; } uint k0; uint kMax; kMax = currM - currM%8; for( k0 = 0; k0 < kMax; k0 += 32 ) { coord.z=(k0+itemId.x*8+64*gid)%kMax; /* -- Tiles multiplier -- */ const uint4 bx = {mad24(coord.x % N, ldb, 0u), mad24((coord.x + 1) % N, ldb, 0u), mad24((coord.x + 2) % N, ldb, 0u), mad24((coord.x + 3) % N, ldb, 0u)}; const uint8 bk = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M; b0.s0 = (Bg).f[bx.s0 + bk.s0]; b0.s1 = (Bg).f[bx.s0 + bk.s1]; b0.s2 = (Bg).f[bx.s0 + bk.s2]; b0.s3 = (Bg).f[bx.s0 + bk.s3]; b0.s4 = (Bg).f[bx.s0 + bk.s4]; b0.s5 = (Bg).f[bx.s0 + bk.s5]; b0.s6 = (Bg).f[bx.s0 + bk.s6]; b0.s7 = (Bg).f[bx.s0 + bk.s7]; b1.s0 = (Bg).f[bx.s1 + bk.s0]; b1.s1 = (Bg).f[bx.s1 + bk.s1]; b1.s2 = (Bg).f[bx.s1 + bk.s2]; b1.s3 = (Bg).f[bx.s1 + bk.s3]; b1.s4 = (Bg).f[bx.s1 + bk.s4]; b1.s5 = (Bg).f[bx.s1 + bk.s5]; b1.s6 = (Bg).f[bx.s1 + bk.s6]; b1.s7 = (Bg).f[bx.s1 + bk.s7]; b2.s0 = (Bg).f[bx.s2 + bk.s0]; b2.s1 = (Bg).f[bx.s2 + bk.s1]; b2.s2 = (Bg).f[bx.s2 + bk.s2]; b2.s3 = (Bg).f[bx.s2 + bk.s3]; b2.s4 = (Bg).f[bx.s2 + bk.s4]; b2.s5 = (Bg).f[bx.s2 + bk.s5]; b2.s6 = (Bg).f[bx.s2 + bk.s6]; b2.s7 = (Bg).f[bx.s2 + bk.s7]; b3.s0 = (Bg).f[bx.s3 + bk.s0]; b3.s1 = (Bg).f[bx.s3 + bk.s1]; b3.s2 = (Bg).f[bx.s3 + bk.s2]; b3.s3 = (Bg).f[bx.s3 + bk.s3]; b3.s4 = (Bg).f[bx.s3 + bk.s4]; b3.s5 = (Bg).f[bx.s3 + bk.s5]; b3.s6 = (Bg).f[bx.s3 + bk.s6]; b3.s7 = (Bg).f[bx.s3 + bk.s7]; const uint4 ay = {mad24(coord.y % M, lda, 0u), mad24((coord.y + 1) % M, lda, 0u), mad24((coord.y + 2) % M, lda, 0u), mad24((coord.y + 3) % M, lda, 0u)}; const uint8 ak = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M; a0.s0 = (Ag).f[ay.s0 + ak.s0]; a0.s1 = (Ag).f[ay.s0 + ak.s1]; a0.s2 = (Ag).f[ay.s0 + ak.s2]; a0.s3 = (Ag).f[ay.s0 + ak.s3]; a0.s4 = (Ag).f[ay.s0 + ak.s4]; a0.s5 = (Ag).f[ay.s0 + ak.s5]; a0.s6 = (Ag).f[ay.s0 + ak.s6]; a0.s7 = (Ag).f[ay.s0 + ak.s7]; a1.s0 = (Ag).f[ay.s1 + ak.s0]; a1.s1 = (Ag).f[ay.s1 + ak.s1]; a1.s2 = (Ag).f[ay.s1 + ak.s2]; a1.s3 = (Ag).f[ay.s1 + ak.s3]; a1.s4 = (Ag).f[ay.s1 + ak.s4]; a1.s5 = (Ag).f[ay.s1 + ak.s5]; a1.s6 = (Ag).f[ay.s1 + ak.s6]; a1.s7 = (Ag).f[ay.s1 + ak.s7]; a2.s0 = (Ag).f[ay.s2 + ak.s0]; a2.s1 = (Ag).f[ay.s2 + ak.s1]; a2.s2 = (Ag).f[ay.s2 + ak.s2]; a2.s3 = (Ag).f[ay.s2 + ak.s3]; a2.s4 = (Ag).f[ay.s2 + ak.s4]; a2.s5 = (Ag).f[ay.s2 + ak.s5]; a2.s6 = (Ag).f[ay.s2 + ak.s6]; a2.s7 = (Ag).f[ay.s2 + ak.s7]; a3.s0 = (Ag).f[ay.s3 + ak.s0]; a3.s1 = (Ag).f[ay.s3 + ak.s1]; a3.s2 = (Ag).f[ay.s3 + ak.s2]; a3.s3 = (Ag).f[ay.s3 + ak.s3]; a3.s4 = (Ag).f[ay.s3 + ak.s4]; a3.s5 = (Ag).f[ay.s3 + ak.s5]; a3.s6 = (Ag).f[ay.s3 + ak.s6]; a3.s7 = (Ag).f[ay.s3 + ak.s7]; c0.s0 = mad(a0.s0, b0.s0, c0.s0); c0.s0 = mad(a0.s1, b0.s1, c0.s0); c0.s0 = mad(a0.s2, b0.s2, c0.s0); c0.s0 = mad(a0.s3, b0.s3, c0.s0); c0.s0 = mad(a0.s4, b0.s4, c0.s0); c0.s0 = mad(a0.s5, b0.s5, c0.s0); c0.s0 = mad(a0.s6, b0.s6, c0.s0); c0.s0 = mad(a0.s7, b0.s7, c0.s0); c1.s0 = mad(a0.s0, b1.s0, c1.s0); c1.s0 = mad(a0.s1, b1.s1, c1.s0); c1.s0 = mad(a0.s2, b1.s2, c1.s0); c1.s0 = mad(a0.s3, b1.s3, c1.s0); c1.s0 = mad(a0.s4, b1.s4, c1.s0); c1.s0 = mad(a0.s5, b1.s5, c1.s0); c1.s0 = mad(a0.s6, b1.s6, c1.s0); c1.s0 = mad(a0.s7, b1.s7, c1.s0); c2.s0 = mad(a0.s0, b2.s0, c2.s0); c2.s0 = mad(a0.s1, b2.s1, c2.s0); c2.s0 = mad(a0.s2, b2.s2, c2.s0); c2.s0 = mad(a0.s3, b2.s3, c2.s0); c2.s0 = mad(a0.s4, b2.s4, c2.s0); c2.s0 = mad(a0.s5, b2.s5, c2.s0); c2.s0 = mad(a0.s6, b2.s6, c2.s0); c2.s0 = mad(a0.s7, b2.s7, c2.s0); c3.s0 = mad(a0.s0, b3.s0, c3.s0); c3.s0 = mad(a0.s1, b3.s1, c3.s0); c3.s0 = mad(a0.s2, b3.s2, c3.s0); c3.s0 = mad(a0.s3, b3.s3, c3.s0); c3.s0 = mad(a0.s4, b3.s4, c3.s0); c3.s0 = mad(a0.s5, b3.s5, c3.s0); c3.s0 = mad(a0.s6, b3.s6, c3.s0); c3.s0 = mad(a0.s7, b3.s7, c3.s0); c0.s1 = mad(a1.s0, b0.s0, c0.s1); c0.s1 = mad(a1.s1, b0.s1, c0.s1); c0.s1 = mad(a1.s2, b0.s2, c0.s1); c0.s1 = mad(a1.s3, b0.s3, c0.s1); c0.s1 = mad(a1.s4, b0.s4, c0.s1); c0.s1 = mad(a1.s5, b0.s5, c0.s1); c0.s1 = mad(a1.s6, b0.s6, c0.s1); c0.s1 = mad(a1.s7, b0.s7, c0.s1); c1.s1 = mad(a1.s0, b1.s0, c1.s1); c1.s1 = mad(a1.s1, b1.s1, c1.s1); c1.s1 = mad(a1.s2, b1.s2, c1.s1); c1.s1 = mad(a1.s3, b1.s3, c1.s1); c1.s1 = mad(a1.s4, b1.s4, c1.s1); c1.s1 = mad(a1.s5, b1.s5, c1.s1); c1.s1 = mad(a1.s6, b1.s6, c1.s1); c1.s1 = mad(a1.s7, b1.s7, c1.s1); c2.s1 = mad(a1.s0, b2.s0, c2.s1); c2.s1 = mad(a1.s1, b2.s1, c2.s1); c2.s1 = mad(a1.s2, b2.s2, c2.s1); c2.s1 = mad(a1.s3, b2.s3, c2.s1); c2.s1 = mad(a1.s4, b2.s4, c2.s1); c2.s1 = mad(a1.s5, b2.s5, c2.s1); c2.s1 = mad(a1.s6, b2.s6, c2.s1); c2.s1 = mad(a1.s7, b2.s7, c2.s1); c3.s1 = mad(a1.s0, b3.s0, c3.s1); c3.s1 = mad(a1.s1, b3.s1, c3.s1); c3.s1 = mad(a1.s2, b3.s2, c3.s1); c3.s1 = mad(a1.s3, b3.s3, c3.s1); c3.s1 = mad(a1.s4, b3.s4, c3.s1); c3.s1 = mad(a1.s5, b3.s5, c3.s1); c3.s1 = mad(a1.s6, b3.s6, c3.s1); c3.s1 = mad(a1.s7, b3.s7, c3.s1); c0.s2 = mad(a2.s0, b0.s0, c0.s2); c0.s2 = mad(a2.s1, b0.s1, c0.s2); c0.s2 = mad(a2.s2, b0.s2, c0.s2); c0.s2 = mad(a2.s3, b0.s3, c0.s2); c0.s2 = mad(a2.s4, b0.s4, c0.s2); c0.s2 = mad(a2.s5, b0.s5, c0.s2); c0.s2 = mad(a2.s6, b0.s6, c0.s2); c0.s2 = mad(a2.s7, b0.s7, c0.s2); c1.s2 = mad(a2.s0, b1.s0, c1.s2); c1.s2 = mad(a2.s1, b1.s1, c1.s2); c1.s2 = mad(a2.s2, b1.s2, c1.s2); c1.s2 = mad(a2.s3, b1.s3, c1.s2); c1.s2 = mad(a2.s4, b1.s4, c1.s2); c1.s2 = mad(a2.s5, b1.s5, c1.s2); c1.s2 = mad(a2.s6, b1.s6, c1.s2); c1.s2 = mad(a2.s7, b1.s7, c1.s2); c2.s2 = mad(a2.s0, b2.s0, c2.s2); c2.s2 = mad(a2.s1, b2.s1, c2.s2); c2.s2 = mad(a2.s2, b2.s2, c2.s2); c2.s2 = mad(a2.s3, b2.s3, c2.s2); c2.s2 = mad(a2.s4, b2.s4, c2.s2); c2.s2 = mad(a2.s5, b2.s5, c2.s2); c2.s2 = mad(a2.s6, b2.s6, c2.s2); c2.s2 = mad(a2.s7, b2.s7, c2.s2); c3.s2 = mad(a2.s0, b3.s0, c3.s2); c3.s2 = mad(a2.s1, b3.s1, c3.s2); c3.s2 = mad(a2.s2, b3.s2, c3.s2); c3.s2 = mad(a2.s3, b3.s3, c3.s2); c3.s2 = mad(a2.s4, b3.s4, c3.s2); c3.s2 = mad(a2.s5, b3.s5, c3.s2); c3.s2 = mad(a2.s6, b3.s6, c3.s2); c3.s2 = mad(a2.s7, b3.s7, c3.s2); c0.s3 = mad(a3.s0, b0.s0, c0.s3); c0.s3 = mad(a3.s1, b0.s1, c0.s3); c0.s3 = mad(a3.s2, b0.s2, c0.s3); c0.s3 = mad(a3.s3, b0.s3, c0.s3); c0.s3 = mad(a3.s4, b0.s4, c0.s3); c0.s3 = mad(a3.s5, b0.s5, c0.s3); c0.s3 = mad(a3.s6, b0.s6, c0.s3); c0.s3 = mad(a3.s7, b0.s7, c0.s3); c1.s3 = mad(a3.s0, b1.s0, c1.s3); c1.s3 = mad(a3.s1, b1.s1, c1.s3); c1.s3 = mad(a3.s2, b1.s2, c1.s3); c1.s3 = mad(a3.s3, b1.s3, c1.s3); c1.s3 = mad(a3.s4, b1.s4, c1.s3); c1.s3 = mad(a3.s5, b1.s5, c1.s3); c1.s3 = mad(a3.s6, b1.s6, c1.s3); c1.s3 = mad(a3.s7, b1.s7, c1.s3); c2.s3 = mad(a3.s0, b2.s0, c2.s3); c2.s3 = mad(a3.s1, b2.s1, c2.s3); c2.s3 = mad(a3.s2, b2.s2, c2.s3); c2.s3 = mad(a3.s3, b2.s3, c2.s3); c2.s3 = mad(a3.s4, b2.s4, c2.s3); c2.s3 = mad(a3.s5, b2.s5, c2.s3); c2.s3 = mad(a3.s6, b2.s6, c2.s3); c2.s3 = mad(a3.s7, b2.s7, c2.s3); c3.s3 = mad(a3.s0, b3.s0, c3.s3); c3.s3 = mad(a3.s1, b3.s1, c3.s3); c3.s3 = mad(a3.s2, b3.s2, c3.s3); c3.s3 = mad(a3.s3, b3.s3, c3.s3); c3.s3 = mad(a3.s4, b3.s4, c3.s3); c3.s3 = mad(a3.s5, b3.s5, c3.s3); c3.s3 = mad(a3.s6, b3.s6, c3.s3); c3.s3 = mad(a3.s7, b3.s7, c3.s3); /* ---------------------- */ } if( itemId.x == 0 ) { for( k0 = kMax; (k0 < currM+32)&&(k0 < M); k0 += 1 ) { coord.z=k0; /* -- Tiles multiplier -- */ const uint bk = coord.z % M; b0.s0 = (Bg).f[mad24(coord.x % N, ldb, bk)]; b1.s0 = (Bg).f[mad24((coord.x + 1) % N, ldb, bk)]; b2.s0 = (Bg).f[mad24((coord.x + 2) % N, ldb, bk)]; b3.s0 = (Bg).f[mad24((coord.x + 3) % N, ldb, bk)]; b0.s0 = (coord.z < M) ? b0.s0 : 0; b1.s0 = (coord.z < M) ? b1.s0 : 0; b2.s0 = (coord.z < M) ? b2.s0 : 0; b3.s0 = (coord.z < M) ? b3.s0 : 0; const uint ak = coord.z % M; a0.s0 = (Ag).f[mad24(coord.y % M, lda, ak)]; a1.s0 = (Ag).f[mad24((coord.y + 1) % M, lda, ak)]; a2.s0 = (Ag).f[mad24((coord.y + 2) % M, lda, ak)]; a3.s0 = (Ag).f[mad24((coord.y + 3) % M, lda, ak)]; a0.s0 = (coord.z < M) ? a0.s0 : 0; a1.s0 = (coord.z < M) ? a1.s0 : 0; a2.s0 = (coord.z < M) ? a2.s0 : 0; a3.s0 = (coord.z < M) ? a3.s0 : 0; // post fetch A { uint zy = coord.y; a0.s0 = zy < coord.z ? 0 : a0.s0; a0.s0 = zy == coord.z ? 1 : a0.s0; zy++; a1.s0 = zy < coord.z ? 0 : a1.s0; a1.s0 = zy == coord.z ? 1 : a1.s0; zy++; a2.s0 = zy < coord.z ? 0 : a2.s0; a2.s0 = zy == coord.z ? 1 : a2.s0; zy++; a3.s0 = zy < coord.z ? 0 : a3.s0; a3.s0 = zy == coord.z ? 1 : a3.s0; } c0.s0 = mad(a0.s0, b0.s0, c0.s0); c1.s0 = mad(a0.s0, b1.s0, c1.s0); c2.s0 = mad(a0.s0, b2.s0, c2.s0); c3.s0 = mad(a0.s0, b3.s0, c3.s0); c0.s1 = mad(a1.s0, b0.s0, c0.s1); c1.s1 = mad(a1.s0, b1.s0, c1.s1); c2.s1 = mad(a1.s0, b2.s0, c2.s1); c3.s1 = mad(a1.s0, b3.s0, c3.s1); c0.s2 = mad(a2.s0, b0.s0, c0.s2); c1.s2 = mad(a2.s0, b1.s0, c1.s2); c2.s2 = mad(a2.s0, b2.s0, c2.s2); c3.s2 = mad(a2.s0, b3.s0, c3.s2); c0.s3 = mad(a3.s0, b0.s0, c0.s3); c1.s3 = mad(a3.s0, b1.s0, c1.s3); c2.s3 = mad(a3.s0, b2.s0, c2.s3); c3.s3 = mad(a3.s0, b3.s0, c3.s3); /* ---------------------- */ } } } barrier(CLK_GLOBAL_MEM_FENCE); if ((coord.y + 4 == M) && (M % 4)) { coord.y += 4 - M % 4; } if ((coord.x + 4 == N) && (N % 4)) { coord.x += 4 - N % 4; } //-----MergeUpdateResult // veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC] __local float4 ascratch[4*16*4]; __local float4 *scratch = ascratch; //LDS block has the same vectorization as C matrix block //VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) ); scratch += 4*((itemId.y%16)*4 +itemId.x ); for( uint mstep = 0; mstep < 16; mstep += 16 ) { if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) { scratch[0] = c0; scratch[1] = c1; scratch[2] = c2; scratch[3] = c3; c0 = 0; c1 = 0; c2 = 0; c3 = 0; } barrier(CLK_LOCAL_MEM_FENCE); if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) { if ( 0 == itemId.x ) { for(uint k = 0; k < 4 * 4; k += 4) { c0 += scratch[0]; c1 += scratch[1]; c2 += scratch[2]; c3 += scratch[3]; //Adding the LDS block size in vectors scratch += 4; } if ((coord.y < M) && (coord.x < N)) { uint y = min(4u, M - (uint)coord.y); uint x = min(4u, N - (uint)coord.x); if ((y == 4) && (x == 4)) { GPtr uC; uC.f = C + coord.x * ldb + coord.y; __global float *pC = uC.f; float4 tempC0, tempC1, tempC2, tempC3; tempC0 = mad(c0, alpha, 0); tempC1 = mad(c1, alpha, 0); tempC2 = mad(c2, alpha, 0); tempC3 = mad(c3, alpha, 0); pC[0] = tempC0.s0; pC[1] = tempC0.s1; pC[2] = tempC0.s2; pC[3] = tempC0.s3; pC[ldb] = tempC1.s0; pC[ldb + 1] = tempC1.s1; pC[ldb + 2] = tempC1.s2; pC[ldb + 3] = tempC1.s3; pC[(ldb << 1)] = tempC2.s0; pC[mad24(2u, ldb, 1u)] = tempC2.s1; pC[mad24(2u, ldb, 2u)] = tempC2.s2; pC[mad24(2u, ldb, 3u)] = tempC2.s3; pC[mad24(3u, ldb, 0u)] = tempC3.s0; pC[mad24(3u, ldb, 1u)] = tempC3.s1; pC[mad24(3u, ldb, 2u)] = tempC3.s2; pC[mad24(3u, ldb, 3u)] = tempC3.s3; } else { GPtr uC; int i, j; PPtr res; uC.f = C + coord.x * ldb + coord.y; uC.f += (x-1) * ldb; if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c3.s0 * alpha; case 3: uC.f[(y+1) % 4] = c3.s1 * alpha; case 2: uC.f[(y+2) % 4] = c3.s2 * alpha; case 1: uC.f[(y+3) % 4] = c3.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c2.s0 * alpha; case 3: uC.f[(y+1) % 4] = c2.s1 * alpha; case 2: uC.f[(y+2) % 4] = c2.s2 * alpha; case 1: uC.f[(y+3) % 4] = c2.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c1.s0 * alpha; case 3: uC.f[(y+1) % 4] = c1.s1 * alpha; case 2: uC.f[(y+2) % 4] = c1.s2 * alpha; case 1: uC.f[(y+3) % 4] = c1.s3 * alpha; } uC.f -= ldb; x--; } if (x) { switch (y) { case 4: uC.f[(y+0) % 4] = c0.s0 * alpha; case 3: uC.f[(y+1) % 4] = c0.s1 * alpha; case 2: uC.f[(y+2) % 4] = c0.s2 * alpha; case 1: uC.f[(y+3) % 4] = c0.s3 * alpha; } uC.f -= ldb; x--; } } } } } barrier(CLK_LOCAL_MEM_FENCE); } currM -= 32; } } -------------------------------------------------------- Build log: /tmp/comgr-a1a18b/input/CompileCLSource:56:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers GPtr Ag = {A}; ^ /tmp/comgr-a1a18b/input/CompileCLSource:57:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers GPtr Bg = {B}; ^ /tmp/comgr-a1a18b/input/CompileCLSource:366:24: error: variables in the local address space can only be declared in the outermost scope of a kernel function __local float4 ascratch[4*16*4]; ^ 2 warnings and 1 error generated. Error: Failed to compile opencl source (from CL to LLVM IR). ======================================================== Segmentation fault (core dumped)
Additionally, several tests fail, e.g.
[ RUN ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39 m : 6 n: 63 /home/robin/dev/clBLAS/src/tests/include/matrix.h:472: Failure The difference between ((ref).s[0]) and ((clresult).s[0]) is 58230133, which exceeds delta, where ((ref).s[0]) evaluates to -270497230451976, ((clresult).s[0]) evaluates to -270497288682109, and delta evaluates to 0. clblasColumnMajor, clblasTrans, clblasTrans, M = 128, N = 128, K = 128, offA = 1, offB = 0, offC = 0, lda = 500, ldb = 501, ldc = 502 seed = 12345, queues = 1, [ FAILED ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39, where GetParam() = (1, 1, 1, 128, 128, 128, 48-byte object <F4-01 00-00 00-00 00-00 F5-01 00-00 00-00 00-00 F6-01 00-00 00-00 00-00 01-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00>, 1) (9 ms)
The text was updated successfully, but these errors were encountered:
No branches or pull requests
Additionally, several tests fail, e.g.
The text was updated successfully, but these errors were encountered: