Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

test-short failure on gfx1010 (RX5700 XT) #351

Open
robinchrist opened this issue Apr 22, 2020 · 0 comments
Open

test-short failure on gfx1010 (RX5700 XT) #351

robinchrist opened this issue Apr 22, 2020 · 0 comments

Comments

@robinchrist
Copy link

========================================================

AN INTERNAL KERNEL BUILD ERROR OCCURRED!
device name = gfx1010
error = -11
memory pattern = Cached global memory based trmm, computing kernel generator
Subproblem dimensions: dims[0].itemY = 32, dims[0].itemX = 8, dims[0].y = 32, dims[0].x = 8, dims[0].bwidth = 32; ; dims[1].itemY = 4, dims[1].itemX = 4, dims[1].y = 4, dims[1].x = 4, dims[1].bwidth = 8; ; 
Parallelism granularity: pgran->wgDim = 1, pgran->wgSize[0] = 64, pgran->wgSize[1] = 1, pgran->wfSize = 64
Kernel extra flags: 942700465
Source:

typedef union GPtr {
    __global float *f;
    __global float2 *f2v;
    __global float4 *f4v;
    __global float8 *f8v;
    __global float16 *f16v;
} GPtr;

typedef union LPtr {
    __local float *f;
    __local float2 *f2v;
    __local float4 *f4v;
    __local float8 *f8v;
    __local float16 *f16v;
} LPtr;

typedef union PPtr {
    float *f;
    float2 *f2v;
    float4 *f4v;
    float8 *f8v;
    float16 *f16v;
} PPtr;

__attribute__((reqd_work_group_size(64, 1, 1)))
void __kernel
strmmSubgroup(
    uint M,
    uint N,
    float alpha,
    const __global float *restrict A,
    uint lda,
    const __global float *restrict B,
    __global float *C,
    uint ldb)
{
    float8 a0, a1, a2, a3;
    float8 b0, b1, b2, b3;
    float4 c0, c1, c2, c3;
    uint currM, currN;
    uint4 coord = 0; /* contains coordB, coordA, k */
    const int lid = get_local_id(0);
    const int gid = get_global_id(0) / 64;


    int2 itemId;
    int2 subgCoord;
    itemId.x = get_local_id(0)%4;
    itemId.y = get_local_id(0)/4;
    subgCoord.x = itemId.y/8;
    subgCoord.y = itemId.y%8;

    currN = gid * 8;
    currM = (M - 1) / 32 * 32;

    GPtr Ag = {A};
    GPtr Bg = {B};
    coord.x = currN + subgCoord.x*4;
    for (uint m0 = 0; m0 < M; m0 += 32) {
        uint kBegin = 0;
        coord.z = kBegin;
        coord.y = currM + subgCoord.y*4;
        c0 = 0;
        c1 = 0;
        c2 = 0;
        c3 = 0;

        if ((coord.x < N) && (coord.y < M)) {
            if (coord.y + 4 > M) {
                coord.y -= 4 - M % 4;
            }
            if (coord.x + 4 > N) {
                coord.x -= 4 - N % 4;
            }

            uint k0;
            uint kMax;
            kMax = currM - currM%8;
            for( k0 = 0; k0 < kMax; k0 += 32 ) {
                coord.z=(k0+itemId.x*8+64*gid)%kMax;
                /* -- Tiles multiplier -- */
                const uint4 bx = {mad24(coord.x % N, ldb, 0u), mad24((coord.x + 1) % N, ldb, 0u), mad24((coord.x + 2) % N, ldb, 0u),
                                mad24((coord.x + 3) % N, ldb, 0u)};
                const uint8 bk = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;

                b0.s0 = (Bg).f[bx.s0 + bk.s0];
                b0.s1 = (Bg).f[bx.s0 + bk.s1];
                b0.s2 = (Bg).f[bx.s0 + bk.s2];
                b0.s3 = (Bg).f[bx.s0 + bk.s3];
                b0.s4 = (Bg).f[bx.s0 + bk.s4];
                b0.s5 = (Bg).f[bx.s0 + bk.s5];
                b0.s6 = (Bg).f[bx.s0 + bk.s6];
                b0.s7 = (Bg).f[bx.s0 + bk.s7];
                b1.s0 = (Bg).f[bx.s1 + bk.s0];
                b1.s1 = (Bg).f[bx.s1 + bk.s1];
                b1.s2 = (Bg).f[bx.s1 + bk.s2];
                b1.s3 = (Bg).f[bx.s1 + bk.s3];
                b1.s4 = (Bg).f[bx.s1 + bk.s4];
                b1.s5 = (Bg).f[bx.s1 + bk.s5];
                b1.s6 = (Bg).f[bx.s1 + bk.s6];
                b1.s7 = (Bg).f[bx.s1 + bk.s7];
                b2.s0 = (Bg).f[bx.s2 + bk.s0];
                b2.s1 = (Bg).f[bx.s2 + bk.s1];
                b2.s2 = (Bg).f[bx.s2 + bk.s2];
                b2.s3 = (Bg).f[bx.s2 + bk.s3];
                b2.s4 = (Bg).f[bx.s2 + bk.s4];
                b2.s5 = (Bg).f[bx.s2 + bk.s5];
                b2.s6 = (Bg).f[bx.s2 + bk.s6];
                b2.s7 = (Bg).f[bx.s2 + bk.s7];
                b3.s0 = (Bg).f[bx.s3 + bk.s0];
                b3.s1 = (Bg).f[bx.s3 + bk.s1];
                b3.s2 = (Bg).f[bx.s3 + bk.s2];
                b3.s3 = (Bg).f[bx.s3 + bk.s3];
                b3.s4 = (Bg).f[bx.s3 + bk.s4];
                b3.s5 = (Bg).f[bx.s3 + bk.s5];
                b3.s6 = (Bg).f[bx.s3 + bk.s6];
                b3.s7 = (Bg).f[bx.s3 + bk.s7];

                const uint4 ay = {mad24(coord.y % M, lda, 0u), mad24((coord.y + 1) % M, lda, 0u), mad24((coord.y + 2) % M, lda, 0u),
                                mad24((coord.y + 3) % M, lda, 0u)};
                const uint8 ak = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + coord.z) % M;

                a0.s0 = (Ag).f[ay.s0 + ak.s0];
                a0.s1 = (Ag).f[ay.s0 + ak.s1];
                a0.s2 = (Ag).f[ay.s0 + ak.s2];
                a0.s3 = (Ag).f[ay.s0 + ak.s3];
                a0.s4 = (Ag).f[ay.s0 + ak.s4];
                a0.s5 = (Ag).f[ay.s0 + ak.s5];
                a0.s6 = (Ag).f[ay.s0 + ak.s6];
                a0.s7 = (Ag).f[ay.s0 + ak.s7];
                a1.s0 = (Ag).f[ay.s1 + ak.s0];
                a1.s1 = (Ag).f[ay.s1 + ak.s1];
                a1.s2 = (Ag).f[ay.s1 + ak.s2];
                a1.s3 = (Ag).f[ay.s1 + ak.s3];
                a1.s4 = (Ag).f[ay.s1 + ak.s4];
                a1.s5 = (Ag).f[ay.s1 + ak.s5];
                a1.s6 = (Ag).f[ay.s1 + ak.s6];
                a1.s7 = (Ag).f[ay.s1 + ak.s7];
                a2.s0 = (Ag).f[ay.s2 + ak.s0];
                a2.s1 = (Ag).f[ay.s2 + ak.s1];
                a2.s2 = (Ag).f[ay.s2 + ak.s2];
                a2.s3 = (Ag).f[ay.s2 + ak.s3];
                a2.s4 = (Ag).f[ay.s2 + ak.s4];
                a2.s5 = (Ag).f[ay.s2 + ak.s5];
                a2.s6 = (Ag).f[ay.s2 + ak.s6];
                a2.s7 = (Ag).f[ay.s2 + ak.s7];
                a3.s0 = (Ag).f[ay.s3 + ak.s0];
                a3.s1 = (Ag).f[ay.s3 + ak.s1];
                a3.s2 = (Ag).f[ay.s3 + ak.s2];
                a3.s3 = (Ag).f[ay.s3 + ak.s3];
                a3.s4 = (Ag).f[ay.s3 + ak.s4];
                a3.s5 = (Ag).f[ay.s3 + ak.s5];
                a3.s6 = (Ag).f[ay.s3 + ak.s6];
                a3.s7 = (Ag).f[ay.s3 + ak.s7];

                c0.s0 = mad(a0.s0, b0.s0, c0.s0);
                c0.s0 = mad(a0.s1, b0.s1, c0.s0);
                c0.s0 = mad(a0.s2, b0.s2, c0.s0);
                c0.s0 = mad(a0.s3, b0.s3, c0.s0);
                c0.s0 = mad(a0.s4, b0.s4, c0.s0);
                c0.s0 = mad(a0.s5, b0.s5, c0.s0);
                c0.s0 = mad(a0.s6, b0.s6, c0.s0);
                c0.s0 = mad(a0.s7, b0.s7, c0.s0);
                c1.s0 = mad(a0.s0, b1.s0, c1.s0);
                c1.s0 = mad(a0.s1, b1.s1, c1.s0);
                c1.s0 = mad(a0.s2, b1.s2, c1.s0);
                c1.s0 = mad(a0.s3, b1.s3, c1.s0);
                c1.s0 = mad(a0.s4, b1.s4, c1.s0);
                c1.s0 = mad(a0.s5, b1.s5, c1.s0);
                c1.s0 = mad(a0.s6, b1.s6, c1.s0);
                c1.s0 = mad(a0.s7, b1.s7, c1.s0);
                c2.s0 = mad(a0.s0, b2.s0, c2.s0);
                c2.s0 = mad(a0.s1, b2.s1, c2.s0);
                c2.s0 = mad(a0.s2, b2.s2, c2.s0);
                c2.s0 = mad(a0.s3, b2.s3, c2.s0);
                c2.s0 = mad(a0.s4, b2.s4, c2.s0);
                c2.s0 = mad(a0.s5, b2.s5, c2.s0);
                c2.s0 = mad(a0.s6, b2.s6, c2.s0);
                c2.s0 = mad(a0.s7, b2.s7, c2.s0);
                c3.s0 = mad(a0.s0, b3.s0, c3.s0);
                c3.s0 = mad(a0.s1, b3.s1, c3.s0);
                c3.s0 = mad(a0.s2, b3.s2, c3.s0);
                c3.s0 = mad(a0.s3, b3.s3, c3.s0);
                c3.s0 = mad(a0.s4, b3.s4, c3.s0);
                c3.s0 = mad(a0.s5, b3.s5, c3.s0);
                c3.s0 = mad(a0.s6, b3.s6, c3.s0);
                c3.s0 = mad(a0.s7, b3.s7, c3.s0);

                c0.s1 = mad(a1.s0, b0.s0, c0.s1);
                c0.s1 = mad(a1.s1, b0.s1, c0.s1);
                c0.s1 = mad(a1.s2, b0.s2, c0.s1);
                c0.s1 = mad(a1.s3, b0.s3, c0.s1);
                c0.s1 = mad(a1.s4, b0.s4, c0.s1);
                c0.s1 = mad(a1.s5, b0.s5, c0.s1);
                c0.s1 = mad(a1.s6, b0.s6, c0.s1);
                c0.s1 = mad(a1.s7, b0.s7, c0.s1);
                c1.s1 = mad(a1.s0, b1.s0, c1.s1);
                c1.s1 = mad(a1.s1, b1.s1, c1.s1);
                c1.s1 = mad(a1.s2, b1.s2, c1.s1);
                c1.s1 = mad(a1.s3, b1.s3, c1.s1);
                c1.s1 = mad(a1.s4, b1.s4, c1.s1);
                c1.s1 = mad(a1.s5, b1.s5, c1.s1);
                c1.s1 = mad(a1.s6, b1.s6, c1.s1);
                c1.s1 = mad(a1.s7, b1.s7, c1.s1);
                c2.s1 = mad(a1.s0, b2.s0, c2.s1);
                c2.s1 = mad(a1.s1, b2.s1, c2.s1);
                c2.s1 = mad(a1.s2, b2.s2, c2.s1);
                c2.s1 = mad(a1.s3, b2.s3, c2.s1);
                c2.s1 = mad(a1.s4, b2.s4, c2.s1);
                c2.s1 = mad(a1.s5, b2.s5, c2.s1);
                c2.s1 = mad(a1.s6, b2.s6, c2.s1);
                c2.s1 = mad(a1.s7, b2.s7, c2.s1);
                c3.s1 = mad(a1.s0, b3.s0, c3.s1);
                c3.s1 = mad(a1.s1, b3.s1, c3.s1);
                c3.s1 = mad(a1.s2, b3.s2, c3.s1);
                c3.s1 = mad(a1.s3, b3.s3, c3.s1);
                c3.s1 = mad(a1.s4, b3.s4, c3.s1);
                c3.s1 = mad(a1.s5, b3.s5, c3.s1);
                c3.s1 = mad(a1.s6, b3.s6, c3.s1);
                c3.s1 = mad(a1.s7, b3.s7, c3.s1);

                c0.s2 = mad(a2.s0, b0.s0, c0.s2);
                c0.s2 = mad(a2.s1, b0.s1, c0.s2);
                c0.s2 = mad(a2.s2, b0.s2, c0.s2);
                c0.s2 = mad(a2.s3, b0.s3, c0.s2);
                c0.s2 = mad(a2.s4, b0.s4, c0.s2);
                c0.s2 = mad(a2.s5, b0.s5, c0.s2);
                c0.s2 = mad(a2.s6, b0.s6, c0.s2);
                c0.s2 = mad(a2.s7, b0.s7, c0.s2);
                c1.s2 = mad(a2.s0, b1.s0, c1.s2);
                c1.s2 = mad(a2.s1, b1.s1, c1.s2);
                c1.s2 = mad(a2.s2, b1.s2, c1.s2);
                c1.s2 = mad(a2.s3, b1.s3, c1.s2);
                c1.s2 = mad(a2.s4, b1.s4, c1.s2);
                c1.s2 = mad(a2.s5, b1.s5, c1.s2);
                c1.s2 = mad(a2.s6, b1.s6, c1.s2);
                c1.s2 = mad(a2.s7, b1.s7, c1.s2);
                c2.s2 = mad(a2.s0, b2.s0, c2.s2);
                c2.s2 = mad(a2.s1, b2.s1, c2.s2);
                c2.s2 = mad(a2.s2, b2.s2, c2.s2);
                c2.s2 = mad(a2.s3, b2.s3, c2.s2);
                c2.s2 = mad(a2.s4, b2.s4, c2.s2);
                c2.s2 = mad(a2.s5, b2.s5, c2.s2);
                c2.s2 = mad(a2.s6, b2.s6, c2.s2);
                c2.s2 = mad(a2.s7, b2.s7, c2.s2);
                c3.s2 = mad(a2.s0, b3.s0, c3.s2);
                c3.s2 = mad(a2.s1, b3.s1, c3.s2);
                c3.s2 = mad(a2.s2, b3.s2, c3.s2);
                c3.s2 = mad(a2.s3, b3.s3, c3.s2);
                c3.s2 = mad(a2.s4, b3.s4, c3.s2);
                c3.s2 = mad(a2.s5, b3.s5, c3.s2);
                c3.s2 = mad(a2.s6, b3.s6, c3.s2);
                c3.s2 = mad(a2.s7, b3.s7, c3.s2);

                c0.s3 = mad(a3.s0, b0.s0, c0.s3);
                c0.s3 = mad(a3.s1, b0.s1, c0.s3);
                c0.s3 = mad(a3.s2, b0.s2, c0.s3);
                c0.s3 = mad(a3.s3, b0.s3, c0.s3);
                c0.s3 = mad(a3.s4, b0.s4, c0.s3);
                c0.s3 = mad(a3.s5, b0.s5, c0.s3);
                c0.s3 = mad(a3.s6, b0.s6, c0.s3);
                c0.s3 = mad(a3.s7, b0.s7, c0.s3);
                c1.s3 = mad(a3.s0, b1.s0, c1.s3);
                c1.s3 = mad(a3.s1, b1.s1, c1.s3);
                c1.s3 = mad(a3.s2, b1.s2, c1.s3);
                c1.s3 = mad(a3.s3, b1.s3, c1.s3);
                c1.s3 = mad(a3.s4, b1.s4, c1.s3);
                c1.s3 = mad(a3.s5, b1.s5, c1.s3);
                c1.s3 = mad(a3.s6, b1.s6, c1.s3);
                c1.s3 = mad(a3.s7, b1.s7, c1.s3);
                c2.s3 = mad(a3.s0, b2.s0, c2.s3);
                c2.s3 = mad(a3.s1, b2.s1, c2.s3);
                c2.s3 = mad(a3.s2, b2.s2, c2.s3);
                c2.s3 = mad(a3.s3, b2.s3, c2.s3);
                c2.s3 = mad(a3.s4, b2.s4, c2.s3);
                c2.s3 = mad(a3.s5, b2.s5, c2.s3);
                c2.s3 = mad(a3.s6, b2.s6, c2.s3);
                c2.s3 = mad(a3.s7, b2.s7, c2.s3);
                c3.s3 = mad(a3.s0, b3.s0, c3.s3);
                c3.s3 = mad(a3.s1, b3.s1, c3.s3);
                c3.s3 = mad(a3.s2, b3.s2, c3.s3);
                c3.s3 = mad(a3.s3, b3.s3, c3.s3);
                c3.s3 = mad(a3.s4, b3.s4, c3.s3);
                c3.s3 = mad(a3.s5, b3.s5, c3.s3);
                c3.s3 = mad(a3.s6, b3.s6, c3.s3);
                c3.s3 = mad(a3.s7, b3.s7, c3.s3);
                /* ---------------------- */
            }
            if( itemId.x == 0 ) {
                for( k0 = kMax; (k0 < currM+32)&&(k0 < M); k0 += 1 ) {
                    coord.z=k0;
                    /* -- Tiles multiplier -- */
                    const uint bk = coord.z % M;

                    b0.s0 = (Bg).f[mad24(coord.x % N, ldb, bk)];
                    b1.s0 = (Bg).f[mad24((coord.x + 1) % N, ldb, bk)];
                    b2.s0 = (Bg).f[mad24((coord.x + 2) % N, ldb, bk)];
                    b3.s0 = (Bg).f[mad24((coord.x + 3) % N, ldb, bk)];

                    b0.s0 = (coord.z < M) ? b0.s0 : 0;
                    b1.s0 = (coord.z < M) ? b1.s0 : 0;
                    b2.s0 = (coord.z < M) ? b2.s0 : 0;
                    b3.s0 = (coord.z < M) ? b3.s0 : 0;

                    const uint ak = coord.z % M;

                    a0.s0 = (Ag).f[mad24(coord.y % M, lda, ak)];
                    a1.s0 = (Ag).f[mad24((coord.y + 1) % M, lda, ak)];
                    a2.s0 = (Ag).f[mad24((coord.y + 2) % M, lda, ak)];
                    a3.s0 = (Ag).f[mad24((coord.y + 3) % M, lda, ak)];

                    a0.s0 = (coord.z < M) ? a0.s0 : 0;
                    a1.s0 = (coord.z < M) ? a1.s0 : 0;
                    a2.s0 = (coord.z < M) ? a2.s0 : 0;
                    a3.s0 = (coord.z < M) ? a3.s0 : 0;
                    // post fetch A
                    {
                        uint zy = coord.y;
                        a0.s0 = zy < coord.z ? 0 : a0.s0;
                        a0.s0 = zy == coord.z ? 1 : a0.s0;
                        zy++;
                        a1.s0 = zy < coord.z ? 0 : a1.s0;
                        a1.s0 = zy == coord.z ? 1 : a1.s0;
                        zy++;
                        a2.s0 = zy < coord.z ? 0 : a2.s0;
                        a2.s0 = zy == coord.z ? 1 : a2.s0;
                        zy++;
                        a3.s0 = zy < coord.z ? 0 : a3.s0;
                        a3.s0 = zy == coord.z ? 1 : a3.s0;
                    }

                    c0.s0 = mad(a0.s0, b0.s0, c0.s0);
                    c1.s0 = mad(a0.s0, b1.s0, c1.s0);
                    c2.s0 = mad(a0.s0, b2.s0, c2.s0);
                    c3.s0 = mad(a0.s0, b3.s0, c3.s0);

                    c0.s1 = mad(a1.s0, b0.s0, c0.s1);
                    c1.s1 = mad(a1.s0, b1.s0, c1.s1);
                    c2.s1 = mad(a1.s0, b2.s0, c2.s1);
                    c3.s1 = mad(a1.s0, b3.s0, c3.s1);

                    c0.s2 = mad(a2.s0, b0.s0, c0.s2);
                    c1.s2 = mad(a2.s0, b1.s0, c1.s2);
                    c2.s2 = mad(a2.s0, b2.s0, c2.s2);
                    c3.s2 = mad(a2.s0, b3.s0, c3.s2);

                    c0.s3 = mad(a3.s0, b0.s0, c0.s3);
                    c1.s3 = mad(a3.s0, b1.s0, c1.s3);
                    c2.s3 = mad(a3.s0, b2.s0, c2.s3);
                    c3.s3 = mad(a3.s0, b3.s0, c3.s3);
                    /* ---------------------- */
                }
            }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);
        if ((coord.y + 4 == M) && (M % 4)) {
            coord.y += 4 - M % 4;
        }
        if ((coord.x + 4 == N) && (N % 4)) {
            coord.x += 4 - N % 4;
        }

        //-----MergeUpdateResult

        // veclenC scratch[SUBG_ITEMS*MSTEP_SUBG*vecNumC]
        __local float4 ascratch[4*16*4];
        __local float4 *scratch = ascratch;

        //LDS block has the same vectorization as C matrix block
        //VNUM_C*((get_local_id(1)%MSTEP_SUBG)*SUBG_ITEMS +get_local_id(0) );
        scratch += 4*((itemId.y%16)*4 +itemId.x );

        for( uint mstep = 0; mstep < 16; mstep += 16 ) {

            if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {

                scratch[0] = c0;
                scratch[1] = c1;
                scratch[2] = c2;
                scratch[3] = c3;

                c0 = 0;
                c1 = 0;
                c2 = 0;
                c3 = 0;

            }

            barrier(CLK_LOCAL_MEM_FENCE);

            if( (itemId.y >= mstep)&&(itemId.y < (mstep+16)) ) {
                if ( 0 == itemId.x ) {

                    for(uint k = 0; k < 4 * 4; k += 4) {

                        c0 += scratch[0];
                        c1 += scratch[1];
                        c2 += scratch[2];
                        c3 += scratch[3];

                        //Adding the LDS block size in vectors
                        scratch += 4;
                    }

                    if ((coord.y < M) && (coord.x < N)) {
                        uint y = min(4u, M - (uint)coord.y);
                        uint x = min(4u, N - (uint)coord.x);
                        if ((y == 4) && (x == 4)) {
                            GPtr uC;

                            uC.f = C + coord.x * ldb + coord.y;

                            __global float *pC = uC.f;

                            float4 tempC0, tempC1, tempC2, tempC3;

                            tempC0 = mad(c0, alpha, 0);
                            tempC1 = mad(c1, alpha, 0);
                            tempC2 = mad(c2, alpha, 0);
                            tempC3 = mad(c3, alpha, 0);
                            pC[0] = tempC0.s0;
                            pC[1] = tempC0.s1;
                            pC[2] = tempC0.s2;
                            pC[3] = tempC0.s3;
                            pC[ldb] = tempC1.s0;
                            pC[ldb + 1] = tempC1.s1;
                            pC[ldb + 2] = tempC1.s2;
                            pC[ldb + 3] = tempC1.s3;
                            pC[(ldb << 1)] = tempC2.s0;
                            pC[mad24(2u, ldb, 1u)] = tempC2.s1;
                            pC[mad24(2u, ldb, 2u)] = tempC2.s2;
                            pC[mad24(2u, ldb, 3u)] = tempC2.s3;
                            pC[mad24(3u, ldb, 0u)] = tempC3.s0;
                            pC[mad24(3u, ldb, 1u)] = tempC3.s1;
                            pC[mad24(3u, ldb, 2u)] = tempC3.s2;
                            pC[mad24(3u, ldb, 3u)] = tempC3.s3;
                        }
                        else  {
                            GPtr uC;
                            int i, j;
                            PPtr res;

                            uC.f = C + coord.x * ldb + coord.y;

                            uC.f += (x-1) * ldb;

                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c3.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c3.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c3.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c3.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c2.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c2.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c2.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c2.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c1.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c1.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c1.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c1.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                            if (x)  {
                                switch (y) {
                                    case 4:
                                    uC.f[(y+0) % 4] = c0.s0 * alpha;
                                    case 3:
                                    uC.f[(y+1) % 4] = c0.s1 * alpha;
                                    case 2:
                                    uC.f[(y+2) % 4] = c0.s2 * alpha;
                                    case 1:
                                    uC.f[(y+3) % 4] = c0.s3 * alpha;
                                }
                                uC.f -= ldb;
                                x--;
                            }
                        }
                    }

                }
            }
            barrier(CLK_LOCAL_MEM_FENCE);
        }
        currM -= 32;
    }
}



--------------------------------------------------------

Build log:

/tmp/comgr-a1a18b/input/CompileCLSource:56:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
    GPtr Ag = {A};
               ^
/tmp/comgr-a1a18b/input/CompileCLSource:57:16: warning: initializing '__global float *' with an expression of type 'const __global float *restrict' discards qualifiers
    GPtr Bg = {B};
               ^
/tmp/comgr-a1a18b/input/CompileCLSource:366:24: error: variables in the local address space can only be declared in the outermost scope of a kernel function
        __local float4 ascratch[4*16*4];
                       ^
2 warnings and 1 error generated.
Error: Failed to compile opencl source (from CL to LLVM IR).

========================================================

Segmentation fault (core dumped)

Additionally, several tests fail, e.g.

[ RUN      ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39
m : 6    n: 63
/home/robin/dev/clBLAS/src/tests/include/matrix.h:472: Failure
The difference between ((ref).s[0]) and ((clresult).s[0]) is 58230133, which exceeds delta, where
((ref).s[0]) evaluates to -270497230451976,
((clresult).s[0]) evaluates to -270497288682109, and
delta evaluates to 0.
clblasColumnMajor, clblasTrans, clblasTrans, M = 128, N = 128, K = 128, offA = 1, offB = 0, offC = 0, lda = 500, ldb = 501, ldc = 502
             seed = 12345, queues = 1, [  FAILED  ] ColumnMajor_SmallRange_BigLDA_OffSet/GEMM.zgemm/39, where GetParam() = (1, 1, 1, 128, 128, 128, 48-byte object <F4-01 00-00 00-00 00-00 F5-01 00-00 00-00 00-00 F6-01 00-00 00-00 00-00 01-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00 00-00>, 1) (9 ms)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant