CUDA 8 Doesn't Work - MSVC #81
Apparently something changed, and even v7.5 doesn't work when I try to launch the examples.
The code that is generated by the compiler is:

```cuda
#include "int.h"
#include "float.h"
#include "float3.h"
#include "float4.h"
#include "double.h"
#include "double3.h"
#include "double4.h"
#include "curand.h"
/**
* Kernel globals
*/
__device__ static int cl_cuda_examples_defglobal_foo = 0;
__constant__ static int cl_cuda_examples_defglobal_bar = 0;
__constant__ static float4 cl_cuda_examples_sph_box_min = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float4 cl_cuda_examples_sph_box_max = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float4 cl_cuda_examples_sph_origin = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float cl_cuda_examples_sph_delta = 0.0f;
__constant__ static int cl_cuda_examples_sph_capacity = 0;
__constant__ static int cl_cuda_examples_sph_size_x = 0;
__constant__ static int cl_cuda_examples_sph_size_y = 0;
__constant__ static int cl_cuda_examples_sph_size_z = 0;
/**
* Kernel function prototypes
*/
extern "C" __global__ void cl_cuda_examples_diffuse0_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 );
extern "C" __global__ void cl_cuda_examples_diffuse1_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 );
extern "C" __global__ void cl_cuda_examples_vector_add_vec_add_kernel( float* a, float* b, float* c, int n );
extern "C" __global__ void cl_cuda_examples_defglobal_add_globals( int* out );
extern "C" __device__ float cl_cuda_examples_sph_norm( float4 x );
extern "C" __device__ int cl_cuda_examples_sph_offset( int i, int j, int k, int l );
extern "C" __global__ void cl_cuda_examples_sph_update_neighbor_map( int* neighbor_map, float4* pos, int n );
extern "C" __global__ void cl_cuda_examples_sph_clear_neighbor_map( int* neighbor_map );
extern "C" __device__ int cl_cuda_examples_sph_apply_collision( float4* acc, int i, float x0, float x1, float4 v, float4 normal );
extern "C" __device__ int cl_cuda_examples_sph_apply_accel_limit( float4* acc, int i );
extern "C" __global__ void cl_cuda_examples_sph_boundary_condition( float4* acc, float4* pos, float4* vel, int n );
extern "C" __device__ float cl_cuda_examples_sph_poly6_kernel( float4 x );
extern "C" __device__ float4 cl_cuda_examples_sph_grad_spiky_kernel( float4 x );
extern "C" __device__ float cl_cuda_examples_sph_rap_visc_kernel( float4 x );
extern "C" __global__ void cl_cuda_examples_sph_update_density( float* rho, float4* pos, int n, int* neighbor_map );
extern "C" __global__ void cl_cuda_examples_sph_update_pressure( float* prs, float* rho, int n );
extern "C" __device__ float4 cl_cuda_examples_sph_pressure_term( float* rho, float* prs, int i, int j, float4 dr );
extern "C" __device__ float4 cl_cuda_examples_sph_viscosity_term( float4* vel, float* rho, int i, int j, float4 dr );
extern "C" __global__ void cl_cuda_examples_sph_update_force( float4* force, float4* pos, float4* vel, float* rho, float* prs, int n, int* neighbor_map );
extern "C" __global__ void cl_cuda_examples_sph_update_acceleration( float4* acc, float4* force, float* rho, int n );
extern "C" __global__ void cl_cuda_examples_sph_update_velocity( float4* vel, float4* acc, int n );
extern "C" __global__ void cl_cuda_examples_sph_update_position( float4* pos, float4* vel, int n );
/**
* Kernel function definitions
*/
__global__ void cl_cuda_examples_diffuse0_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 )
{
{
int jy = ((blockDim.y * blockIdx.y) + threadIdx.y);
{
int jx = ((blockDim.x * blockIdx.x) + threadIdx.x);
{
int j = ((nx * jy) + jx);
{
float fcc = f[j];
float fcw = 0.0f;
float fce = 0.0f;
float fcs = 0.0f;
float fcn = 0.0f;
if ((jx == 0)) {
fcw = fcc;
} else {
fcw = f[(j - 1)];
}
if ((jx == (nx - 1))) {
fce = fcc;
} else {
fce = f[(j + 1)];
}
if ((jy == 0)) {
fcs = fcc;
} else {
fcs = f[(j - nx)];
}
if ((jy == (ny - 1))) {
fcn = fcc;
} else {
fcn = f[(j + nx)];
}
fn[j] = ((c0 * (fce + fcw)) + ((c1 * (fcn + fcs)) + (c2 * fcc)));
}
}
}
}
}
__global__ void cl_cuda_examples_diffuse1_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 )
{
{
int jx = (threadIdx.x + 1);
{
int jy = (threadIdx.y + 1);
{
int j = ((nx * ((blockDim.y * blockIdx.y) + threadIdx.y)) + ((blockDim.x * blockIdx.x) + threadIdx.x));
{
float fcc = f[j];
{
__shared__ float fs[(16 + 2)][(16 + 2)];
fs[jy][jx] = fcc;
if ((threadIdx.x == 0)) {
if ((blockIdx.x == 0)) {
fs[jy][0] = fcc;
} else {
fs[jy][0] = f[(j - 1)];
}
}
if ((threadIdx.x == (blockDim.x - 1))) {
if ((blockIdx.x == (gridDim.x - 1))) {
fs[jy][(blockDim.x + 1)] = fcc;
} else {
fs[jy][(blockDim.x + 1)] = f[(j + 1)];
}
}
if ((threadIdx.y == 0)) {
if ((blockIdx.y == 0)) {
fs[0][jx] = fcc;
} else {
fs[0][jx] = f[(j - nx)];
}
}
if ((threadIdx.y == (blockDim.y - 1))) {
if ((blockIdx.y == (gridDim.y - 1))) {
fs[(blockDim.y + 1)][jx] = fcc;
} else {
fs[(blockDim.y + 1)][jx] = f[(j + nx)];
}
}
__syncthreads();
fn[j] = ((c0 * (fs[jy][(jx + 1)] + fs[jy][(jx - 1)])) + ((c1 * (fs[(jy + 1)][jx] + fs[(jy - 1)][jx])) + (c2 * fs[jy][jx])));
}
}
}
}
}
}
__global__ void cl_cuda_examples_vector_add_vec_add_kernel( float* a, float* b, float* c, int n )
{
{
int i = ((blockDim.x * blockIdx.x) + threadIdx.x);
if ((i < n)) {
c[i] = (a[i] + b[i]);
}
}
}
__global__ void cl_cuda_examples_defglobal_add_globals( int* out )
{
out[0] = (cl_cuda_examples_defglobal_foo + cl_cuda_examples_defglobal_bar);
}
__device__ float cl_cuda_examples_sph_norm( float4 x )
{
return sqrtf( ((x.x * x.x) + ((x.y * x.y) + ((x.z * x.z) + (x.w * x.w)))) );
}
__device__ int cl_cuda_examples_sph_offset( int i, int j, int k, int l )
{
return ((cl_cuda_examples_sph_capacity * (cl_cuda_examples_sph_size_x * (cl_cuda_examples_sph_size_y * k))) + ((cl_cuda_examples_sph_capacity * (cl_cuda_examples_sph_size_x * j)) + ((cl_cuda_examples_sph_capacity * i) + l)));
}
__global__ void cl_cuda_examples_sph_update_neighbor_map( int* neighbor_map, float4* pos, int n )
{
{
int p = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((p < n)) {
{
float4 x15680 = pos[p];
{
int i = floorf( ((x15680.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j = floorf( ((x15680.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k = floorf( ((x15680.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
{
int offset = cl_cuda_examples_sph_offset( i, j, k, 0 );
{
int l = atomicAdd( &( neighbor_map[offset] ), 1 );
neighbor_map[cl_cuda_examples_sph_offset( i, j, k, (l + 1) )] = p;
}
}
}
}
}
}
}
__global__ void cl_cuda_examples_sph_clear_neighbor_map( int* neighbor_map )
{
{
int i = threadIdx.x;
int j = blockIdx.x;
int k = blockIdx.y;
neighbor_map[cl_cuda_examples_sph_offset( i, j, k, 0 )] = 0;
}
}
__device__ int cl_cuda_examples_sph_apply_collision( float4* acc, int i, float x0, float x1, float4 v, float4 normal )
{
{
float distance = ((x1 - x0) * 0.004f);
{
float diff = ((0.002f * 2.0f) - distance);
{
float adj = ((20000.0f * diff) - (512.0f * float4_dot( normal, v )));
if ((0.00001f < diff)) {
acc[i] = float4_add( acc[i], float4_scale_flipped( adj, normal ) );
}
}
}
}
return 0;
}
__device__ int cl_cuda_examples_sph_apply_accel_limit( float4* acc, int i )
{
{
float accel = cl_cuda_examples_sph_norm( acc[i] );
if ((200.0f < accel)) {
acc[i] = float4_scale( acc[i], (200.0f / accel) );
}
}
return 0;
}
__global__ void cl_cuda_examples_sph_boundary_condition( float4* acc, float4* pos, float4* vel, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float4 vi = vel[i];
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.x, xi.x, vi, make_float4( 1.0f, 0.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.x, cl_cuda_examples_sph_box_max.x, vi, make_float4( -1.0f, 0.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.y, xi.y, vi, make_float4( 0.0f, 1.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.y, cl_cuda_examples_sph_box_max.y, vi, make_float4( 0.0f, -1.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.z, xi.z, vi, make_float4( 0.0f, 0.0f, 1.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.z, cl_cuda_examples_sph_box_max.z, vi, make_float4( 0.0f, 0.0f, -1.0f, 0.0f ) );
cl_cuda_examples_sph_apply_accel_limit( acc, i );
}
}
}
}
__device__ float cl_cuda_examples_sph_poly6_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return ((315.0f / (64.0f * (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))))))) * (((0.005f * 0.005f) - (r * r)) * (((0.005f * 0.005f) - (r * r)) * ((0.005f * 0.005f) - (r * r)))));
}
}
__device__ float4 cl_cuda_examples_sph_grad_spiky_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return float4_scale_flipped( (-45.0f / (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))), float4_scale_flipped( ((0.005f - r) * (0.005f - r)), float4_scale_inverted( x, r ) ) );
}
}
__device__ float cl_cuda_examples_sph_rap_visc_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return ((45.0f / (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))) * (0.005f - r));
}
}
__global__ void cl_cuda_examples_sph_update_density( float* rho, float4* pos, int n, int* neighbor_map )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float tmp = 0.0f;
{
float4 x16010 = xi;
{
int i015989 = floorf( ((x16010.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j015990 = floorf( ((x16010.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k015991 = floorf( ((x16010.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
for ( int i15992 = (i015989 - 1); ! (i15992 > (i015989 + 1)); i15992 = (i15992 + 1) )
{
for ( int j15993 = (j015990 - 1); ! (j15993 > (j015990 + 1)); j15993 = (j15993 + 1) )
{
for ( int k15994 = (k015991 - 1); ! (k15994 > (k015991 + 1)); k15994 = (k15994 + 1) )
{
for ( int l15995 = 1; ! (l15995 > neighbor_map[cl_cuda_examples_sph_offset( i15992, j15993, k15994, 0 )]); l15995 = (l15995 + 1) )
{
{
int j = neighbor_map[cl_cuda_examples_sph_offset( i15992, j15993, k15994, l15995 )];
{
float4 xj = pos[j];
{
float4 dr = float4_scale( float4_sub( xi, xj ), 0.004f );
if ((cl_cuda_examples_sph_norm( dr ) <= 0.005f)) {
tmp = (tmp + ((0.00020543f / 8.0f) * cl_cuda_examples_sph_poly6_kernel( dr )));
}
}
}
}
}
}
}
}
}
}
rho[i] = tmp;
}
}
}
}
__global__ void cl_cuda_examples_sph_update_pressure( float* prs, float* rho, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
prs[i] = ((rho[i] - 600.0f) * 3.0f);
}
}
}
__device__ float4 cl_cuda_examples_sph_pressure_term( float* rho, float* prs, int i, int j, float4 dr )
{
return float4_scale_flipped( ((float_negate( (0.00020543f / 8.0f) ) * (prs[i] + prs[j])) / (2.0f * rho[j])), cl_cuda_examples_sph_grad_spiky_kernel( dr ) );
}
__device__ float4 cl_cuda_examples_sph_viscosity_term( float4* vel, float* rho, int i, int j, float4 dr )
{
return float4_scale( float4_scale_inverted( float4_scale_flipped( 0.2f, float4_scale_flipped( (0.00020543f / 8.0f), float4_sub( vel[j], vel[i] ) ) ), rho[j] ), cl_cuda_examples_sph_rap_visc_kernel( dr ) );
}
__global__ void cl_cuda_examples_sph_update_force( float4* force, float4* pos, float4* vel, float* rho, float* prs, int n, int* neighbor_map )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float4 tmp = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
{
float4 x16150 = xi;
{
int i016129 = floorf( ((x16150.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j016130 = floorf( ((x16150.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k016131 = floorf( ((x16150.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
for ( int i16132 = (i016129 - 1); ! (i16132 > (i016129 + 1)); i16132 = (i16132 + 1) )
{
for ( int j16133 = (j016130 - 1); ! (j16133 > (j016130 + 1)); j16133 = (j16133 + 1) )
{
for ( int k16134 = (k016131 - 1); ! (k16134 > (k016131 + 1)); k16134 = (k16134 + 1) )
{
for ( int l16135 = 1; ! (l16135 > neighbor_map[cl_cuda_examples_sph_offset( i16132, j16133, k16134, 0 )]); l16135 = (l16135 + 1) )
{
{
int j = neighbor_map[cl_cuda_examples_sph_offset( i16132, j16133, k16134, l16135 )];
if ((i != j)) {
{
float4 xj = pos[j];
{
float4 dr = float4_scale( float4_sub( xi, xj ), 0.004f );
if ((cl_cuda_examples_sph_norm( dr ) <= 0.005f)) {
tmp = float4_add( tmp, cl_cuda_examples_sph_pressure_term( rho, prs, i, j, dr ) );
tmp = float4_add( tmp, cl_cuda_examples_sph_viscosity_term( vel, rho, i, j, dr ) );
}
}
}
}
}
}
}
}
}
}
}
force[i] = tmp;
}
}
}
}
__global__ void cl_cuda_examples_sph_update_acceleration( float4* acc, float4* force, float* rho, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
acc[i] = float4_add( float4_scale_inverted( force[i], rho[i] ), make_float4( 0.0f, -9.8f, 0.0f, 0.0f ) );
}
}
}
__global__ void cl_cuda_examples_sph_update_velocity( float4* vel, float4* acc, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
vel[i] = float4_add( vel[i], float4_scale( acc[i], 0.0004f ) );
}
}
}
__global__ void cl_cuda_examples_sph_update_position( float4* pos, float4* vel, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
pos[i] = float4_add( pos[i], float4_scale_inverted( float4_scale( vel[i], 0.0004f ), 0.004f ) );
}
}
}
```

hope this helps...
Thanks for reporting. The following lines cause the errors: the `__constant__` declarations above whose initializers call `make_float4` (`cl_cuda_examples_sph_box_min` and friends).
It seems that on MSVC the initialization of constant variables needs to be compiled into literals instead of function calls, though I'm not sure whether it relates to the CUDA version.
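For illustration, a minimal sketch of the difference (reusing one of the generated identifiers from the listing above; this is not the actual fix):

```cuda
// As generated -- MSVC rejects a function call as the initializer of a
// __constant__ variable, since it is not a constant expression:
//   __constant__ static float4 cl_cuda_examples_sph_box_min =
//       make_float4( 0.0f, 0.0f, 0.0f, 0.0f );

// Compiled into a literal instead -- MSVC accepts a brace-enclosed
// aggregate as a constant initializer:
__constant__ static float4 cl_cuda_examples_sph_box_min = { 0.0f, 0.0f, 0.0f, 0.0f };
```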
The following fix should work.
I've re-checked everything. The tests work, but for some reason the examples still fail. My guess is that it's just an MSVC thing; maybe have the code generator work around that? Everything else works beautifully, thanks for this library! It's saved me much headache and time!!!
I'm facing the same problem, and upon closer inspection I found that the generated output is messed up: it tries to compile the functions for the diffuse0, diffuse1, sph, and defglobal examples into one file! Currently the only workaround for me is restarting my SLIME session (using SBCL). I'll try to find where we need to reset the output buffer (if any) before generating new definitions.
@alaa-alawi
Yes, the problem is caused on MSVC. I need to fix how float4 constructor forms are compiled. When they are used in functions, emitting a `make_float4( ... )` call works fine. When they appear in a global initialization, the call has to be compiled into a literal such as `{ 0.0f, 0.0f, 0.0f, 0.0f }` instead. To fix this, the compiler needs to distinguish the two contexts.
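A sketch of what the two contexts could look like in the emitted CUDA (the kernel and variable names here are hypothetical, not actual generator output):

```cuda
// Inside a function body, a constructor call compiles fine on MSVC:
__global__ void demo_kernel( float4* out )
{
    out[0] = make_float4( 1.0f, 2.0f, 3.0f, 4.0f );
}

// At global scope, the same value must be emitted as a brace literal
// so that MSVC sees a constant expression:
__constant__ static float4 demo_origin = { 1.0f, 2.0f, 3.0f, 4.0f };
```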
I'll try to find time this week to fix this problem.
@takagi Wouldn't it be feasible to attach the kernel definition to its associated symbol, i.e. through its property list? (Something similar to what is done here: http://www.gigamonkeys.com/book/practical-parsing-binary-files.html)
@alaa-alawi I think the technique would not be feasible in cl-cuda's case: how can we get the symbols associated with the kernel definitions to be compiled?
@serialhex Sorry for the late response. Would you try the latest version?
Related: #73.
@takagi Attach to a given kernel definition symbol (its generated defun) a list of its callees:

```lisp
(push callee (get kernel-symbol 'cl-cuda-kernel-callees)) ;; during kernel definition
```

Then recursively compile each one, collecting the results into a clean slate. What came to mind was the case where there will be separate cl-cuda helper/utility libraries with their own packages, and how to compile the final kernel properly in that situation. Do you think using the package alone as a kind of namespace would not help in this case? I assume not; however, I'll need to validate this with some test cases.
Okay. Then the questions I have are below. Sorry, I do not get your points.

- Are some special treatments needed in this case? I guess callees can be collected even if they are in packages other than their callers'.
- If a kernel function is called from multiple callers, it would be collected multiple times, once from each caller. Would you give me an example?
Additionally, we should take care of mutual recursion. |
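For context, mutual recursion is why the generated file above emits all prototypes before any definitions, and why a callee-collecting pass would need to track visited functions to avoid looping on a cycle. A minimal sketch (`is_even`/`is_odd` are made-up names, not part of cl-cuda):

```cuda
// Forward declarations make a call cycle between two __device__
// functions legal, mirroring the prototype section generated above.
extern "C" __device__ int is_even( int n );
extern "C" __device__ int is_odd( int n );

__device__ int is_even( int n ) { return (n == 0) ? 1 : is_odd( n - 1 ); }
__device__ int is_odd( int n )  { return (n == 0) ? 0 : is_even( n - 1 ); }
```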
All the examples worked on my machine (GTX 1070, CUDA 8.0 on Ubuntu 16.04). Please change the issue title to something that mentions MSVC, as the current one could give newcomers a bad impression.
It should be fixed via #83, so I'm closing this issue. If the problem is not fixed, please reopen it.
The error I am getting is:

(error output omitted)

when using the vector-add example. I'm pretty sure this is an error on NVIDIA's side. I changed from 8.0.27 to 8.0.44, but unfortunately 8.0.44 didn't work, and when I went back to 8.0.27 it still didn't work. I'm going to try installing an earlier version (7.x) and see if that works.