CUDA 8 Doesn't Work - MSVC #81
Apparently something changed, and even v7.5 doesn't work when I try to launch the examples.
The code that is generated by the compiler is:

```cuda
#include "int.h"
#include "float.h"
#include "float3.h"
#include "float4.h"
#include "double.h"
#include "double3.h"
#include "double4.h"
#include "curand.h"
/**
* Kernel globals
*/
__device__ static int cl_cuda_examples_defglobal_foo = 0;
__constant__ static int cl_cuda_examples_defglobal_bar = 0;
__constant__ static float4 cl_cuda_examples_sph_box_min = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float4 cl_cuda_examples_sph_box_max = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float4 cl_cuda_examples_sph_origin = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
__constant__ static float cl_cuda_examples_sph_delta = 0.0f;
__constant__ static int cl_cuda_examples_sph_capacity = 0;
__constant__ static int cl_cuda_examples_sph_size_x = 0;
__constant__ static int cl_cuda_examples_sph_size_y = 0;
__constant__ static int cl_cuda_examples_sph_size_z = 0;
/**
* Kernel function prototypes
*/
extern "C" __global__ void cl_cuda_examples_diffuse0_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 );
extern "C" __global__ void cl_cuda_examples_diffuse1_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 );
extern "C" __global__ void cl_cuda_examples_vector_add_vec_add_kernel( float* a, float* b, float* c, int n );
extern "C" __global__ void cl_cuda_examples_defglobal_add_globals( int* out );
extern "C" __device__ float cl_cuda_examples_sph_norm( float4 x );
extern "C" __device__ int cl_cuda_examples_sph_offset( int i, int j, int k, int l );
extern "C" __global__ void cl_cuda_examples_sph_update_neighbor_map( int* neighbor_map, float4* pos, int n );
extern "C" __global__ void cl_cuda_examples_sph_clear_neighbor_map( int* neighbor_map );
extern "C" __device__ int cl_cuda_examples_sph_apply_collision( float4* acc, int i, float x0, float x1, float4 v, float4 normal );
extern "C" __device__ int cl_cuda_examples_sph_apply_accel_limit( float4* acc, int i );
extern "C" __global__ void cl_cuda_examples_sph_boundary_condition( float4* acc, float4* pos, float4* vel, int n );
extern "C" __device__ float cl_cuda_examples_sph_poly6_kernel( float4 x );
extern "C" __device__ float4 cl_cuda_examples_sph_grad_spiky_kernel( float4 x );
extern "C" __device__ float cl_cuda_examples_sph_rap_visc_kernel( float4 x );
extern "C" __global__ void cl_cuda_examples_sph_update_density( float* rho, float4* pos, int n, int* neighbor_map );
extern "C" __global__ void cl_cuda_examples_sph_update_pressure( float* prs, float* rho, int n );
extern "C" __device__ float4 cl_cuda_examples_sph_pressure_term( float* rho, float* prs, int i, int j, float4 dr );
extern "C" __device__ float4 cl_cuda_examples_sph_viscosity_term( float4* vel, float* rho, int i, int j, float4 dr );
extern "C" __global__ void cl_cuda_examples_sph_update_force( float4* force, float4* pos, float4* vel, float* rho, float* prs, int n, int* neighbor_map );
extern "C" __global__ void cl_cuda_examples_sph_update_acceleration( float4* acc, float4* force, float* rho, int n );
extern "C" __global__ void cl_cuda_examples_sph_update_velocity( float4* vel, float4* acc, int n );
extern "C" __global__ void cl_cuda_examples_sph_update_position( float4* pos, float4* vel, int n );
/**
* Kernel function definitions
*/
__global__ void cl_cuda_examples_diffuse0_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 )
{
{
int jy = ((blockDim.y * blockIdx.y) + threadIdx.y);
{
int jx = ((blockDim.x * blockIdx.x) + threadIdx.x);
{
int j = ((nx * jy) + jx);
{
float fcc = f[j];
float fcw = 0.0f;
float fce = 0.0f;
float fcs = 0.0f;
float fcn = 0.0f;
if ((jx == 0)) {
fcw = fcc;
} else {
fcw = f[(j - 1)];
}
if ((jx == (nx - 1))) {
fce = fcc;
} else {
fce = f[(j + 1)];
}
if ((jy == 0)) {
fcs = fcc;
} else {
fcs = f[(j - nx)];
}
if ((jy == (ny - 1))) {
fcn = fcc;
} else {
fcn = f[(j + nx)];
}
fn[j] = ((c0 * (fce + fcw)) + ((c1 * (fcn + fcs)) + (c2 * fcc)));
}
}
}
}
}
__global__ void cl_cuda_examples_diffuse1_cuda_diffusion2d( float* f, float* fn, int nx, int ny, float c0, float c1, float c2 )
{
{
int jx = (threadIdx.x + 1);
{
int jy = (threadIdx.y + 1);
{
int j = ((nx * ((blockDim.y * blockIdx.y) + threadIdx.y)) + ((blockDim.x * blockIdx.x) + threadIdx.x));
{
float fcc = f[j];
{
__shared__ float fs[(16 + 2)][(16 + 2)];
fs[jy][jx] = fcc;
if ((threadIdx.x == 0)) {
if ((blockIdx.x == 0)) {
fs[jy][0] = fcc;
} else {
fs[jy][0] = f[(j - 1)];
}
}
if ((threadIdx.x == (blockDim.x - 1))) {
if ((blockIdx.x == (gridDim.x - 1))) {
fs[jy][(blockDim.x + 1)] = fcc;
} else {
fs[jy][(blockDim.x + 1)] = f[(j + 1)];
}
}
if ((threadIdx.y == 0)) {
if ((blockIdx.y == 0)) {
fs[0][jx] = fcc;
} else {
fs[0][jx] = f[(j - nx)];
}
}
if ((threadIdx.y == (blockDim.y - 1))) {
if ((blockIdx.y == (gridDim.y - 1))) {
fs[(blockDim.y + 1)][jx] = fcc;
} else {
fs[(blockDim.y + 1)][jx] = f[(j + nx)];
}
}
__syncthreads();
fn[j] = ((c0 * (fs[jy][(jx + 1)] + fs[jy][(jx - 1)])) + ((c1 * (fs[(jy + 1)][jx] + fs[(jy - 1)][jx])) + (c2 * fs[jy][jx])));
}
}
}
}
}
}
__global__ void cl_cuda_examples_vector_add_vec_add_kernel( float* a, float* b, float* c, int n )
{
{
int i = ((blockDim.x * blockIdx.x) + threadIdx.x);
if ((i < n)) {
c[i] = (a[i] + b[i]);
}
}
}
__global__ void cl_cuda_examples_defglobal_add_globals( int* out )
{
out[0] = (cl_cuda_examples_defglobal_foo + cl_cuda_examples_defglobal_bar);
}
__device__ float cl_cuda_examples_sph_norm( float4 x )
{
return sqrtf( ((x.x * x.x) + ((x.y * x.y) + ((x.z * x.z) + (x.w * x.w)))) );
}
__device__ int cl_cuda_examples_sph_offset( int i, int j, int k, int l )
{
return ((cl_cuda_examples_sph_capacity * (cl_cuda_examples_sph_size_x * (cl_cuda_examples_sph_size_y * k))) + ((cl_cuda_examples_sph_capacity * (cl_cuda_examples_sph_size_x * j)) + ((cl_cuda_examples_sph_capacity * i) + l)));
}
__global__ void cl_cuda_examples_sph_update_neighbor_map( int* neighbor_map, float4* pos, int n )
{
{
int p = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((p < n)) {
{
float4 x15680 = pos[p];
{
int i = floorf( ((x15680.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j = floorf( ((x15680.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k = floorf( ((x15680.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
{
int offset = cl_cuda_examples_sph_offset( i, j, k, 0 );
{
int l = atomicAdd( &( neighbor_map[offset] ), 1 );
neighbor_map[cl_cuda_examples_sph_offset( i, j, k, (l + 1) )] = p;
}
}
}
}
}
}
}
__global__ void cl_cuda_examples_sph_clear_neighbor_map( int* neighbor_map )
{
{
int i = threadIdx.x;
int j = blockIdx.x;
int k = blockIdx.y;
neighbor_map[cl_cuda_examples_sph_offset( i, j, k, 0 )] = 0;
}
}
__device__ int cl_cuda_examples_sph_apply_collision( float4* acc, int i, float x0, float x1, float4 v, float4 normal )
{
{
float distance = ((x1 - x0) * 0.004f);
{
float diff = ((0.002f * 2.0f) - distance);
{
float adj = ((20000.0f * diff) - (512.0f * float4_dot( normal, v )));
if ((0.00001f < diff)) {
acc[i] = float4_add( acc[i], float4_scale_flipped( adj, normal ) );
}
}
}
}
return 0;
}
__device__ int cl_cuda_examples_sph_apply_accel_limit( float4* acc, int i )
{
{
float accel = cl_cuda_examples_sph_norm( acc[i] );
if ((200.0f < accel)) {
acc[i] = float4_scale( acc[i], (200.0f / accel) );
}
}
return 0;
}
__global__ void cl_cuda_examples_sph_boundary_condition( float4* acc, float4* pos, float4* vel, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float4 vi = vel[i];
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.x, xi.x, vi, make_float4( 1.0f, 0.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.x, cl_cuda_examples_sph_box_max.x, vi, make_float4( -1.0f, 0.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.y, xi.y, vi, make_float4( 0.0f, 1.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.y, cl_cuda_examples_sph_box_max.y, vi, make_float4( 0.0f, -1.0f, 0.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, cl_cuda_examples_sph_box_min.z, xi.z, vi, make_float4( 0.0f, 0.0f, 1.0f, 0.0f ) );
cl_cuda_examples_sph_apply_collision( acc, i, xi.z, cl_cuda_examples_sph_box_max.z, vi, make_float4( 0.0f, 0.0f, -1.0f, 0.0f ) );
cl_cuda_examples_sph_apply_accel_limit( acc, i );
}
}
}
}
__device__ float cl_cuda_examples_sph_poly6_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return ((315.0f / (64.0f * (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))))))) * (((0.005f * 0.005f) - (r * r)) * (((0.005f * 0.005f) - (r * r)) * ((0.005f * 0.005f) - (r * r)))));
}
}
__device__ float4 cl_cuda_examples_sph_grad_spiky_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return float4_scale_flipped( (-45.0f / (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))), float4_scale_flipped( ((0.005f - r) * (0.005f - r)), float4_scale_inverted( x, r ) ) );
}
}
__device__ float cl_cuda_examples_sph_rap_visc_kernel( float4 x )
{
{
float r = cl_cuda_examples_sph_norm( x );
return ((45.0f / (3.1415927f * (0.005f * (0.005f * (0.005f * (0.005f * (0.005f * 0.005f))))))) * (0.005f - r));
}
}
__global__ void cl_cuda_examples_sph_update_density( float* rho, float4* pos, int n, int* neighbor_map )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float tmp = 0.0f;
{
float4 x16010 = xi;
{
int i015989 = floorf( ((x16010.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j015990 = floorf( ((x16010.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k015991 = floorf( ((x16010.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
for ( int i15992 = (i015989 - 1); ! (i15992 > (i015989 + 1)); i15992 = (i15992 + 1) )
{
for ( int j15993 = (j015990 - 1); ! (j15993 > (j015990 + 1)); j15993 = (j15993 + 1) )
{
for ( int k15994 = (k015991 - 1); ! (k15994 > (k015991 + 1)); k15994 = (k15994 + 1) )
{
for ( int l15995 = 1; ! (l15995 > neighbor_map[cl_cuda_examples_sph_offset( i15992, j15993, k15994, 0 )]); l15995 = (l15995 + 1) )
{
{
int j = neighbor_map[cl_cuda_examples_sph_offset( i15992, j15993, k15994, l15995 )];
{
float4 xj = pos[j];
{
float4 dr = float4_scale( float4_sub( xi, xj ), 0.004f );
if ((cl_cuda_examples_sph_norm( dr ) <= 0.005f)) {
tmp = (tmp + ((0.00020543f / 8.0f) * cl_cuda_examples_sph_poly6_kernel( dr )));
}
}
}
}
}
}
}
}
}
}
rho[i] = tmp;
}
}
}
}
__global__ void cl_cuda_examples_sph_update_pressure( float* prs, float* rho, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
prs[i] = ((rho[i] - 600.0f) * 3.0f);
}
}
}
__device__ float4 cl_cuda_examples_sph_pressure_term( float* rho, float* prs, int i, int j, float4 dr )
{
return float4_scale_flipped( ((float_negate( (0.00020543f / 8.0f) ) * (prs[i] + prs[j])) / (2.0f * rho[j])), cl_cuda_examples_sph_grad_spiky_kernel( dr ) );
}
__device__ float4 cl_cuda_examples_sph_viscosity_term( float4* vel, float* rho, int i, int j, float4 dr )
{
return float4_scale( float4_scale_inverted( float4_scale_flipped( 0.2f, float4_scale_flipped( (0.00020543f / 8.0f), float4_sub( vel[j], vel[i] ) ) ), rho[j] ), cl_cuda_examples_sph_rap_visc_kernel( dr ) );
}
__global__ void cl_cuda_examples_sph_update_force( float4* force, float4* pos, float4* vel, float* rho, float* prs, int n, int* neighbor_map )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
{
float4 xi = pos[i];
float4 tmp = make_float4( 0.0f, 0.0f, 0.0f, 0.0f );
{
float4 x16150 = xi;
{
int i016129 = floorf( ((x16150.x - cl_cuda_examples_sph_origin.x) / cl_cuda_examples_sph_delta) );
int j016130 = floorf( ((x16150.y - cl_cuda_examples_sph_origin.y) / cl_cuda_examples_sph_delta) );
int k016131 = floorf( ((x16150.z - cl_cuda_examples_sph_origin.z) / cl_cuda_examples_sph_delta) );
for ( int i16132 = (i016129 - 1); ! (i16132 > (i016129 + 1)); i16132 = (i16132 + 1) )
{
for ( int j16133 = (j016130 - 1); ! (j16133 > (j016130 + 1)); j16133 = (j16133 + 1) )
{
for ( int k16134 = (k016131 - 1); ! (k16134 > (k016131 + 1)); k16134 = (k16134 + 1) )
{
for ( int l16135 = 1; ! (l16135 > neighbor_map[cl_cuda_examples_sph_offset( i16132, j16133, k16134, 0 )]); l16135 = (l16135 + 1) )
{
{
int j = neighbor_map[cl_cuda_examples_sph_offset( i16132, j16133, k16134, l16135 )];
if ((i != j)) {
{
float4 xj = pos[j];
{
float4 dr = float4_scale( float4_sub( xi, xj ), 0.004f );
if ((cl_cuda_examples_sph_norm( dr ) <= 0.005f)) {
tmp = float4_add( tmp, cl_cuda_examples_sph_pressure_term( rho, prs, i, j, dr ) );
tmp = float4_add( tmp, cl_cuda_examples_sph_viscosity_term( vel, rho, i, j, dr ) );
}
}
}
}
}
}
}
}
}
}
}
force[i] = tmp;
}
}
}
}
__global__ void cl_cuda_examples_sph_update_acceleration( float4* acc, float4* force, float* rho, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
acc[i] = float4_add( float4_scale_inverted( force[i], rho[i] ), make_float4( 0.0f, -9.8f, 0.0f, 0.0f ) );
}
}
}
__global__ void cl_cuda_examples_sph_update_velocity( float4* vel, float4* acc, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
vel[i] = float4_add( vel[i], float4_scale( acc[i], 0.0004f ) );
}
}
}
__global__ void cl_cuda_examples_sph_update_position( float4* pos, float4* vel, int n )
{
{
int i = ((blockIdx.x * blockDim.x) + threadIdx.x);
if ((i < n)) {
pos[i] = float4_add( pos[i], float4_scale_inverted( float4_scale( vel[i], 0.0004f ), 0.004f ) );
}
}
}
```

hope this helps...
Thanks for reporting. The following lines cause the errors: the `__constant__` declarations above whose initializers call `make_float4` (`cl_cuda_examples_sph_box_min` and friends).
It seems that on MSVC the initialization of constant variables needs to be compiled into literals instead of function calls, though I'm not sure whether it relates to the CUDA version.
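For illustration, a minimal sketch of the difference (reusing one of the generated identifiers from the listing above; this is not the actual fix):

```cuda
// As generated -- MSVC rejects a function call as the initializer of a
// __constant__ variable, since it is not a constant expression:
//   __constant__ static float4 cl_cuda_examples_sph_box_min =
//       make_float4( 0.0f, 0.0f, 0.0f, 0.0f );

// Compiled into a literal instead -- MSVC accepts a brace-enclosed
// aggregate as a constant initializer:
__constant__ static float4 cl_cuda_examples_sph_box_min = { 0.0f, 0.0f, 0.0f, 0.0f };
```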
The following fix should work.
I've re-checked everything. The tests work, but for some reason the examples still fail. My guess is that it's just an MSVC thing; maybe have the code generator work around that? Everything else works beautifully, thanks for this library! It's saved me much headache and time!!!
I'm facing the same problem, and upon closer inspection I found that the generated output is messed up: it tries to compile the functions for the diffuse0, diffuse1, sph, and defglobal examples into one file! Currently the only workaround for me is restarting my SLIME session (using SBCL). I'll try to find where we need to reset the output buffer (if any) before generating new definitions.
@alaa-alawi
Yes, the problem is caused on MSVC. I need to fix how float4 constructor forms are compiled. When they are used in functions, emitting a `make_float4( ... )` call works fine. When they appear in a global initialization, the call has to be compiled into a literal such as `{ 0.0f, 0.0f, 0.0f, 0.0f }` instead. To fix this, the compiler needs to distinguish the two contexts.
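A sketch of what the two contexts could look like in the emitted CUDA (the kernel and variable names here are hypothetical, not actual generator output):

```cuda
// Inside a function body, a constructor call compiles fine on MSVC:
__global__ void demo_kernel( float4* out )
{
    out[0] = make_float4( 1.0f, 2.0f, 3.0f, 4.0f );
}

// At global scope, the same value must be emitted as a brace literal
// so that MSVC sees a constant expression:
__constant__ static float4 demo_origin = { 1.0f, 2.0f, 3.0f, 4.0f };
```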
I'll try to find time this week to fix this problem.
@takagi Wouldn't it be feasible to attach the kernel definition to its associated symbol, i.e. through its property list? (Something similar to what is done here: http://www.gigamonkeys.com/book/practical-parsing-binary-files.html)
@alaa-alawi I think the technique would not be feasible in cl-cuda's case: how can we get the symbols associated with the kernel definitions to be compiled?
@serialhex Sorry for the late response. Would you try the latest version?
Related: #73.
@takagi Attach to a given kernel definition symbol (its generated defun) a list of its callees:

```lisp
(push callee (get kernel-symbol 'cl-cuda-kernel-callees)) ;; during kernel definition
```

Then recursively compile each one, collecting the results into a clean slate. What came to mind was the case where there will be separate cl-cuda helper/utility libraries with their own packages, and how to compile the final kernel properly in that situation. Do you think using the package alone as a kind of namespace would not help in this case? I assume not; however, I'll need to validate this with some test cases.
Okay. Then the questions I have are below. Sorry, I do not get your points.

- Are some special treatments needed in this case? I guess callees can be collected even if they are in packages other than their callers'.
- If a kernel function is called from multiple callers, it would be collected multiple times, once from each caller. Would you give me an example?
Additionally, we should take care of mutual recursion. |
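For context, mutual recursion is why the generated file above emits all prototypes before any definitions, and why a callee-collecting pass would need to track visited functions to avoid looping on a cycle. A minimal sketch (`is_even`/`is_odd` are made-up names, not part of cl-cuda):

```cuda
// Forward declarations make a call cycle between two __device__
// functions legal, mirroring the prototype section generated above.
extern "C" __device__ int is_even( int n );
extern "C" __device__ int is_odd( int n );

__device__ int is_even( int n ) { return (n == 0) ? 1 : is_odd( n - 1 ); }
__device__ int is_odd( int n )  { return (n == 0) ? 0 : is_even( n - 1 ); }
```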
All the examples worked on my machine (GTX 1070, CUDA 8.0 on Ubuntu 16.04). Please change the issue title to something that mentions MSVC, as the current one could give newcomers a bad impression.
It should be fixed via #83, so I'm closing this issue. If the problem is not fixed, please reopen it.
The error I am getting is:

(error output omitted)

when using the vector-add example. I'm pretty sure this is an error on NVIDIA's side. I changed from 8.0.27 to 8.0.44, but unfortunately 8.0.44 didn't work, and when I went back to 8.0.27 it still didn't work. I'm going to try installing an earlier version (7.x) and see if that works.