diff --git a/README.md b/README.md index a82ea0f..87cab1d 100644 --- a/README.md +++ b/README.md @@ -3,211 +3,29 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Levi Cai +* Tested on: Windows 7, i7 @ 3.4GHz 16GB, Nvidia NVS 310 (Moore 100C Lab) -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) - -Instructions (delete me) -======================== - -This is due Sunday, September 13 at midnight. - -**Summary:** In this project, you'll implement GPU stream compaction in CUDA, -from scratch. This algorithm is widely used, and will be important for -accelerating your path tracer project. - -Your stream compaction implementations in this project will simply remove `0`s -from an array of `int`s. In the path tracer, you will remove terminated paths -from an array of rays. - -In addition to being useful for your path tracer, this project is meant to -reorient your algorithmic thinking to the way of the GPU. On GPUs, many -algorithms can benefit from massive parallelism and, in particular, data -parallelism: executing the same code many times simultaneously with different -data. - -You'll implement a few different versions of the *Scan* (*Prefix Sum*) -algorithm. First, you'll implement a CPU version of the algorithm to reinforce -your understanding. Then, you'll write a few GPU implementations: "naive" and -"work-efficient." Finally, you'll use some of these to implement GPU stream -compaction. - -**Algorithm overview & details:** There are two primary references for details -on the implementation of scan and stream compaction. - -* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx) - for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html). - -Your GPU stream compaction implementation will live inside of the -`stream_compaction` subproject. This way, you will be able to easily copy it -over for use in your GPU path tracer. - - -## Part 0: The Usual - -This project (and all other CUDA projects in this course) requires an NVIDIA -graphics card with CUDA capability. Any card with Compute Capability 2.0 -(`sm_20`) or greater will work. Check your GPU on this -[compatibility table](https://developer.nvidia.com/cuda-gpus). -If you do not have a personal machine with these specs, you may use those -computers in the Moore 100B/C which have supported GPUs. - -**HOWEVER**: If you need to use the lab computer for your development, you will -not presently be able to do GPU performance profiling. This will be very -important for debugging performance bottlenecks in your program. - -### Useful existing code - -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. - - -## Part 1: CPU Scan & Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/cpu.cu`, implement: - -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using - the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` - function. Map the input array to an array of 0s and 1s, scan it, and use - scatter to produce the output. You will need a **CPU** scatter implementation - for this (see slides or GPU Gems chapter for an explanation). - -These implementations should only be a few lines long. - - -## Part 2: Naive GPU Scan Algorithm - -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` - -This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet -taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses -shared memory, but is limited to operating on very small arrays! Instead, write -this using global memory only. As a result of this, you will have to do -`ilog2ceil(n)` separate kernel invocations. - -Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA -code in the online version of Chapter 39 are known to have a few small errors -(in superscripting, missing braces, bad indentation, etc.) - -Since the parallel scan algorithm operates on a binary tree structure, it works -best with arrays with power-of-two length. Make sure your implementation works -on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory -- your intermediate array sizes will need to be rounded to the next power of -two. - - -## Part 3: Work-Efficient GPU Scan & Stream Compaction - -### 3.1. Scan - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` - -All of the text in Part 2 applies. - -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* Beware of errors in Example 39-2. -* Test non-power-of-two sized arrays. - -### 3.2. Stream Compaction - -This stream compaction method will remove `0`s from an array of `int`s. - -In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` - -For compaction, you will also need to implement the scatter algorithm presented -in the slides and the GPU Gems chapter. - -In `stream_compaction/common.cu`, implement these for use in `compact`: - -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` - - -## Part 4: Using Thrust's Implementation - -In `stream_compaction/thrust.cu`, implement: - -* `StreamCompaction::Thrust::scan` - -This should be a very short function which wraps a call to the Thrust library -function `thrust::exclusive_scan(first, last, result)`. - -To measure timing, be sure to exclude memory operations by passing -`exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a -`thrust::host_vector` from the given pointer, then casting it. - - -## Part 5: Radix Sort (Extra Credit) (+10) - -Add an additional module to the `stream_compaction` subproject. Implement radix -sort using one of your scan implementations. Add tests to check its correctness. - - -## Write-up - -1. Update all of the TODOs at the top of this README. -2. Add a description of this project including a list of its features. -3. Add your performance analysis (see below). +### Questions -All extra credit features must be documented in your README, explaining its -value (with performance comparison, if applicable!) and showing an example how -it works. For radix sort, show how it is called and an example of its output. +![](images/comparison.PNG) -Always profile with Release mode builds and run without debugging. +Above you can see the comparison between the run-times of all the different algorithms. Any initial and final cudamemcpy/malloc's are NOT included, however, any intermediate ones ARE, which is I believe a fairly large contributor to the fact that the algorithms are in general slower than the CPU implementation. I also believe that the array sizes are relatively small, so perhaps the overhead of launching CUDA programs out-weighs the parallelization (though I'm not certain about this to be honest). -### Questions +In terms of the GPU implementations, the work-efficient algorithm is in general an order of magnitude faster than the naive version. This is most likely because of the necessary overhead of the sequential nature and increased number of operations of the algorithm in addition to the extra cudaMemcpy call in each iteration at each depth in my implementation. -* Roughly optimize the block sizes of each of your implementations for minimal - run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) +One thing to note is the Thrust (1) and Thrust (2). I noticed that the first thrust run in a single session took much longer than the second run (1st for a power-of-two and 2nd for a non-power-of-two), I am wondering if it is perhaps caching somewhere. It is clear that it is doing some kind of memory allocation from the timeline (session only contained a run from thrust): -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and - Thrust) to the serial CPU version of Scan. Plot a graph of the comparison - (with array size on the independent axis). - * You should use CUDA events for timing. Be sure **not** to include any - explicit memory operations in your performance measurements, for - comparability. - * To guess at what might be happening inside the Thrust implementation, take - a look at the Nsight timeline for its execution. +![](images/timeline.PNG) -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? +Overall sample output: -* Paste the output of the test program into a triple-backtick block in your - README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. +![](images/2_15_results.PNG) -These questions should help guide you in performance analysis on future -assignments, as well. +## Radix Sort -## Submit +Sample radix output with tests: -If you have modified any of the `CMakeLists.txt` files at all (aside from the -list of `SOURCE_FILES`), you must test that your project can build in Moore -100B/C. Beware of any build issues discussed on the Google Group. +![](images/radix.PNG) -1. Open a GitHub pull request so that we can see that you have finished. - The title should be "Submission: YOUR NAME". -2. Send an email to the TA (gmail: kainino1+cis565@) with: - * **Subject**: in the form of `[CIS565] Project 2: PENNKEY` - * Direct link to your pull request on GitHub - * In the form of a grade (0-100+) with comments, evaluate your own - performance on the project. - * Feedback on the project itself, if any. +In terms of performance of the radix sort, it seems to be consistently ~0.05s regardless of the size of the array. I'm not sure why this is the case, as I do have several memcpy's in the implementation which I would expect to slow it down dramatically as the size of the array increases. Though perhaps this is an issue with cudaTiming on CPU malloc's vs. cudaMallocs, as there are few cudaMalloc's in my implementation. diff --git a/images/2_15_results.PNG b/images/2_15_results.PNG new file mode 100644 index 0000000..30a4fde Binary files /dev/null and b/images/2_15_results.PNG differ diff --git a/images/comparison.PNG b/images/comparison.PNG new file mode 100644 index 0000000..5cf7ebf Binary files /dev/null and b/images/comparison.PNG differ diff --git a/images/radix.PNG b/images/radix.PNG new file mode 100644 index 0000000..35dc591 Binary files /dev/null and b/images/radix.PNG differ diff --git a/images/timeline.PNG b/images/timeline.PNG new file mode 100644 index 0000000..426af51 Binary files /dev/null and b/images/timeline.PNG differ diff --git a/src/main.cpp b/src/main.cpp index 1f0482a..3b7b8c5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,15 +13,16 @@ #include #include #include +#include #include "testing_helpers.hpp" int main(int argc, char* argv[]) { - const int SIZE = 1 << 8; + const int SIZE = 1 << 11; + //const int SIZE = 1 << 3; const int NPOT = SIZE - 3; int a[SIZE], b[SIZE], c[SIZE]; // Scan tests - printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); @@ -29,7 +30,19 @@ int main(int argc, char* argv[]) { genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; - printArray(SIZE, a, true); + //printArray(SIZE, a, true); + + printDesc("Radix sort"); + StreamCompaction::Radix::sort(SIZE, b, a); + printArray(SIZE, a, true); + printArray(SIZE, b, true); + + printDesc("Radix sort mini test"); + int x[8] = {4, 3, 2, 7, 6, 22, 38, 0}; + int y[8]; + StreamCompaction::Radix::sort(8, y, x); + printArray(8, x, true); + printArray(8, y, true); zeroArray(SIZE, b); printDesc("cpu scan, power-of-two"); @@ -45,46 +58,45 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("naive scan, non-power-of-two"); StreamCompaction::Naive::scan(NPOT, c, a); - //printArray(SIZE, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, power-of-two"); StreamCompaction::Efficient::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("work-efficient scan, non-power-of-two"); StreamCompaction::Efficient::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); zeroArray(SIZE, c); printDesc("thrust scan, power-of-two"); StreamCompaction::Thrust::scan(SIZE, c, a); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); zeroArray(SIZE, c); printDesc("thrust scan, non-power-of-two"); StreamCompaction::Thrust::scan(NPOT, c, a); - //printArray(NPOT, c, true); + printArray(NPOT, c, true); printCmpResult(NPOT, b, c); - + printf("\n"); printf("*****************************\n"); printf("** STREAM COMPACTION TESTS **\n"); printf("*****************************\n"); // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case a[SIZE - 1] = 0; printArray(SIZE, a, true); @@ -114,7 +126,7 @@ int main(int argc, char* argv[]) { zeroArray(SIZE, c); printDesc("work-efficient compact, power-of-two"); count = StreamCompaction::Efficient::compact(SIZE, c, a); - //printArray(count, c, true); + printArray(count, c, true); printCmpLenResult(count, expectedCount, b, c); zeroArray(SIZE, c); @@ -122,4 +134,6 @@ int main(int argc, char* argv[]) { count = StreamCompaction::Efficient::compact(NPOT, c, a); //printArray(count, c, true); printCmpLenResult(count, expectedNPOT, b, c); + + while (1){}; // Just so I can see the output } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index cdbef77..bcc484e 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -9,6 +9,8 @@ set(SOURCE_FILES "efficient.cu" "thrust.h" "thrust.cu" + "radix.h" + "radix.cu" ) cuda_add_library(stream_compaction diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index fe872d4..e5d4dfc 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -23,7 +23,11 @@ namespace Common { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + bools[k] = !!idata[k]; + } } /** @@ -32,7 +36,13 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + if (bools[k] == 1){ + odata[indices[k]] = idata[k]; + } + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.h index 4f52663..177b7c5 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -7,6 +7,8 @@ #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define MAXTHREADS 1024 + /** * Check for CUDA errors; print and exit if there was a problem. */ diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index e600c29..cedfda7 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,5 +1,7 @@ #include #include "cpu.h" +#include +#include namespace StreamCompaction { namespace CPU { @@ -8,8 +10,23 @@ namespace CPU { * CPU scan (prefix sum). */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + + cudaEvent_t start,stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + odata[0] = 0; + for (int i=1; i>= 1) { + ++lg; + } + return lg; +} + +inline int ilog2ceil(int x) { + return ilog2(x - 1) + 1; +} namespace StreamCompaction { namespace CPU { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index b2f739b..17f5c94 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -8,12 +8,77 @@ namespace Efficient { // TODO: __global__ +__global__ void upsweep(int n, int powd, int powd1, int* idata){ + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + if (k % (powd1) == 0){ + idata[k + powd1 - 1] += idata[k + powd - 1]; + } + } +} + +__global__ void downsweep(int n, int powd, int powd1, int* idata){ + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + if (k % (powd1) == 0){ + int t = idata[k + powd - 1]; + idata[k + powd - 1] = idata[k + powd1 - 1]; + idata[k + powd1 - 1] += t; + } + } +} + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // Compute log-rounded n + int td = ilog2ceil(n); + int n2 = (int)pow(2,td); + + int numBlocks = (n2 - 1) / MAXTHREADS + 1; + + int n_size = n * sizeof(int); + int n2_size = n2 * sizeof(int); + + int* hst_idata2 = new int[n2](); + memcpy(hst_idata2, idata, n_size); + + int* dev_idata; + cudaMalloc((void**)&dev_idata, n2_size); + cudaMemcpy(dev_idata, hst_idata2, n2_size, cudaMemcpyHostToDevice); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + // Scan + int powd, powd1; + for(int d=0; d>>(n2, powd, powd1, dev_idata); + } + + cudaMemset((void*)&dev_idata[n2-1],0,sizeof(int)); + for(int d=td-1; d>=0; d--){ + powd = (int)pow(2,d); + powd1 = (int)pow(2,d+1); + downsweep<<>>(n2, powd, powd1, dev_idata); + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + //printf("efficient scan time (s): %f\n",ms/1000.0); + + // Remove leftover (from the log-rounded portion) + // No need to shift in this one I guess? + cudaMemcpy(odata, dev_idata, n_size, cudaMemcpyDeviceToHost); } /** @@ -26,8 +91,56 @@ void scan(int n, int *odata, const int *idata) { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - // TODO - return -1; + int n_size = n*sizeof(int); + int numBlocks = (n - 1) / MAXTHREADS + 1; + int on = -1; + + // Initialize memory + int* hst_nz = (int*)malloc(n_size); + + int* dev_nz; + int* dev_idata; + int* dev_scan; + int* dev_odata; + + cudaMalloc((void**)&dev_nz, n_size); + cudaMalloc((void**)&dev_idata, n_size); + cudaMalloc((void**)&dev_scan, n_size); + cudaMalloc((void**)&dev_odata, n_size); + + // Nonzero + cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + StreamCompaction::Common::kernMapToBoolean<<>>(n, dev_nz, dev_idata); + cudaDeviceSynchronize(); + + // TODO: technically only need the last element here + cudaMemcpy(hst_nz, dev_nz, n_size, cudaMemcpyDeviceToHost); + + // Scan + int* hst_scan = (int*)malloc(n_size); + scan(n, hst_scan, hst_nz); + on = hst_scan[n-1] + hst_nz[n-1]; + + // Scatter + cudaMemcpy(dev_scan, hst_scan, n_size, cudaMemcpyHostToDevice); + StreamCompaction::Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_nz, dev_scan); + cudaDeviceSynchronize(); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + printf("efficient compact time (s): %f\n",ms/1000.0); + + cudaMemcpy(odata, dev_odata, n_size, cudaMemcpyDeviceToHost); + + return on; } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 3d86b60..eaab4fe 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -7,13 +7,61 @@ namespace StreamCompaction { namespace Naive { // TODO: __global__ +__global__ void kernScan(int n, int powd, int* odata, int* idata){ + int i = threadIdx.x + (blockIdx.x * blockDim.x); + //int powd = (int)pow(2,d); + + if (i < n){ + if (i >= powd){ + odata[i] = idata[i - powd] + idata[i]; + } else { + odata[i] = idata[i]; + } + } +} /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - // TODO - printf("TODO\n"); + // Initialize + int td = ilog2ceil(n); + int n2 = (int)pow(2,td); + int numBlocks = (n2-1) / MAXTHREADS + 1; + + int n_size = n * sizeof(int); + int n2_size = n2 * sizeof(int); + + int* dev_idata; + int* dev_odata; + + cudaMalloc((void**)&dev_idata, n2_size); + cudaMalloc((void**)&dev_odata, n2_size); + cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice); + cudaMemset(dev_idata+n, 0, n2_size-n_size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + // Scan + cudaEventRecord(start); + for(int d=1; d<=td; d++){ + int powd = 1 << (d-1); + kernScan<<>>(n2, powd, dev_odata, dev_idata); + cudaThreadSynchronize(); + cudaMemcpy(dev_idata, dev_odata, n2_size, cudaMemcpyDeviceToDevice); + } + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + printf("naive time(s): %f\n", ms/1000.0); + + // Remove leftover (from the log-rounded portion) + // Do a shift right to make it an exclusive sum + odata[0] = 0; + cudaMemcpy(odata+1, dev_odata, n_size-sizeof(int), cudaMemcpyDeviceToHost); } } diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 0000000..b1c8721 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,147 @@ +#include "radix.h" +#include "common.h" +#include "efficient.h" +#include +#include + +namespace StreamCompaction { +namespace Radix { + +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernMapToBits(int n, int bit, int *bools, const int *idata) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + int b = 1 << bit; + bools[k] = (b & idata[k]) >> bit; + } +} + +__global__ void kernNegate(int n, int* odata, const int* idata){ + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + odata[k] = !idata[k]; + } +} + +__global__ void kernSumFalses(int n, int totalFalses, int* odata, const int* idata){ + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + odata[k] = k - idata[k] + totalFalses; + } +} + +__global__ void kernMux(int n, int* odata, const int* bdata, const int* tdata, const int* fdata){ + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + odata[k] = bdata[k] ? tdata[k] : fdata[k]; + } +} + +__global__ void kernScatter(int n, int *odata, + const int *idata, const int *indices) { + int k = threadIdx.x + (blockIdx.x * blockDim.x); + + if (k < n){ + odata[indices[k]] = idata[k]; + } +} + +void split(int n, int bit, int* odata, int* idata){ + int numBlocks = (n - 1) / MAXTHREADS + 1; + int n_size = n * sizeof(int); + + int* hst_e; + int* hst_f; + int* dev_idata; + int* dev_odata; + int* dev_b; + int* dev_e; + int* dev_f; + int* dev_t; + int* dev_d; + int totalFalses; + + hst_e = (int*)malloc(n_size); + hst_f = (int*)malloc(n_size); + cudaMalloc((void**)&dev_idata, n_size); + cudaMalloc((void**)&dev_odata, n_size); + cudaMalloc((void**)&dev_b, n_size); + cudaMalloc((void**)&dev_e, n_size); + cudaMalloc((void**)&dev_f, n_size); + cudaMalloc((void**)&dev_t, n_size); + cudaMalloc((void**)&dev_d, n_size); + + cudaMemcpy(dev_idata, idata, n_size, cudaMemcpyHostToDevice); + + // b - get bits + kernMapToBits<<>>(n, bit, dev_b, dev_idata); + cudaDeviceSynchronize(); + + int* hst_b = (int*)malloc(n_size); + + // e - negate bits + kernNegate<<>>(n, dev_e, dev_b); + cudaDeviceSynchronize(); + + // f - exclusive scan on e + cudaMemcpy(hst_e, dev_e, n_size, cudaMemcpyDeviceToHost); + StreamCompaction::Efficient::scan(n, hst_f, hst_e); + cudaMemcpy(dev_f, hst_f, n_size, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); + + // t - t[i] = i - f[i] + totalFalses + int en1; + cudaMemcpy(&en1, dev_e + (n - 1), sizeof(int), cudaMemcpyDeviceToHost); + totalFalses = en1 + hst_f[n - 1]; + kernSumFalses<<>>(n, totalFalses, dev_t, dev_f); + cudaDeviceSynchronize(); + + // d - d[i] = b[i] ? t[i] : f[i] + kernMux<<>>(n, dev_d, dev_b, dev_t, dev_f); + cudaDeviceSynchronize(); + + // scatter idata according to d + kernScatter<<>>(n, dev_odata, dev_idata, dev_d); + cudaDeviceSynchronize(); + + cudaMemcpy(odata, dev_odata, n_size, cudaMemcpyDeviceToHost); +} + +void sort(int n, int* odata, const int* idata){ + int numBits = 8 * sizeof(int); //TODO: probably only need log size of largest int in the array + int n_size = n*sizeof(int); + + int* hst_idata = (int*)malloc(n_size); + int* hst_odata = (int*)malloc(n_size); + int* tmp; + memcpy(hst_idata, idata, n_size); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + for (int i = 0; i < numBits; i++){ + split(n, i, hst_odata, hst_idata); + cudaDeviceSynchronize(); + memcpy(hst_idata, hst_odata, n_size); + } + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + printf("radix sort time (s): %f\n", ms / 1000.0); + + memcpy(odata, hst_odata, n_size); +} + +} +} diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 0000000..bea7a3f --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include +#include + +#define MAXTHREADS 1024 + +namespace StreamCompaction { +namespace Radix { + void split(int n, int bit, int* odata, int* idata); + void sort(int n, int* odata, const int* idata); +} +} diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index d8dbb32..a15b6ad 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -16,6 +16,29 @@ void scan(int n, int *odata, const int *idata) { // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Initialize + thrust::host_vector hst_v_in(idata,idata+n); + thrust::device_vector v_in = hst_v_in; + thrust::device_vector v_out(n); + + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + + cudaEventRecord(start); + + // Scan + thrust::exclusive_scan(v_in.begin(), v_in.end(), v_in.begin()); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float ms = 0; + cudaEventElapsedTime(&ms, start, stop); + printf("thrust time (s): %f\n",ms/1000.0); + + thrust::host_vector hst_v_out = v_in; + memcpy(odata,hst_v_out.data(),n*sizeof(int)); } }