diff --git a/README.md b/README.md
index b71c458..912d688 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,675 @@
 CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* (TODO) Rony Edde (redde)
+* Tested on: Windows 10, i7-6700k @ 4.00GHz 64GB, GTX 980M 8GB (Personal Laptop)
 ### (TODO: Your README)
 Include analysis, etc. (Remember, this is public, so don't
 put anything here that you don't want to share with the world.)
+
+This is an implementation of scan and reduce on the CPU and GPU
+for stream compaction.
+There are two types of arrays to consider, powers of 2 and
+non-powers of 2, both of which are part of the tests.
+
+* Scan
+  * The first method runs on the CPU, where all elements are
+    added using a for loop. This works very well on small arrays.
+
+  * The second method uses the GPU to compute the scan result.
+    This is a naive implementation: at each depth d, out of
+    log2(n) depths in total, every element at index k >= 2^(d-1)
+    adds the element 2^(d-1) positions to its left, and the
+    result is then shifted to make the scan exclusive.
+    This is faster than the CPU version for large arrays.
+
+  * The third method uses a double sweep: an upsweep, followed by
+    a downsweep, in a balanced tree form. Each sweep takes log2(n)
+    iterations, but the GPU calls only do work on indices that are
+    multiples of 2^(d+1). This should be fast because there are
+    only O(n) adds for the upsweep, and O(n) adds and O(n) swaps
+    for the downsweep.
+
+  * Thrust scan uses thrust::exclusive_scan, which is built into
+    the CUDA Thrust library.
+
+* Stream Compaction
+  * The first implementation is on the CPU, where a for loop looks
+    for values greater than 0 and adds them to the new array,
+    incrementing the count whenever a non-zero value is found.
+
+  * The second implementation also runs on the CPU but uses the
+    scan function to look up indices.
+
+  * The third implementation uses the GPU to generate a [0 1]
+    mapped array, which is then run through the GPU scan function
+    and used as an index lookup for placing the elements.
+    After the scatter step, all non-zero elements are placed
+    contiguously, which results in a compact array with no zeros.
+
+* Thrust
+  * The implementation is mentioned in the Scan section.
+
+* Radix sort
+  * There are 2 versions of the Radix sort. The first one runs
+    on the CPU, using a CPU version of scan.
+
+  * The second version uses the GPU to run the scan. It also uses
+    a GPU function to determine the maximum number in the array;
+    this determines how many passes are needed to cover every
+    decimal digit of that maximum number.
+    Another GPU function is used to shift the scan from exclusive
+    to inclusive, which is needed for a correct radix sort.
+    There are multiple scan functions that can be used; a few are
+    benchmarked.
+
+* Benchmarks
+  * Running benchmarks on a range of 256 to 268435456 elements,
+    incrementing the exponent of 2 by 4 each step (i.e. multiplying
+    the size by 16), gave the following results:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== naive scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+==== cpu radix sort ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort cpu scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort naive scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort thrust scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+
+
+==== BENCHMARK on 256 items ====
+naive scan time: 0.054762
+upsweep time: 0.053930
+downsweep time: 0.066717
+thrust time: 0.001216
+
+==== BENCHMARK on 4096 items ====
+naive scan time: 0.055395
+upsweep time: 0.056800
+downsweep time: 0.063933
+thrust time: 0.001216
+
+==== BENCHMARK on 65536 items ====
+naive scan time: 0.055034
+upsweep time: 0.054170
+downsweep time: 0.063862
+thrust time: 0.001190
+
+==== BENCHMARK on 1048576 items ====
+naive scan time: 0.055942
+upsweep time: 0.057514
+downsweep time: 0.066336
+thrust time: 0.001194
+
+==== BENCHMARK on 16777216 items ====
+naive scan time: 0.057802
+upsweep time: 0.056128
+downsweep time: 0.066230
+thrust time: 0.001184
+
+==== BENCHMARK on 268435456 items ====
+naive scan time: 0.056563
+upsweep time: 0.055456
+downsweep time: 0.066339
+thrust time: 0.001190
+
+-------- GPU calls results --------
+naive:     0.054762 0.055395 0.055034 0.055942 0.057802 0.056563
+upsweep:   0.053930 0.056800 0.054170 0.057514 0.056128 0.055456
+downsweep: 0.066717 0.063933 0.063862 0.066336 0.066230 0.066339
+thrust:    0.001216 0.001216 0.001190 0.001194 0.001184 0.001190
+
+****************
+** BENCHMARKS **
+****************
+
+** SIZE = 256 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 0 microSeconds
+==== naive scan, power-of-two ====
+Time: 1204 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 747 microSeconds
+==== thrust scan, power-of-two ====
+Time: 58 microSeconds
+==== cpu radix sort ====
+Time: 89 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 1209 microSeconds
+==== gpu radix sort naive scan ====
+Time: 1356 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 1080 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 0 microSeconds
+==== cpu compact with scan ====
+Time: 2 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1020 microSeconds
+
+** SIZE = 512 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 11 0 ]
+==== cpu scan, power-of-two ====
+Time: 1 microSeconds
+==== naive scan, power-of-two ====
+Time: 563 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 657 microSeconds
+==== thrust scan, power-of-two ====
+Time: 91 microSeconds
+==== cpu radix sort ====
+Time: 142 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2253 microSeconds
+==== gpu radix sort naive scan ====
+Time: 2997 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2042 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 1 microSeconds
+==== cpu compact with scan ====
+Time: 7 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 862 microSeconds
+
+** SIZE = 1024 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 40 0 ]
+==== cpu scan, power-of-two ====
+Time: 2 microSeconds
+==== naive scan, power-of-two ====
+Time: 595 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 674 microSeconds
+==== thrust scan, power-of-two ====
+Time: 178 microSeconds
+==== cpu radix sort ====
+Time: 282 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2152 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3216 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2087 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 3 microSeconds
+==== cpu compact with scan ====
+Time: 9 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 829 microSeconds
+
+** SIZE = 2048 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 32 0 ]
+==== cpu scan, power-of-two ====
+Time: 4 microSeconds
+==== naive scan, power-of-two ====
+Time: 546 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 766 microSeconds
+==== thrust scan, power-of-two ====
+Time: 371 microSeconds
+==== cpu radix sort ====
+Time: 568 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2536 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3176 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2185 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 6 microSeconds
+==== cpu compact with scan ====
+Time: 19 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 853 microSeconds
+
+** SIZE = 4096 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 24 0 ]
+==== cpu scan, power-of-two ====
+Time: 8 microSeconds
+==== naive scan, power-of-two ====
+Time: 615 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 786 microSeconds
+==== thrust scan, power-of-two ====
+Time: 704 microSeconds
+==== cpu radix sort ====
+Time: 1135 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2460 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3368 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2429 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 14 microSeconds
+==== cpu compact with scan ====
+Time: 40 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 983 microSeconds
+
+** SIZE = 8192 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 4 0 ]
+==== cpu scan, power-of-two ====
+Time: 18 microSeconds
+==== naive scan, power-of-two ====
+Time: 629 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 784 microSeconds
+==== thrust scan, power-of-two ====
+Time: 1444 microSeconds
+==== cpu radix sort ====
+Time: 2306 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 3193 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3786 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2776 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 30 microSeconds
+==== cpu compact with scan ====
+Time: 81 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1021 microSeconds
+
+** SIZE = 16384 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 32 microSeconds
+==== naive scan, power-of-two ====
+Time: 653 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1079 microSeconds
+==== thrust scan, power-of-two ====
+Time: 2876 microSeconds
+==== cpu radix sort ====
+Time: 4595 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 4210 microSeconds
+==== gpu radix sort naive scan ====
+Time: 4916 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 3774 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 63 microSeconds
+==== cpu compact with scan ====
+Time: 165 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1257 microSeconds
+
+** SIZE = 32768 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 7 0 ]
+==== cpu scan, power-of-two ====
+Time: 65 microSeconds
+==== naive scan, power-of-two ====
+Time: 808 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1274 microSeconds
+==== thrust scan, power-of-two ====
+Time: 5643 microSeconds
+==== cpu radix sort ====
+Time: 8982 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 6846 microSeconds
+==== gpu radix sort naive scan ====
+Time: 7885 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 6811 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 128 microSeconds
+==== cpu compact with scan ====
+Time: 335 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1705 microSeconds
+
+** SIZE = 65536 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ]
+==== cpu scan, power-of-two ====
+Time: 131 microSeconds
+==== naive scan, power-of-two ====
+Time: 1140 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1760 microSeconds
+==== thrust scan, power-of-two ====
+Time: 11324 microSeconds
+==== cpu radix sort ====
+Time: 18094 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 11267 microSeconds
+==== gpu radix sort naive scan ====
+Time: 11923 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 10084 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 260 microSeconds
+==== cpu compact with scan ====
+Time: 672 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 2814 microSeconds
+
+** SIZE = 131072 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ]
+==== cpu scan, power-of-two ====
+Time: 263 microSeconds
+==== naive scan, power-of-two ====
+Time: 1650 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 3079 microSeconds
+==== thrust scan, power-of-two ====
+Time: 22802 microSeconds
+==== cpu radix sort ====
+Time: 36258 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 19548 microSeconds
+==== gpu radix sort naive scan ====
+Time: 19723 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 18615 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 517 microSeconds
+==== cpu compact with scan ====
+Time: 1571 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1465 microSeconds
+
+** SIZE = 262144 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 569 microSeconds
+==== naive scan, power-of-two ====
+Time: 17 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 25 microSeconds
+==== thrust scan, power-of-two ====
+Time: 45705 microSeconds
+==== cpu radix sort ====
+Time: 73556 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 5243 microSeconds
+==== gpu radix sort naive scan ====
+Time: 5223 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 5214 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 1056 microSeconds
+==== cpu compact with scan ====
+Time: 3225 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 612 microSeconds
+
+** SIZE = 524288 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 1139 microSeconds
+==== naive scan, power-of-two ====
+Time: 18 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 26 microSeconds
+==== thrust scan, power-of-two ====
+Time: 92465 microSeconds
+==== cpu radix sort ====
+Time: 144470 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 10536 microSeconds
+==== gpu radix sort naive scan ====
+Time: 10634 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 10634 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 2098 microSeconds
+==== cpu compact with scan ====
+Time: 6596 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1197 microSeconds
+
+** SIZE = 1048576 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 6 0 ]
+==== cpu scan, power-of-two ====
+Time: 2288 microSeconds
+==== naive scan, power-of-two ====
+Time: 19 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 27 microSeconds
+==== thrust scan, power-of-two ====
+Time: 185075 microSeconds
+==== cpu radix sort ====
+Time: 289369 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 22012 microSeconds
+==== gpu radix sort naive scan ====
+Time: 21773 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 21896 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 4228 microSeconds
+==== cpu compact with scan ====
+Time: 13323 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 2729 microSeconds
+
+** SIZE = 2097152 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 2 0 ]
+==== cpu scan, power-of-two ====
+Time: 4616 microSeconds
+==== naive scan, power-of-two ====
+Time: 20 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 29 microSeconds
+==== thrust scan, power-of-two ====
+Time: 369500 microSeconds
+==== cpu radix sort ====
+Time: 587131 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 44624 microSeconds
+==== gpu radix sort naive scan ====
+Time: 44796 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 44965 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 8423 microSeconds
+==== cpu compact with scan ====
+Time: 27154 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 6133 microSeconds
+
+** SIZE = 4194304 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 9468 microSeconds
+==== naive scan, power-of-two ====
+Time: 20 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 33 microSeconds
+==== thrust scan, power-of-two ====
+Time: 736214 microSeconds
+==== cpu radix sort ====
+Time: 1182810 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 90747 microSeconds
+==== gpu radix sort naive scan ====
+Time: 89688 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 90998 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 17085 microSeconds
+==== cpu compact with scan ====
+Time: 54581 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 12071 microSeconds
+
+** SIZE = 8388608 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ]
+==== cpu scan, power-of-two ====
+Time: 18279 microSeconds
+==== naive scan, power-of-two ====
+Time: 24 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 35 microSeconds
+==== thrust scan, power-of-two ====
+Time: 1473584 microSeconds
+==== cpu radix sort ====
+Time: 2342165 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 181044 microSeconds
+==== gpu radix sort naive scan ====
+Time: 182013 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 181109 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 34329 microSeconds
+==== cpu compact with scan ====
+Time: 109976 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 23858 microSeconds
+
+** SIZE = 16777216 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 37124 microSeconds
+==== naive scan, power-of-two ====
+Time: 23 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 35 microSeconds
+==== thrust scan, power-of-two ====
+Time: 2988794 microSeconds
+==== cpu radix sort ====
+Time: 4724527 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 374356 microSeconds
+==== gpu radix sort naive scan ====
+Time: 367443 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 359604 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 68579 microSeconds
+==== cpu compact with scan ====
+Time: 218190 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 49360 microSeconds
+Press any key to continue . . .
+```
+
+* Large scale results (in microseconds):
+![largescale](./images/bench1.png)
+
+* Small scale results (in microseconds):
+![smallscale](./images/bench2.png)
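+
+For reference, the GPU kernels compared in these graphs boil down
+to the following per-depth steps. This is a condensed, illustrative
+sketch that mirrors the kernels added to naive.cu and efficient.cu
+further down in this diff; the names here are shortened for the
+example:
+
+```
+// Naive scan, launched once per depth d = 1..log2(n): every element
+// past the offset adds the element 2^(d-1) positions to its left.
+__global__ void naiveStep(int n, int d, int* out, const int* in) {
+    int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (k >= n) return;
+    int offset = 1 << (d - 1);
+    if (k >= offset)
+        out[k] = in[k] + in[k - offset];
+}
+
+// Work-efficient upsweep, launched once per depth d: only indices
+// that are multiples of 2^(d+1) do any work, which is why there are
+// only O(n) adds in total.
+__global__ void upsweepStep(int n, int d, int* data) {
+    int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (k >= n) return;
+    if (k % (1 << (d + 1)) == 0)
+        data[k + (1 << (d + 1)) - 1] += data[k + (1 << d) - 1];
+}
+```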
+
+The difference between the CPU and GPU implementations is obvious.
+When looking at small arrays, the CPU takes an interesting
+advantage. On large arrays, however, the CPU implementations scale
+very poorly, hence the need for two graph scales.
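+
+The CPU baseline in these graphs is just the sequential
+exclusive-scan loop below (this mirrors the scan added to cpu.cu
+later in this diff):
+
+```
+// Sequential exclusive scan: O(n) adds and no host/device
+// transfers, which is why the CPU wins on small arrays.
+void scanCPU(int n, int* odata, const int* idata) {
+    odata[0] = 0;
+    for (int i = 1; i < n; i++)
+        odata[i] = odata[i - 1] + idata[i - 1];
+}
+```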
+
+Radix sort suffers particularly when running on the CPU.
+One pattern that emerges is that the naive and efficient methods
+scale logarithmically. The efficient version was a fraction
+slower, which is due to the interruptions between the upsweep and
+the downsweep: every time the upsweep is called, memory has to be
+allocated and copied, and the same happens with the downsweep.
+This is shown in the last figure. Despite that, the loss is
+minimal.
+Thrust scan does well and scales linearly, albeit not as well as
+the naive and efficient methods.
+One interesting thing to note is that radix sort benefitted more
+from the thrust scan than from the other GPU implementations,
+again possibly due to the multiple GPU functions and memory
+copies. The get-max-number function runs on the GPU, and memory
+is copied back and forth.
+The biggest bottleneck in all the tests was memory copying,
+especially between the host and the device. The GPU functions run
+extremely fast in comparison to the memory copying.
+The following benchmarks took a few minutes to run, yet the GPU
+time per function is close to negligible. Time is in milliseconds:
+* GPU functions timing (in milliseconds)
+![GPU functions](./images/bench_GPU_functions.png)
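+
+These per-function timings were collected with CUDA events around
+the kernel launches only, excluding allocations and host/device
+copies. A minimal sketch of the pattern used by the timeKern*
+helpers in this diff (`kern` stands in for any of the benchmarked
+kernels):
+
+```
+cudaEvent_t start, stop;
+cudaEventCreate(&start); cudaEventCreate(&stop);
+cudaEventRecord(start);
+kern<<<blocks, blockSize>>>(n, d, dev_data);  // kernel(s) under test
+cudaEventRecord(stop);
+cudaEventSynchronize(stop);
+float ms = 0;
+cudaEventElapsedTime(&ms, start, stop);       // elapsed milliseconds
+cudaEventDestroy(start); cudaEventDestroy(stop);
+```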
+
+One bottleneck that manifests itself in the graph below comes
+from the kernDecimalsMap function. This function does not use
+enough of the GPU's resources, which creates a bottleneck that is
+minimal but not desired: about 40% of the GPU goes unused.
+This is very obvious when looking at the following timeline and
+the kernDecimalsMap function:
+* Cuda memory calls frequency:
+![timeline](./images/memoryswapping.png)
+
+* DecimalsMap bottleneck:
+![timeline](./images/bottleneck.png)
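+
+For context, kernDecimalsMap only extracts one decimal digit per
+element; each radix pass is then a 10-bucket counting sort driven
+by a scan. A minimal CPU sketch of one pass (a forward-stable
+equivalent of the implementation's back-to-front scatter; digit
+positions are 1-based, as in getDigit):
+
+```
+void radixPassCPU(int n, int sbit, int* odata, const int* idata) {
+    long p = 1;
+    for (int i = 1; i < sbit; i++) p *= 10;  // p = 10^(sbit-1)
+    int count[10] = {0};
+    for (int i = 0; i < n; i++)              // histogram of digits
+        count[(idata[i] / p) % 10]++;
+    int scan[10], sum = 0;                   // exclusive scan
+    for (int d = 0; d < 10; d++) { scan[d] = sum; sum += count[d]; }
+    for (int i = 0; i < n; i++)              // stable scatter
+        odata[scan[(idata[i] / p) % 10]++] = idata[i];
+}
+```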
diff --git a/images/bench1.png b/images/bench1.png
new file mode 100644
index 0000000..f447a49
Binary files /dev/null and b/images/bench1.png differ
diff --git a/images/bench2.png b/images/bench2.png
new file mode 100644
index 0000000..207a966
Binary files /dev/null and b/images/bench2.png differ
diff --git a/images/bench_GPU_functions.png b/images/bench_GPU_functions.png
new file mode 100644
index 0000000..86b19fa
Binary files /dev/null and b/images/bench_GPU_functions.png differ
diff --git a/images/bottleneck.png b/images/bottleneck.png
new file mode 100644
index 0000000..2b47199
Binary files /dev/null and b/images/bottleneck.png differ
diff --git a/images/memoryswapping.png b/images/memoryswapping.png
new file mode 100644
index 0000000..4890ea9
Binary files /dev/null and b/images/memoryswapping.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..a5a32ea 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,7 +11,18 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix.h>
 #include "testing_helpers.hpp"
+#include <cstdlib>
+#include <vector>
+#include <iostream>
+#include <ctime>
+#include <windows.h>
+
+int comparator(const void* a, const void* b)
+{
+    return (*(int*)a - *(int*)b);
+}
 
 int main(int argc, char* argv[]) {
     const int SIZE = 1 << 8;
@@ -43,13 +54,13 @@
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -76,6 +87,40 @@
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
+    printDesc("cpu radix sort");
+    memcpy(b, a, sizeof(int)*SIZE);
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixCPU(SIZE, c, a);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort cpu scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::CPU);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort naive scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::NAIVE);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort thrust scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::THRUST);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
@@ -120,4 +165,329 @@
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+    // bench tests
+    int iternum = 1000;
+    std::vector<float> naive_results;
+    std::vector<float> upsweep_results;
+    std::vector<float> downsweep_results;
+    std::vector<float> thrust_results;
+
+    for (int i = 8; i < 15; i += 4)
+    {
+        int SZ = 1 << i;
+        int* A = new int[SZ];
+        int* C = new int[SZ];
+
+        printf("\n\n==== BENCHMARK on %d items ====", SZ);
+
+        float naivescan = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            naivescan += StreamCompaction::Naive::timeKernScan(SZ, C, A);  // time at the current size
+        }
+        printf("\nnaive scan time: %f", naivescan/iternum);
+        naive_results.push_back(naivescan / iternum);
+
+        float upsweep = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            upsweep += StreamCompaction::Efficient::timeKernUpsweep(SZ, C);
+        }
+        printf("\nupsweep time: %f", upsweep / iternum);
+        upsweep_results.push_back(upsweep / iternum);
+
+        float downsweep = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            downsweep += StreamCompaction::Efficient::timeKernDownsweep(SZ, C);
+        }
+        printf("\ndownsweep time: %f", downsweep / iternum);
+        downsweep_results.push_back(downsweep / iternum);
+
+        float thrusttime = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            thrusttime += StreamCompaction::Thrust::timeThrust(SZ, C, A);
+        }
+        printf("\nthrust time: %f", thrusttime / iternum);
+        thrust_results.push_back(thrusttime / iternum);
+
+        delete[] A;
+        delete[] C;
+    }
+
+    // print in format
+    printf("\n\n-------- GPU calls results --------");
+    printf("\nnaive:     "); for (int i = 0; i < naive_results.size(); i++) printf(" %f", naive_results[i]); printf("\n");
+    printf("upsweep:   "); for (int i = 0; i < upsweep_results.size(); i++) printf(" %f", upsweep_results[i]); printf("\n");
+    printf("downsweep: "); for (int i = 0; i < downsweep_results.size(); i++) printf(" %f", downsweep_results[i]); printf("\n");
+    printf("thrust:    "); for (int i = 0; i < thrust_results.size(); i++) printf(" %f", thrust_results[i]); printf("\n");
+
+    {
+        // benchmarks
+
+        // Scan tests
+
+        printf("\n");
+        printf("****************\n");
+        printf("** BENCHMARKS **\n");
+        printf("****************\n");
+
+        clock_t t;
+        int f;
+        LARGE_INTEGER start, end, freq;
+
+        int iterations = 5;
+        int maxiterations = 25;
+        for (int i = 8; i < maxiterations; i += 1)
+        {
+            int SIZE = (1 << i);
+            int NPOT = SIZE - 3;
+
+            printf("\n** SIZE = %d **\n", SIZE);
+            printf("****************\n");
+
+            int* a = new int[SIZE];
+            int* b = new int[SIZE];
+            int* c = new int[SIZE];
+            zeroArray(SIZE, a);
+            zeroArray(SIZE, b);
+            zeroArray(SIZE, c);
+
+            genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+            a[SIZE - 1] = 0;
+            printArray(SIZE, a, true);
+
+            //t = clock();
+
+            //t = clock() - t;
+            //printf("Elapsed time %f second(s).\n", t, ((float)t) / CLOCKS_PER_SEC);
+
+            zeroArray(SIZE, b);
+            printDesc("cpu scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::CPU::scan(SIZE, b, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("cpu scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::CPU::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("naive scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Naive::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("naive scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Naive::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("work-efficient scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Efficient::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("work-efficient scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Efficient::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("thrust scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Thrust::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("thrust scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Thrust::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            printDesc("cpu radix sort");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixCPU(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort cpu scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::CPU);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort naive scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::NAIVE);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort thrust scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::THRUST);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            // compaction
+
+            genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+            a[SIZE - 1] = 0;
+
+            zeroArray(SIZE, b);
+            printDesc("cpu compact without scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("cpu compact without scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("cpu compact with scan");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            zeroArray(SIZE, c);
+            printDesc("work-efficient compact, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::Efficient::compact(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("work-efficient compact, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::Efficient::compact(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+            */
+
+            delete[] a;
+            delete[] b;
+            delete[] c;
+
+            Sleep(1);
+        }
+    }
+
+    return 0;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..bcc484e 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+    "radix.h"
+    "radix.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..1ad61e8 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,12 @@ namespace Common {
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
     // TODO
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= n)
+        return;
+
+    // 1 where the input is non-zero, 0 elsewhere
+    bools[index] = idata[index] != 0;
 }
 
 /**
@@ -33,6 +39,12 @@
 __global__ void kernScatter(int n, int *odata,
         const int *idata, const int *bools, const int *indices) {
     // TODO
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= n)
+        return;
+
+    // write kept elements to their scanned destination index
+    if (bools[index] == 1)
+        odata[indices[index]] = idata[index];
 }
 
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..f37c227 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -9,7 +9,12 @@ namespace CPU {
  */
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    //printf("TODO\n");
+    odata[0] = 0;
+    for (int i = 1; i < n; i++)
+    {
+        odata[i] = odata[i-1] + idata[i-1];
+    }
 }
 
 /**
@@ -19,7 +24,18 @@
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    int count = 0;
+    odata[0] = 0;
+
+    for (int i = 0; i < n; i++)
+    {
+        if (idata[i] != 0)
+        {
+            odata[count] = idata[i];
+            count++;
+        }
+    }
+    return count;
 }
 
 /**
@@ -29,7 +45,52 @@
  */
 int compactWithScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    //return -1;
+
+    int* mapdata = new int[n];
+    int* scandata = new int[n];
+
+    memset(mapdata, 0, n*sizeof(int));
+    memset(scandata, 0, n*sizeof(int));
+
+    // map to 0/1
+    for (int i = 0; i < n; i++)
+    {
+        if (idata[i] != 0)
+            mapdata[i] = 1;
+    }
+
+    // scan the map to get destination indices
+    scan(n, scandata, mapdata);
+
+    int count = 0;
+    for (int i = 0; i < n; i++)
+    {
+        //odata[count] = mapdata[i] * idata[scandata[i]];
+        //count += mapdata[i];
+        if (mapdata[i] != 0)
+        {
+            odata[scandata[i]] = idata[i];
+        }
+    }
+    count = scandata[n - 1] + mapdata[n-1];
+
+    /*
+    printf("\n%-10s", "idata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", idata[i]);
+    printf("\n%-10s", "mapdata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", mapdata[i]);
+    printf("\n%-10s", "scandata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", scandata[i]);
+    printf("\n%-10s", "odata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", odata[i]);
+    printf("\n");
+    */
+
+    delete[] mapdata;
+    delete[] scandata;
+
+    return count;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..30b68fa 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,33 +2,348 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include "stdio.h"
 
 namespace StreamCompaction {
-namespace Efficient {
+    namespace Efficient {
 
-// TODO: __global__
+        // TODO: __global__
+#define blockSize 128
 
-/**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+        //int* dev_idata;
+        //int* dev_odata;
+        //int* dev_bools;
+        //int* dev_indices;
+        //int* dev_scandata;
 
+        // Upsweep (reduce) step at depth d: indices that are multiples
+        // of 2^(d+1) accumulate the partial sum of their left child.
+        __global__ void kernupsweep(int n, int d, int* dev_scandata)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k % (1 << (d + 1)) == 0)
+                dev_scandata[k + (1 << (d + 1)) - 1] += dev_scandata[k + (1 << d) - 1];
+        }
+
+        // Downsweep step at depth d: swap and accumulate to turn the
+        // reduction tree into an exclusive scan.
+        __global__ void kerndownsweep(int n, int d, int* dev_scandata)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k % (1 << (d + 1)) == 0)
+            {
+                int t = dev_scandata[k + (1 << d) - 1];
+                dev_scandata[k + (1 << d) - 1] = dev_scandata[k + (1 << (d + 1)) - 1];
+                dev_scandata[k + (1 << (d + 1)) - 1] = t + dev_scandata[k + (1 << (d + 1)) - 1];
+            }
+        }
+
+        __global__ void kernscan(int n, int d, int* dev_scandata, int* dev_scandata2)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k >= (1 << (d - 1)))
+                dev_scandata[k] = dev_scandata2[k] + dev_scandata2[k - (1 << (d - 1))];
+        }
+
+        __global__ void kerninc2exc(int n, int* dev_scandata, int* dev_scandata2)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k == 0)
+                dev_scandata[0] = 0;
+            else
+                dev_scandata[k] = dev_scandata2[k - 1];
+        }
+
+        /*
+        void setup(int n, const int* idata)
+        {
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        }
+
+        void cleanup(void)
+        {
+            cudaFree(dev_scandata);
+        }
+
+        void setupcompact(int n, const int* idata)
+        {
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            cudaMalloc((void**)&dev_indices, n * sizeof(int));
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+            cudaMemset(dev_bools, 0, sizeof(int) * n);
+            cudaMemset(dev_indices, 0, sizeof(int) * n);
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        }
+
+        void cleanupcompact(void)
+        {
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_scandata);
+        }
+        */
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // TODO
+            //printf("TODO\n");
+
+            // Pad the array to the next power of two; the tree-based
+            // sweeps assume a full balanced tree. Only the first nold
+            // elements are copied in and out; the padding stays zero.
+            int nold = n;
+            n = 1 << ilog2ceil(n);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            //setup(n, idata);
+            int* dev_scandata;
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+            cudaMemset(dev_scandata, 0, n * sizeof(int));
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * nold, cudaMemcpyHostToDevice);
+
+            for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+            {
+                kernupsweep<<<blocks, blockSize>>>(n, d, dev_scandata);
+            }
+
+            // set the root to zero, then sweep back down
+            cudaMemset(dev_scandata + (n - 1), 0, sizeof(int));
+            for (int d = ilog2ceil(n); d >= 0; d--)
+            {
+                kerndownsweep<<<blocks, blockSize>>>(n, d, dev_scandata);
+            }
+            cudaMemcpy(odata, dev_scandata, sizeof(int) * nold, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_scandata);
+            //cleanup();
+        }
+
+        float timeKernUpsweep(int n, const int *idata)
+        {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+
+            if (n % 2 == 0)
+            {
+                for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+                {
+                    kernupsweep<<<blocks, blockSize>>>(n, d, dev_idata);
+                }
+            }
+            cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+            //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+            cudaEventDestroy(start); cudaEventDestroy(stop);
+
+            cudaFree(dev_idata);
+
+            return milliseconds;
+        }
+
+        float timeKernDownsweep(int n, const int *idata)
+        {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+
+            if (n % 2 == 0)
+            {
+                for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+                {
+                    kerndownsweep<<<blocks, blockSize>>>(n, d, dev_idata);
+                }
+            }
+            cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+            //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+            cudaEventDestroy(start); cudaEventDestroy(stop);
+
+            cudaFree(dev_idata);
+
+            return milliseconds;
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            // TODO
+            int count = 0;
+
+            if (n % ilog2ceil(n) == 0)
+            {
+                n = 1 << ilog2ceil(n);
+                dim3 blocks((n + blockSize - 1) / blockSize);
+
+                //setupcompact(n, idata);
+                int* dev_idata;
+                int* dev_odata;
+                int* dev_bools;
+                int* dev_indices;
+                int* dev_scandata;
+                cudaMalloc((void**)&dev_idata, n * sizeof(int));
+                cudaMalloc((void**)&dev_odata, n * sizeof(int));
+                cudaMalloc((void**)&dev_bools, n * sizeof(int));
+                cudaMalloc((void**)&dev_indices, n * sizeof(int));
+                cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+                int* bools = new int[n];
+                int* indices = new int[n];
+                memset(bools, 0, sizeof(int)*n);
+                memset(indices, 0, sizeof(int)*n);
+                cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+                cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+                // map to 0/1, scan the map, then scatter
+                StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_bools, dev_idata);
+
+                scan(n, dev_indices, dev_bools);
+                cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+                cudaMemcpy(indices, dev_indices, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+                StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+                cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+                /*
+                printf("\n%-10s", "bools: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", bools[i]);
+                printf("\n%-10s", "scans: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", indices[i]);
+                printf("\n%-10s", "idata: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", idata[i]);
+                printf("\n%-10s", "odata: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", odata[i]);
+                printf("\n");
+                */
+
+                count = indices[n - 1] + bools[n - 1];
+
+                //cleanupcompact();
+                cudaFree(dev_idata);
+                cudaFree(dev_odata);
+                cudaFree(dev_bools);
+                cudaFree(dev_indices);
+                cudaFree(dev_scandata);
+
+                delete[] bools;
+                delete[] indices;
+            }
+            else
+            {
+                n = 1 << ilog2ceil(n);
+                dim3 blocks((n + blockSize - 1) / blockSize);
+
+                //setupcompact(n, idata);
+                int* dev_idata;
+                int* dev_odata;
+                int* dev_bools;
+                int* dev_indices;
+                int* dev_scandata;
+                cudaMalloc((void**)&dev_idata, n * sizeof(int));
+                cudaMalloc((void**)&dev_odata, n * sizeof(int));
+                cudaMalloc((void**)&dev_bools, n * sizeof(int));
+                cudaMalloc((void**)&dev_indices, n * sizeof(int));
+                cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+                int* bools = new int[n];
+                int* indices = new int[n];
+                memset(bools, 0, sizeof(int)*n);
+                memset(indices, 0, sizeof(int)*n);
+                cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+                cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+                StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_bools, dev_idata);
+
+                scan(n, dev_indices, dev_bools);
+                cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+                cudaMemcpy(indices, dev_indices, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+                StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+                cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+                count = indices[n - 1] + bools[n - 1] - 1;
+                //for (int i = 0; i < n; i++)
+                //    printf("%d ", indices[i]);
+                //for (int i = 0; i < n; i++)
+                //    printf("%d ", bools[i]);
+                //cleanupcompact();
+                cudaFree(dev_idata);
+                cudaFree(dev_odata);
+                cudaFree(dev_bools);
+                cudaFree(dev_indices);
+                cudaFree(dev_scandata);
+
+                delete[] bools;
+                delete[] indices;
+            }
+
+            return count;
+        }
+    }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..6e24625 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -3,6 +3,8 @@
 namespace StreamCompaction {
 namespace Efficient {
     void scan(int n, int *odata, const int *idata);
+    float timeKernUpsweep(int n, const int *idata);
+    float timeKernDownsweep(int n, const int *idata);
     int compact(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..ffdc72c 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,18 +2,130 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <cstdio>
 
 namespace StreamCompaction {
 namespace Naive {
 
 // TODO: __global__
+
+#define blockSize 128
+
+    int* dev_scandata;
+    int* dev_scandata2;
+
+    // One naive scan step at depth d: elements past the offset add the
+    // element 2^(d-1) positions to their left (double-buffered).
+    __global__ void kernscan(int n, int d, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k >= 1 << (d - 1))
+        {
+            dev_scandata[k] = dev_scandata2[k] + dev_scandata2[k - (1 << (d - 1))];
+            dev_scandata2[k] = dev_scandata[k]; //swap
+        }
+    }
+
+    // Shift the inclusive scan right by one to make it exclusive.
+    __global__ void kerninc2exc(int n, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k == 0)
+            dev_scandata[0] = 0;
+        else
+            dev_scandata[k] = dev_scandata2[k-1];
+    }
+
+    float timeKernScan(int n, int *odata, const int *idata)
+    {
+        int* dev_odata;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_odata, n * sizeof(int));
+        cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+        cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+        int count = 0;
+        if (n % 2 == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+            {
+                // time the scan steps on the locally allocated buffers
+                kernscan<<<blocks, blockSize>>>(n, d, dev_odata, dev_idata);
+                count++;
+            }
+        }
+        cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+        //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+        cudaEventDestroy(start); cudaEventDestroy(stop);
+
+        cudaFree(dev_odata);
+        cudaFree(dev_idata);
+
+        return milliseconds;
+    }
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    void scan(int n, int *odata, const int *idata) {
+        // TODO
+        //printf("TODO\n");
+
+        cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+        cudaMalloc((void**)&dev_scandata2, n * sizeof(int));
+
+        cudaMemcpy(dev_scandata2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_scandata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        int count = 0;
+        if (n % ilog2(n) == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+            {
+                kernscan<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+                count++;
+            }
+        }
+        else
+        {
+            for (int d = 1; d <= ilog2ceil(n)+1; d++)
+            {
+                kernscan<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+                count++;
+            }
+        }
+
+        // shift the inclusive result right to make the scan exclusive
+        if (n % ilog2(n) == 0)
+        {
+            kerninc2exc<<<blocks, blockSize>>>(n, dev_scandata2, dev_scandata);
+            cudaMemcpy(odata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        }
+        else
+        {
+            kerninc2exc<<<blocks, blockSize>>>(n, dev_scandata, dev_scandata2);
+            cudaMemcpy(odata, dev_scandata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        }
+
+        cudaFree(dev_scandata);
+        cudaFree(dev_scandata2);
 }
 
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..10a82d7 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -3,5 +3,6 @@
 namespace StreamCompaction {
 namespace Naive {
     void scan(int n, int *odata, const int *idata);
+    float timeKernScan(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..a0f3e0c
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,285 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+#include "radix.h"
+#include "efficient.h"
+#include "thrust.h"
+#include "cpu.h"
+#include <cmath>
+
+namespace StreamCompaction {
+namespace Radix {
+
+#define blockSize 128
+
+    // Returns decimal digit number i (1-based, counting from the least
+    // significant digit) of `number`, or -1 if the number has fewer digits.
+    int getDigit(int number, int i)
+    {
+        int d = 0;
+        if (i == 1)
+        {
+            d = number % 10;
+            return d;
+        }
+        else if (i > 1)
+        {
+            int currpower = (int)pow(10.0, i);
+            int prevpower = (int)pow(10.0, i - 1);
+
+            if (number < prevpower)
+                return -1;
+
+            int nextdigits = number % (prevpower);
+            d = (number % currpower - nextdigits) / prevpower;
+
+            return d;
+        }
+        return -1;
+    }
+
+    // Extracts decimal digit `sbit` (1-based from the least significant
+    // digit) of each element. Digits past the end of a number resolve
+    // to 0, i.e. a leading zero.
+    __global__ void kernDecimalsMap(int n, int sbit, int *decimals, const int *idata) {
+        // TODO
+        int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (index >= n)
+            return;
+
+        int number = idata[index];
+
+        // get digit
+        int d = 0;
+        if (sbit == 1)
+        {
+            d = number % 10;
+        }
+        else if (sbit > 1)
+        {
+            int currpower = (int)(pow(double(10), double(sbit)));
+            int prevpower = (int)(pow(double(10), double(sbit - 1)));
+
+            int nextdigits = number % (prevpower);
+            d = (number % currpower - nextdigits) / prevpower;
+        }
+        else
+            d = -1;
+        // end get digit
+
+        decimals[index] = d;
+    }
+
+    // Shift an exclusive scan left by one to make it inclusive.
+    __global__ void kernexc2inc(int n, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n-1)
+            return;
+
+        dev_scandata[k] = dev_scandata2[k+1];
+    }
+
+    void exc2inc(int n, int *odata, const int *idata)
+    {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+        int* dev_idata;
+        int* dev_odata;
+        cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+        cudaMalloc((void**)&dev_odata, sizeof(int) * n);
+        cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+        kernexc2inc<<<blocks, blockSize>>>(n, dev_odata, dev_idata);
+        cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+        // the last element keeps its original count; adding the previous
+        // inclusive value completes the inclusive scan
+        if (n > 1)
+            odata[n - 1] += odata[n-2];
+
+        cudaFree(dev_idata);
+        cudaFree(dev_odata);
+    }
+
+    int findMax(int n, const int* nums)
+    {
+        int max = 0;
+        for (int i = 0; i < n; i++)
+            max = max < nums[i] ? nums[i] : max;
+        return max;
+    }
+
+    // Parallel max-reduction in the same double-buffered style as the
+    // naive scan; the running maximum ends up in the last element.
+    __global__ void kernFindMax(int n, int d, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k >= 1 << (d - 1))
+        {
+            dev_scandata[k] = dev_scandata2[k] < dev_scandata2[k - (1 << (d - 1))] ? dev_scandata2[k - (1 << (d - 1))] : dev_scandata2[k];
+            dev_scandata2[k] = dev_scandata[k]; //swap
+        }
+    }
+
+    void findMaxGPU(int n, int *odata, const int *idata)
+    {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+        int* dev_scandata;
+        int* dev_scandata2;
+        cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+        cudaMalloc((void**)&dev_scandata2, n * sizeof(int));
+
+        cudaMemcpy(dev_scandata2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_scandata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        if (n % 2 == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+                kernFindMax<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+        }
+        else
+        {
+            for (int d = 1; d <= ilog2ceil(n) + 1; d++)
+                kernFindMax<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+        }
+
+        cudaMemcpy(odata, dev_scandata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        cudaFree(dev_scandata);
+        cudaFree(dev_scandata2);
+    }
+
+/**
+ * Sorts idata into odata with a base-10 radix sort on the CPU.
+ */
+    void radixCPU(int n, int *odata, const int *idata) {
+        int* mapping = new int[10];
+        int* idatacpy = new int[n];
+        memcpy(idatacpy, idata, sizeof(int)*n);
+
+        memset(odata, 0, sizeof(int)*n);
+
+        int sbit = 1;
+        bool notdone = true;
+
+        while (notdone)
+        {
+            memset(mapping, 0, sizeof(int) * 10);
+            notdone = false;
+            for (int i = 0; i < n; i++)
+            {
+                int number = idatacpy[i];
+                int val = getDigit(number, sbit);
+                if (val != -1)
+                {
+                    mapping[val]++;
+                    notdone = true;
+                }
+                else
+                    mapping[0]++;
+            }
+            if (notdone == false)
+                break;
+
+            // inclusive scan of the 10 digit counts
+            for (int i = 1; i < 10; i++)
+                mapping[i] += mapping[i - 1];
+
+            // stable back-to-front scatter
+            for (int i = n - 1; i >= 0; i--)
+            {
+                int number = idatacpy[i];
+                int digit = getDigit(number, sbit);
+                int mapidx = digit == -1 ? 0 : digit;
+                mapping[mapidx] -= 1;
+                int index = mapping[mapidx];
+                odata[index] = number;
+            }
+
+            memcpy(idatacpy, odata, sizeof(int)*n);
+            sbit += 1;
+        }
+
+        delete[] mapping;
+        delete[] idatacpy;
+    }
+
+    void radixGPU(int n, int *odata, const int *idata, gputesttype testtype) {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        int* dev_decmapping;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_decmapping, sizeof(int) * n);
+        cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+        cudaMemset(dev_decmapping, 0, sizeof(int) * n);
+        cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+        int* idatacpy = new int[n];
+        int* decmapping = new int[n];
+        int* deccount = new int[10];
+        int* decscan = new int[10];
+        memcpy(idatacpy, idata, sizeof(int) * n);
+        memset(decmapping, 0, sizeof(int) * n);
+
+        memset(odata, 0, sizeof(int)*n);
+
+        // the running max ends up in odata[n - 1]
+        findMaxGPU(n, odata, idata);
+        int maxnum = odata[n - 1];
+
+        memset(odata, 0, sizeof(int)*n);
+
+        // number of decimal digits of the largest element
+        int maxbit = maxnum > 0 ? (int)log10((double)maxnum) + 1 : 1;
+
+        for (int sbit = 1; sbit <= maxbit; sbit++)
+        {
+            memset(deccount, 0, sizeof(int) * 10);
+            memset(decscan, 0, sizeof(int) * 10);
+
+            kernDecimalsMap<<<blocks, blockSize>>>(n, sbit, dev_decmapping, dev_idata);
+            cudaMemcpy(decmapping, dev_decmapping, sizeof(int)*n, cudaMemcpyDeviceToHost);
+            for (int i = 0; i < n; i++)
+                deccount[decmapping[i]]++;
+
+            // exclusive scan of the 10 digit counts with the chosen backend
+            if (testtype == 0)
+                CPU::scan(10, decscan, deccount);
+            else if (testtype == 1)
+                Naive::scan(10, decscan, deccount);
+            else if (testtype == 2)
+                Thrust::scan(10, decscan, deccount);
+
+            // radix needs an inclusive scan for the back-to-front scatter
+            exc2inc(10, deccount, decscan);
+
+            for (int i = n - 1; i >= 0; i--)
+            {
+                int number = idatacpy[i];
+                int digit = getDigit(number, sbit);
+                int mapidx = digit < 0 ? 0 : digit;
+                deccount[mapidx] -= 1;
+                int index = deccount[mapidx];
+                if (index >= 0)
+                    odata[index] = number;
+            }
+
+            memcpy(idatacpy, odata, sizeof(int) * n);
+        }
+
+        cudaFree(dev_decmapping);
+        cudaFree(dev_idata);
+
+        delete[] decmapping;
+        delete[] deccount;
+        delete[] decscan;
+        delete[] idatacpy;
+    }
+}
+}
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..5f65e70
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,11 @@
+#pragma once
+
+
+namespace StreamCompaction {
+namespace Radix {
+    enum gputesttype { CPU, NAIVE, THRUST };
+
+    void radixCPU(int n, int *odata, const int *idata);
+    void radixGPU(int n, int *odata, const int *idata, gputesttype testtype);
+}
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..1b0334d 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -9,6 +9,31 @@
 namespace StreamCompaction {
 namespace Thrust {
 
+    float timeThrust(int n, int *odata, const int *idata)
+    {
+        int* dev_odata;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_odata, n * sizeof(int));
+        cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+        cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        thrust::host_vector<int> dv_in(idata, idata + n);
+        thrust::host_vector<int> dv_out(odata, odata + n);
+
+        cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+        thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+        cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+        //printf("\nELAPSED TIME = %f\n", milliseconds);
+        cudaEventDestroy(start); cudaEventDestroy(stop);
+
+        cudaFree(dev_odata);
+        cudaFree(dev_idata);
+
+        return milliseconds;
+    }
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
@@ -16,6 +41,18 @@
 void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    thrust::host_vector<int> dv_in(idata, idata + n);
+    thrust::host_vector<int> dv_out(odata, odata + n);
+
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    //odata = &(dv_out.front());
+    for (int i = 0; i < n; i++)
+        odata[i] = dv_out[i];
 }
 
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..ccacb34 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -3,5 +3,6 @@
 namespace StreamCompaction {
 namespace Thrust {
     void scan(int n, int *odata, const int *idata);
+    float timeThrust(int n, int *odata, const int *idata);
 }
 }