diff --git a/README.md b/README.md
index b71c458..912d688 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,675 @@
 CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* (TODO) Rony Edde (redde)
+* Tested on: Windows 10, i7-6700k @ 4.00GHz 64GB, GTX 980M 8GB (Personal Laptop)
 ### (TODO: Your README)
 Include analysis, etc. (Remember, this is public, so don't
 put anything here that you don't want to share with the world.)
+
+This is an implementation of scan and reduce on the CPU and GPU
+for stream compaction.
+There are two types of arrays to consider, powers of 2 and
+non-powers of 2, both of which are part of the tests.
+
+* Scan
+  * The first method runs on the CPU, where all elements are
+    added using a for loop. This works very well on small arrays.
+
+  * The second method uses the GPU to compute the scan result.
+    This is a naive implementation: at each depth d, out of
+    log2(n) depths in total, every element at index k >= 2^(d-1)
+    adds the element 2^(d-1) positions to its left, and the
+    result is then shifted to make the scan exclusive.
+    This is faster than the CPU version for large arrays.
+
+  * The third method uses a double sweep: an upsweep, followed by
+    a downsweep, in a balanced tree form. Each sweep takes log2(n)
+    iterations, but the GPU calls only do work on indices that are
+    multiples of 2^(d+1). This should be fast because there are
+    only O(n) adds for the upsweep, and O(n) adds and O(n) swaps
+    for the downsweep.
+
+  * Thrust scan uses thrust::exclusive_scan, which is built into
+    the CUDA Thrust library.
+
+* Stream Compaction
+  * The first implementation is on the CPU, where a for loop looks
+    for values greater than 0 and adds them to the new array,
+    incrementing the count whenever a non-zero value is found.
+
+  * The second implementation also runs on the CPU but uses the
+    scan function to look up indices.
+
+  * The third implementation uses the GPU to generate a [0 1]
+    mapped array, which is then run through the GPU scan function
+    and used as an index lookup for placing the elements.
+    After the scatter step, all non-zero elements are placed
+    contiguously, which results in a compact array with no zeros.
+
+* Thrust
+  * The implementation is mentioned in the Scan section.
+
+* Radix sort
+  * There are 2 versions of the Radix sort. The first one runs
+    on the CPU, using a CPU version of scan.
+
+  * The second version uses the GPU to run the scan. It also uses
+    a GPU function to determine the maximum number in the array;
+    this determines how many passes are needed to cover every
+    decimal digit of that maximum number.
+    Another GPU function is used to shift the scan from exclusive
+    to inclusive, which is needed for a correct radix sort.
+    There are multiple scan functions that can be used; a few are
+    benchmarked.
+
+* Benchmarks
+  * Running benchmarks on a range of 256 to 268435456 elements,
+    incrementing the exponent of 2 by 4 each step (i.e. multiplying
+    the size by 16), gave the following results:
+
+```
+****************
+** SCAN TESTS **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+==== cpu scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6146 6190 ]
+    passed
+==== naive scan, power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 6203 6229 ]
+    passed
+==== naive scan, non-power-of-two ====
+    [ 0 38 57 95 132 137 184 199 234 234 246 249 249 ... 0 0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+    passed
+==== work-efficient scan, non-power-of-two ====
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+==== cpu radix sort ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort cpu scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort naive scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+==== gpu radix sort thrust scan ====
+    [ 0 0 0 0 0 0 1 1 1 1 2 2 2 ... 49 49 ]
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 2 3 2 1 3 1 1 1 2 0 1 0 2 ... 0 0 ]
+==== cpu compact without scan, power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 3 2 ]
+    passed
+==== cpu compact with scan ====
+    [ 2 3 2 1 3 1 1 1 2 1 2 1 1 ... 2 1 ]
+    passed
+==== work-efficient compact, power-of-two ====
+    passed
+==== work-efficient compact, non-power-of-two ====
+    passed
+
+
+==== BENCHMARK on 256 items ====
+naive scan time: 0.054762
+upsweep time: 0.053930
+downsweep time: 0.066717
+thrust time: 0.001216
+
+==== BENCHMARK on 4096 items ====
+naive scan time: 0.055395
+upsweep time: 0.056800
+downsweep time: 0.063933
+thrust time: 0.001216
+
+==== BENCHMARK on 65536 items ====
+naive scan time: 0.055034
+upsweep time: 0.054170
+downsweep time: 0.063862
+thrust time: 0.001190
+
+==== BENCHMARK on 1048576 items ====
+naive scan time: 0.055942
+upsweep time: 0.057514
+downsweep time: 0.066336
+thrust time: 0.001194
+
+==== BENCHMARK on 16777216 items ====
+naive scan time: 0.057802
+upsweep time: 0.056128
+downsweep time: 0.066230
+thrust time: 0.001184
+
+==== BENCHMARK on 268435456 items ====
+naive scan time: 0.056563
+upsweep time: 0.055456
+downsweep time: 0.066339
+thrust time: 0.001190
+
+-------- GPU calls results --------
+naive:     0.054762 0.055395 0.055034 0.055942 0.057802 0.056563
+upsweep:   0.053930 0.056800 0.054170 0.057514 0.056128 0.055456
+downsweep: 0.066717 0.063933 0.063862 0.066336 0.066230 0.066339
+thrust:    0.001216 0.001216 0.001190 0.001194 0.001184 0.001190
+
+****************
+** BENCHMARKS **
+****************
+
+** SIZE = 256 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 0 microSeconds
+==== naive scan, power-of-two ====
+Time: 1204 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 747 microSeconds
+==== thrust scan, power-of-two ====
+Time: 58 microSeconds
+==== cpu radix sort ====
+Time: 89 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 1209 microSeconds
+==== gpu radix sort naive scan ====
+Time: 1356 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 1080 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 0 microSeconds
+==== cpu compact with scan ====
+Time: 2 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1020 microSeconds
+
+** SIZE = 512 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 11 0 ]
+==== cpu scan, power-of-two ====
+Time: 1 microSeconds
+==== naive scan, power-of-two ====
+Time: 563 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 657 microSeconds
+==== thrust scan, power-of-two ====
+Time: 91 microSeconds
+==== cpu radix sort ====
+Time: 142 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2253 microSeconds
+==== gpu radix sort naive scan ====
+Time: 2997 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2042 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 1 microSeconds
+==== cpu compact with scan ====
+Time: 7 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 862 microSeconds
+
+** SIZE = 1024 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 40 0 ]
+==== cpu scan, power-of-two ====
+Time: 2 microSeconds
+==== naive scan, power-of-two ====
+Time: 595 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 674 microSeconds
+==== thrust scan, power-of-two ====
+Time: 178 microSeconds
+==== cpu radix sort ====
+Time: 282 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2152 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3216 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2087 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 3 microSeconds
+==== cpu compact with scan ====
+Time: 9 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 829 microSeconds
+
+** SIZE = 2048 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 32 0 ]
+==== cpu scan, power-of-two ====
+Time: 4 microSeconds
+==== naive scan, power-of-two ====
+Time: 546 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 766 microSeconds
+==== thrust scan, power-of-two ====
+Time: 371 microSeconds
+==== cpu radix sort ====
+Time: 568 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2536 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3176 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2185 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 6 microSeconds
+==== cpu compact with scan ====
+Time: 19 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 853 microSeconds
+
+** SIZE = 4096 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 24 0 ]
+==== cpu scan, power-of-two ====
+Time: 8 microSeconds
+==== naive scan, power-of-two ====
+Time: 615 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 786 microSeconds
+==== thrust scan, power-of-two ====
+Time: 704 microSeconds
+==== cpu radix sort ====
+Time: 1135 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 2460 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3368 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2429 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 14 microSeconds
+==== cpu compact with scan ====
+Time: 40 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 983 microSeconds
+
+** SIZE = 8192 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 4 0 ]
+==== cpu scan, power-of-two ====
+Time: 18 microSeconds
+==== naive scan, power-of-two ====
+Time: 629 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 784 microSeconds
+==== thrust scan, power-of-two ====
+Time: 1444 microSeconds
+==== cpu radix sort ====
+Time: 2306 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 3193 microSeconds
+==== gpu radix sort naive scan ====
+Time: 3786 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 2776 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 30 microSeconds
+==== cpu compact with scan ====
+Time: 81 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1021 microSeconds
+
+** SIZE = 16384 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 32 microSeconds
+==== naive scan, power-of-two ====
+Time: 653 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1079 microSeconds
+==== thrust scan, power-of-two ====
+Time: 2876 microSeconds
+==== cpu radix sort ====
+Time: 4595 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 4210 microSeconds
+==== gpu radix sort naive scan ====
+Time: 4916 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 3774 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 63 microSeconds
+==== cpu compact with scan ====
+Time: 165 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1257 microSeconds
+
+** SIZE = 32768 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 7 0 ]
+==== cpu scan, power-of-two ====
+Time: 65 microSeconds
+==== naive scan, power-of-two ====
+Time: 808 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1274 microSeconds
+==== thrust scan, power-of-two ====
+Time: 5643 microSeconds
+==== cpu radix sort ====
+Time: 8982 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 6846 microSeconds
+==== gpu radix sort naive scan ====
+Time: 7885 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 6811 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 128 microSeconds
+==== cpu compact with scan ====
+Time: 335 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1705 microSeconds
+
+** SIZE = 65536 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 35 0 ]
+==== cpu scan, power-of-two ====
+Time: 131 microSeconds
+==== naive scan, power-of-two ====
+Time: 1140 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 1760 microSeconds
+==== thrust scan, power-of-two ====
+Time: 11324 microSeconds
+==== cpu radix sort ====
+Time: 18094 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 11267 microSeconds
+==== gpu radix sort naive scan ====
+Time: 11923 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 10084 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 260 microSeconds
+==== cpu compact with scan ====
+Time: 672 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 2814 microSeconds
+
+** SIZE = 131072 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ]
+==== cpu scan, power-of-two ====
+Time: 263 microSeconds
+==== naive scan, power-of-two ====
+Time: 1650 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 3079 microSeconds
+==== thrust scan, power-of-two ====
+Time: 22802 microSeconds
+==== cpu radix sort ====
+Time: 36258 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 19548 microSeconds
+==== gpu radix sort naive scan ====
+Time: 19723 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 18615 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 517 microSeconds
+==== cpu compact with scan ====
+Time: 1571 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1465 microSeconds
+
+** SIZE = 262144 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 569 microSeconds
+==== naive scan, power-of-two ====
+Time: 17 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 25 microSeconds
+==== thrust scan, power-of-two ====
+Time: 45705 microSeconds
+==== cpu radix sort ====
+Time: 73556 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 5243 microSeconds
+==== gpu radix sort naive scan ====
+Time: 5223 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 5214 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 1056 microSeconds
+==== cpu compact with scan ====
+Time: 3225 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 612 microSeconds
+
+** SIZE = 524288 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 1139 microSeconds
+==== naive scan, power-of-two ====
+Time: 18 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 26 microSeconds
+==== thrust scan, power-of-two ====
+Time: 92465 microSeconds
+==== cpu radix sort ====
+Time: 144470 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 10536 microSeconds
+==== gpu radix sort naive scan ====
+Time: 10634 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 10634 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 2098 microSeconds
+==== cpu compact with scan ====
+Time: 6596 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 1197 microSeconds
+
+** SIZE = 1048576 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 6 0 ]
+==== cpu scan, power-of-two ====
+Time: 2288 microSeconds
+==== naive scan, power-of-two ====
+Time: 19 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 27 microSeconds
+==== thrust scan, power-of-two ====
+Time: 185075 microSeconds
+==== cpu radix sort ====
+Time: 289369 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 22012 microSeconds
+==== gpu radix sort naive scan ====
+Time: 21773 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 21896 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 4228 microSeconds
+==== cpu compact with scan ====
+Time: 13323 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 2729 microSeconds
+
+** SIZE = 2097152 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 2 0 ]
+==== cpu scan, power-of-two ====
+Time: 4616 microSeconds
+==== naive scan, power-of-two ====
+Time: 20 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 29 microSeconds
+==== thrust scan, power-of-two ====
+Time: 369500 microSeconds
+==== cpu radix sort ====
+Time: 587131 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 44624 microSeconds
+==== gpu radix sort naive scan ====
+Time: 44796 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 44965 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 8423 microSeconds
+==== cpu compact with scan ====
+Time: 27154 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 6133 microSeconds
+
+** SIZE = 4194304 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 26 0 ]
+==== cpu scan, power-of-two ====
+Time: 9468 microSeconds
+==== naive scan, power-of-two ====
+Time: 20 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 33 microSeconds
+==== thrust scan, power-of-two ====
+Time: 736214 microSeconds
+==== cpu radix sort ====
+Time: 1182810 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 90747 microSeconds
+==== gpu radix sort naive scan ====
+Time: 89688 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 90998 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 17085 microSeconds
+==== cpu compact with scan ====
+Time: 54581 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 12071 microSeconds
+
+** SIZE = 8388608 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 10 0 ]
+==== cpu scan, power-of-two ====
+Time: 18279 microSeconds
+==== naive scan, power-of-two ====
+Time: 24 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 35 microSeconds
+==== thrust scan, power-of-two ====
+Time: 1473584 microSeconds
+==== cpu radix sort ====
+Time: 2342165 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 181044 microSeconds
+==== gpu radix sort naive scan ====
+Time: 182013 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 181109 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 34329 microSeconds
+==== cpu compact with scan ====
+Time: 109976 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 23858 microSeconds
+
+** SIZE = 16777216 **
+****************
+    [ 38 19 38 37 5 47 15 35 0 12 3 0 42 ... 42 0 ]
+==== cpu scan, power-of-two ====
+Time: 37124 microSeconds
+==== naive scan, power-of-two ====
+Time: 23 microSeconds
+==== work-efficient scan, power-of-two ====
+Time: 35 microSeconds
+==== thrust scan, power-of-two ====
+Time: 2988794 microSeconds
+==== cpu radix sort ====
+Time: 4724527 microSeconds
+==== gpu radix sort cpu scan ====
+Time: 374356 microSeconds
+==== gpu radix sort naive scan ====
+Time: 367443 microSeconds
+==== gpu radix sort thrust scan ====
+Time: 359604 microSeconds
+==== cpu compact without scan, power-of-two ====
+Time: 68579 microSeconds
+==== cpu compact with scan ====
+Time: 218190 microSeconds
+==== work-efficient compact, power-of-two ====
+Time: 49360 microSeconds
+Press any key to continue . . .
+```
+
+* Large scale results (in microseconds):
+![largescale](./images/bench1.png)
+
+* Small scale results (in microseconds):
+![smallscale](./images/bench2.png)
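+
+For reference, the GPU kernels compared in these graphs boil down
+to the following per-depth steps. This is a condensed, illustrative
+sketch that mirrors the kernels added to naive.cu and efficient.cu
+further down in this diff; the names here are shortened for the
+example:
+
+```
+// Naive scan, launched once per depth d = 1..log2(n): every element
+// past the offset adds the element 2^(d-1) positions to its left.
+__global__ void naiveStep(int n, int d, int* out, const int* in) {
+    int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (k >= n) return;
+    int offset = 1 << (d - 1);
+    if (k >= offset)
+        out[k] = in[k] + in[k - offset];
+}
+
+// Work-efficient upsweep, launched once per depth d: only indices
+// that are multiples of 2^(d+1) do any work, which is why there are
+// only O(n) adds in total.
+__global__ void upsweepStep(int n, int d, int* data) {
+    int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (k >= n) return;
+    if (k % (1 << (d + 1)) == 0)
+        data[k + (1 << (d + 1)) - 1] += data[k + (1 << d) - 1];
+}
+```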
+
+The difference between the CPU and GPU implementations is obvious.
+When looking at small arrays, the CPU takes an interesting
+advantage. On large arrays, however, the CPU implementations scale
+very poorly, hence the need for two graph scales.
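+
+The CPU baseline in these graphs is just the sequential
+exclusive-scan loop below (this mirrors the scan added to cpu.cu
+later in this diff):
+
+```
+// Sequential exclusive scan: O(n) adds and no host/device
+// transfers, which is why the CPU wins on small arrays.
+void scanCPU(int n, int* odata, const int* idata) {
+    odata[0] = 0;
+    for (int i = 1; i < n; i++)
+        odata[i] = odata[i - 1] + idata[i - 1];
+}
+```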
+
+Radix sort suffers particularly when running on the CPU.
+One pattern that emerges is that the naive and efficient methods
+scale logarithmically. The efficient version was a fraction
+slower, which is due to the interruptions between the upsweep and
+the downsweep: every time the upsweep is called, memory has to be
+allocated and copied, and the same happens with the downsweep.
+This is shown in the last figure. Despite that, the loss is
+minimal.
+Thrust scan does well and scales linearly, albeit not as well as
+the naive and efficient methods.
+One interesting thing to note is that radix sort benefitted more
+from the thrust scan than from the other GPU implementations,
+again possibly due to the multiple GPU functions and memory
+copies. The get-max-number function runs on the GPU, and memory
+is copied back and forth.
+The biggest bottleneck in all the tests was memory copying,
+especially between the host and the device. The GPU functions run
+extremely fast in comparison to the memory copying.
+The following benchmarks took a few minutes to run, yet the GPU
+time per function is close to negligible. Time is in milliseconds:
+* GPU functions timing (in milliseconds)
+![GPU functions](./images/bench_GPU_functions.png)
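+
+These per-function timings were collected with CUDA events around
+the kernel launches only, excluding allocations and host/device
+copies. A minimal sketch of the pattern used by the timeKern*
+helpers in this diff (`kern` stands in for any of the benchmarked
+kernels):
+
+```
+cudaEvent_t start, stop;
+cudaEventCreate(&start); cudaEventCreate(&stop);
+cudaEventRecord(start);
+kern<<<blocks, blockSize>>>(n, d, dev_data);  // kernel(s) under test
+cudaEventRecord(stop);
+cudaEventSynchronize(stop);
+float ms = 0;
+cudaEventElapsedTime(&ms, start, stop);       // elapsed milliseconds
+cudaEventDestroy(start); cudaEventDestroy(stop);
+```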
+
+One bottleneck that manifests itself in the graph below comes
+from the kernDecimalsMap function. This function does not use
+enough of the GPU's resources, which creates a bottleneck that is
+minimal but not desired: about 40% of the GPU goes unused.
+This is very obvious when looking at the following timeline and
+the kernDecimalsMap function:
+* Cuda memory calls frequency:
+![timeline](./images/memoryswapping.png)
+
+* DecimalsMap bottleneck:
+![timeline](./images/bottleneck.png)
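+
+For context, kernDecimalsMap only extracts one decimal digit per
+element; each radix pass is then a 10-bucket counting sort driven
+by a scan. A minimal CPU sketch of one pass (a forward-stable
+equivalent of the implementation's back-to-front scatter; digit
+positions are 1-based, as in getDigit):
+
+```
+void radixPassCPU(int n, int sbit, int* odata, const int* idata) {
+    long p = 1;
+    for (int i = 1; i < sbit; i++) p *= 10;  // p = 10^(sbit-1)
+    int count[10] = {0};
+    for (int i = 0; i < n; i++)              // histogram of digits
+        count[(idata[i] / p) % 10]++;
+    int scan[10], sum = 0;                   // exclusive scan
+    for (int d = 0; d < 10; d++) { scan[d] = sum; sum += count[d]; }
+    for (int i = 0; i < n; i++)              // stable scatter
+        odata[scan[(idata[i] / p) % 10]++] = idata[i];
+}
+```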
diff --git a/images/bench1.png b/images/bench1.png
new file mode 100644
index 0000000..f447a49
Binary files /dev/null and b/images/bench1.png differ
diff --git a/images/bench2.png b/images/bench2.png
new file mode 100644
index 0000000..207a966
Binary files /dev/null and b/images/bench2.png differ
diff --git a/images/bench_GPU_functions.png b/images/bench_GPU_functions.png
new file mode 100644
index 0000000..86b19fa
Binary files /dev/null and b/images/bench_GPU_functions.png differ
diff --git a/images/bottleneck.png b/images/bottleneck.png
new file mode 100644
index 0000000..2b47199
Binary files /dev/null and b/images/bottleneck.png differ
diff --git a/images/memoryswapping.png b/images/memoryswapping.png
new file mode 100644
index 0000000..4890ea9
Binary files /dev/null and b/images/memoryswapping.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 675da35..a5a32ea 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,7 +11,18 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix.h>
 #include "testing_helpers.hpp"
+#include <cstdlib>
+#include <vector>
+#include <iostream>
+#include <ctime>
+#include <windows.h>
+
+int comparator(const void* a, const void* b)
+{
+    return (*(int*)a - *(int*)b);
+}
 
 int main(int argc, char* argv[]) {
     const int SIZE = 1 << 8;
@@ -43,13 +54,13 @@
     zeroArray(SIZE, c);
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
@@ -76,6 +87,40 @@
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
+    printDesc("cpu radix sort");
+    memcpy(b, a, sizeof(int)*SIZE);
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixCPU(SIZE, c, a);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort cpu scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::CPU);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort naive scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::NAIVE);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
+    printDesc("gpu radix sort thrust scan");
+    memcpy(c, a, sizeof(int)*SIZE);
+    std::qsort(b, SIZE, sizeof(int), comparator);
+    //printArray(SIZE, b, true);
+    StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::THRUST);
+    printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
     printf("\n");
     printf("*****************************\n");
     printf("** STREAM COMPACTION TESTS **\n");
@@ -120,4 +165,329 @@
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
+
+    // bench tests
+    int iternum = 1000;
+    std::vector<float> naive_results;
+    std::vector<float> upsweep_results;
+    std::vector<float> downsweep_results;
+    std::vector<float> thrust_results;
+
+    for (int i = 8; i < 15; i += 4)
+    {
+        int SZ = 1 << i;
+        int* A = new int[SZ];
+        int* C = new int[SZ];
+
+        printf("\n\n==== BENCHMARK on %d items ====", SZ);
+
+        float naivescan = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            naivescan += StreamCompaction::Naive::timeKernScan(SZ, C, A);  // time at the current size
+        }
+        printf("\nnaive scan time: %f", naivescan/iternum);
+        naive_results.push_back(naivescan / iternum);
+
+        float upsweep = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            upsweep += StreamCompaction::Efficient::timeKernUpsweep(SZ, C);
+        }
+        printf("\nupsweep time: %f", upsweep / iternum);
+        upsweep_results.push_back(upsweep / iternum);
+
+        float downsweep = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            downsweep += StreamCompaction::Efficient::timeKernDownsweep(SZ, C);
+        }
+        printf("\ndownsweep time: %f", downsweep / iternum);
+        downsweep_results.push_back(downsweep / iternum);
+
+        float thrusttime = 0;
+        for (int j = 0; j < iternum; j++)
+        {
+            genArray(SZ - 1, A, 50);
+            A[SZ - 1] = 0;
+            zeroArray(SZ, C);
+            thrusttime += StreamCompaction::Thrust::timeThrust(SZ, C, A);
+        }
+        printf("\nthrust time: %f", thrusttime / iternum);
+        thrust_results.push_back(thrusttime / iternum);
+
+        delete[] A;
+        delete[] C;
+    }
+
+    // print in format
+    printf("\n\n-------- GPU calls results --------");
+    printf("\nnaive:     "); for (int i = 0; i < naive_results.size(); i++) printf(" %f", naive_results[i]); printf("\n");
+    printf("upsweep:   "); for (int i = 0; i < upsweep_results.size(); i++) printf(" %f", upsweep_results[i]); printf("\n");
+    printf("downsweep: "); for (int i = 0; i < downsweep_results.size(); i++) printf(" %f", downsweep_results[i]); printf("\n");
+    printf("thrust:    "); for (int i = 0; i < thrust_results.size(); i++) printf(" %f", thrust_results[i]); printf("\n");
+
+    {
+        // benchmarks
+
+        // Scan tests
+
+        printf("\n");
+        printf("****************\n");
+        printf("** BENCHMARKS **\n");
+        printf("****************\n");
+
+        clock_t t;
+        int f;
+        LARGE_INTEGER start, end, freq;
+
+        int iterations = 5;
+        int maxiterations = 25;
+        for (int i = 8; i < maxiterations; i += 1)
+        {
+            int SIZE = (1 << i);
+            int NPOT = SIZE - 3;
+
+            printf("\n** SIZE = %d **\n", SIZE);
+            printf("****************\n");
+
+            int* a = new int[SIZE];
+            int* b = new int[SIZE];
+            int* c = new int[SIZE];
+            zeroArray(SIZE, a);
+            zeroArray(SIZE, b);
+            zeroArray(SIZE, c);
+
+            genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+            a[SIZE - 1] = 0;
+            printArray(SIZE, a, true);
+
+            //t = clock();
+
+            //t = clock() - t;
+            //printf("Elapsed time %f second(s).\n", t, ((float)t) / CLOCKS_PER_SEC);
+
+            zeroArray(SIZE, b);
+            printDesc("cpu scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::CPU::scan(SIZE, b, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("cpu scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::CPU::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("naive scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Naive::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("naive scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Naive::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("work-efficient scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Efficient::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("work-efficient scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Efficient::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("thrust scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Thrust::scan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("thrust scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Thrust::scan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+            */
+
+            printDesc("cpu radix sort");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixCPU(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort cpu scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::CPU);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort naive scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::NAIVE);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            printDesc("gpu radix sort thrust scan");
+            memcpy(c, a, sizeof(int)*SIZE);
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                StreamCompaction::Radix::radixGPU(SIZE, c, a, StreamCompaction::Radix::THRUST);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart)/iterations << " microSeconds\n";
+
+            // compaction
+
+            genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+            a[SIZE - 1] = 0;
+
+            zeroArray(SIZE, b);
+            printDesc("cpu compact without scan, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("cpu compact without scan, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+            */
+
+            zeroArray(SIZE, c);
+            printDesc("cpu compact with scan");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            zeroArray(SIZE, c);
+            printDesc("work-efficient compact, power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::Efficient::compact(SIZE, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+
+            /*
+            zeroArray(SIZE, c);
+            printDesc("work-efficient compact, non-power-of-two");
+            QueryPerformanceFrequency(&freq);
+            QueryPerformanceCounter(&start);
+            for (int j = 0; j < iterations; j++)
+                count = StreamCompaction::Efficient::compact(NPOT, c, a);
+            QueryPerformanceCounter(&end);
+            std::cout << "Time: " << ((end.QuadPart - start.QuadPart) * 1000000 / freq.QuadPart) / iterations << " microSeconds\n";
+            */
+
+            delete[] a;
+            delete[] b;
+            delete[] c;
+
+            Sleep(1);
+        }
+    }
+
+    return 0;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index cdbef77..bcc484e 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -9,6 +9,8 @@ set(SOURCE_FILES
     "efficient.cu"
     "thrust.h"
     "thrust.cu"
+    "radix.h"
+    "radix.cu"
     )
 
 cuda_add_library(stream_compaction
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..1ad61e8 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,12 @@ namespace Common {
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
     // TODO
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= n)
+        return;
+
+    // 1 where the input is non-zero, 0 elsewhere
+    bools[index] = idata[index] != 0;
 }
 
 /**
@@ -33,6 +39,12 @@
 __global__ void kernScatter(int n, int *odata,
         const int *idata, const int *bools, const int *indices) {
     // TODO
+    int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+    if (index >= n)
+        return;
+
+    // write kept elements to their scanned destination index
+    if (bools[index] == 1)
+        odata[indices[index]] = idata[index];
 }
 
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..f37c227 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -9,7 +9,12 @@ namespace CPU {
  */
 void scan(int n, int *odata, const int *idata) {
     // TODO
-    printf("TODO\n");
+    //printf("TODO\n");
+    odata[0] = 0;
+    for (int i = 1; i < n; i++)
+    {
+        odata[i] = odata[i-1] + idata[i-1];
+    }
 }
 
 /**
@@ -19,7 +24,18 @@
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    int count = 0;
+    odata[0] = 0;
+
+    for (int i = 0; i < n; i++)
+    {
+        if (idata[i] != 0)
+        {
+            odata[count] = idata[i];
+            count++;
+        }
+    }
+    return count;
 }
 
 /**
@@ -29,7 +45,52 @@
  */
 int compactWithScan(int n, int *odata, const int *idata) {
     // TODO
-    return -1;
+    //return -1;
+
+    int* mapdata = new int[n];
+    int* scandata = new int[n];
+
+    memset(mapdata, 0, n*sizeof(int));
+    memset(scandata, 0, n*sizeof(int));
+
+    // map to 0/1
+    for (int i = 0; i < n; i++)
+    {
+        if (idata[i] != 0)
+            mapdata[i] = 1;
+    }
+
+    // scan the map to get destination indices
+    scan(n, scandata, mapdata);
+
+    int count = 0;
+    for (int i = 0; i < n; i++)
+    {
+        //odata[count] = mapdata[i] * idata[scandata[i]];
+        //count += mapdata[i];
+        if (mapdata[i] != 0)
+        {
+            odata[scandata[i]] = idata[i];
+        }
+    }
+    count = scandata[n - 1] + mapdata[n-1];
+
+    /*
+    printf("\n%-10s", "idata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", idata[i]);
+    printf("\n%-10s", "mapdata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", mapdata[i]);
+    printf("\n%-10s", "scandata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", scandata[i]);
+    printf("\n%-10s", "odata: "); for (int i = 0; i < 20; i++)
+        printf("%d ", odata[i]);
+    printf("\n");
+    */
+
+    delete[] mapdata;
+    delete[] scandata;
+
+    return count;
 }
 
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..30b68fa 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,33 +2,348 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include "stdio.h"
 
 namespace StreamCompaction {
-namespace Efficient {
+    namespace Efficient {
 
-// TODO: __global__
+        // TODO: __global__
+#define blockSize 128
 
-/**
- * Performs prefix-sum (aka scan) on idata, storing the result into odata.
- */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
-}
+        //int* dev_idata;
+        //int* dev_odata;
+        //int* dev_bools;
+        //int* dev_indices;
+        //int* dev_scandata;
 
+        // Upsweep (reduce) step at depth d: indices that are multiples
+        // of 2^(d+1) accumulate the partial sum of their left child.
+        __global__ void kernupsweep(int n, int d, int* dev_scandata)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k % (1 << (d + 1)) == 0)
+                dev_scandata[k + (1 << (d + 1)) - 1] += dev_scandata[k + (1 << d) - 1];
+        }
+
+        // Downsweep step at depth d: swap and accumulate to turn the
+        // reduction tree into an exclusive scan.
+        __global__ void kerndownsweep(int n, int d, int* dev_scandata)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k % (1 << (d + 1)) == 0)
+            {
+                int t = dev_scandata[k + (1 << d) - 1];
+                dev_scandata[k + (1 << d) - 1] = dev_scandata[k + (1 << (d + 1)) - 1];
+                dev_scandata[k + (1 << (d + 1)) - 1] = t + dev_scandata[k + (1 << (d + 1)) - 1];
+            }
+        }
+
+        __global__ void kernscan(int n, int d, int* dev_scandata, int* dev_scandata2)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k >= (1 << (d - 1)))
+                dev_scandata[k] = dev_scandata2[k] + dev_scandata2[k - (1 << (d - 1))];
+        }
+
+        __global__ void kerninc2exc(int n, int* dev_scandata, int* dev_scandata2)
+        {
+            int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (k >= n)
+                return;
+
+            if (k == 0)
+                dev_scandata[0] = 0;
+            else
+                dev_scandata[k] = dev_scandata2[k - 1];
+        }
+
+        /*
+        void setup(int n, const int* idata)
+        {
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        }
+
+        void cleanup(void)
+        {
+            cudaFree(dev_scandata);
+        }
+
+        void setupcompact(int n, const int* idata)
+        {
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            cudaMalloc((void**)&dev_indices, n * sizeof(int));
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+            cudaMemset(dev_bools, 0, sizeof(int) * n);
+            cudaMemset(dev_indices, 0, sizeof(int) * n);
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        }
+
+        void cleanupcompact(void)
+        {
+            cudaFree(dev_idata);
+            cudaFree(dev_odata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_scandata);
+        }
+        */
+
+        /**
+         * Performs prefix-sum (aka scan) on idata, storing the result into odata.
+         */
+        void scan(int n, int *odata, const int *idata) {
+            // TODO
+            //printf("TODO\n");
+
+            // Pad the array to the next power of two; the tree-based
+            // sweeps assume a full balanced tree. Only the first nold
+            // elements are copied in and out; the padding stays zero.
+            int nold = n;
+            n = 1 << ilog2ceil(n);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            //setup(n, idata);
+            int* dev_scandata;
+            cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+            cudaMemset(dev_scandata, 0, n * sizeof(int));
+            cudaMemcpy(dev_scandata, idata, sizeof(int) * nold, cudaMemcpyHostToDevice);
+
+            for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+            {
+                kernupsweep<<<blocks, blockSize>>>(n, d, dev_scandata);
+            }
+
+            // set the root to zero, then sweep back down
+            cudaMemset(dev_scandata + (n - 1), 0, sizeof(int));
+            for (int d = ilog2ceil(n); d >= 0; d--)
+            {
+                kerndownsweep<<<blocks, blockSize>>>(n, d, dev_scandata);
+            }
+            cudaMemcpy(odata, dev_scandata, sizeof(int) * nold, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_scandata);
+            //cleanup();
+        }
+
+        float timeKernUpsweep(int n, const int *idata)
+        {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+
+            if (n % 2 == 0)
+            {
+                for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+                {
+                    kernupsweep<<<blocks, blockSize>>>(n, d, dev_idata);
+                }
+            }
+            cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+            //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+            cudaEventDestroy(start); cudaEventDestroy(stop);
+
+            cudaFree(dev_idata);
+
+            return milliseconds;
+        }
+
+        float timeKernDownsweep(int n, const int *idata)
+        {
+            int* dev_idata;
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+            cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
+            cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+
+            if (n % 2 == 0)
+            {
+                for (int d = 0; d <= ilog2ceil(n) - 1; d++)
+                {
+                    kerndownsweep<<<blocks, blockSize>>>(n, d, dev_idata);
+                }
+            }
+            cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+            //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+            cudaEventDestroy(start); cudaEventDestroy(stop);
+
+            cudaFree(dev_idata);
+
+            return milliseconds;
+        }
+
+        /**
+         * Performs stream compaction on idata, storing the result into odata.
+         * All zeroes are discarded.
+         *
+         * @param n      The number of elements in idata.
+         * @param odata  The array into which to store elements.
+         * @param idata  The array of elements to compact.
+         * @returns      The number of elements remaining after compaction.
+         */
+        int compact(int n, int *odata, const int *idata) {
+            // TODO
+            int count = 0;
+
+            if (n % ilog2ceil(n) == 0)
+            {
+                n = 1 << ilog2ceil(n);
+                dim3 blocks((n + blockSize - 1) / blockSize);
+
+                //setupcompact(n, idata);
+                int* dev_idata;
+                int* dev_odata;
+                int* dev_bools;
+                int* dev_indices;
+                int* dev_scandata;
+                cudaMalloc((void**)&dev_idata, n * sizeof(int));
+                cudaMalloc((void**)&dev_odata, n * sizeof(int));
+                cudaMalloc((void**)&dev_bools, n * sizeof(int));
+                cudaMalloc((void**)&dev_indices, n * sizeof(int));
+                cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+                int* bools = new int[n];
+                int* indices = new int[n];
+                memset(bools, 0, sizeof(int)*n);
+                memset(indices, 0, sizeof(int)*n);
+                cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+                cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+                // map to 0/1, scan the map, then scatter
+                StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_bools, dev_idata);
+
+                scan(n, dev_indices, dev_bools);
+                cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+                cudaMemcpy(indices, dev_indices, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+                StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+                cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+                /*
+                printf("\n%-10s", "bools: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", bools[i]);
+                printf("\n%-10s", "scans: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", indices[i]);
+                printf("\n%-10s", "idata: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", idata[i]);
+                printf("\n%-10s", "odata: "); for (int i = 0; i < 20; i++)
+                    printf("%d ", odata[i]);
+                printf("\n");
+                */
+
+                count = indices[n - 1] + bools[n - 1];
+
+                //cleanupcompact();
+                cudaFree(dev_idata);
+                cudaFree(dev_odata);
+                cudaFree(dev_bools);
+                cudaFree(dev_indices);
+                cudaFree(dev_scandata);
+
+                delete[] bools;
+                delete[] indices;
+            }
+            else
+            {
+                n = 1 << ilog2ceil(n);
+                dim3 blocks((n + blockSize - 1) / blockSize);
+
+                //setupcompact(n, idata);
+                int* dev_idata;
+                int* dev_odata;
+                int* dev_bools;
+                int* dev_indices;
+                int* dev_scandata;
+                cudaMalloc((void**)&dev_idata, n * sizeof(int));
+                cudaMalloc((void**)&dev_odata, n * sizeof(int));
+                cudaMalloc((void**)&dev_bools, n * sizeof(int));
+                cudaMalloc((void**)&dev_indices, n * sizeof(int));
+                cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+
+                int* bools = new int[n];
+                int* indices = new int[n];
+                memset(bools, 0, sizeof(int)*n);
+                memset(indices, 0, sizeof(int)*n);
+                cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+                cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+                StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_bools, dev_idata);
+
+                scan(n, dev_indices, dev_bools);
+                cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost);
+                cudaMemcpy(indices, dev_indices, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+                StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_bools, dev_indices);
+
+                cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+                count = indices[n - 1] + bools[n - 1] - 1;
+                //for (int i = 0; i < n; i++)
+                //    printf("%d ", indices[i]);
+                //for (int i = 0; i < n; i++)
+                //    printf("%d ", bools[i]);
+                //cleanupcompact();
+                cudaFree(dev_idata);
+                cudaFree(dev_odata);
+                cudaFree(dev_bools);
+                cudaFree(dev_indices);
+                cudaFree(dev_scandata);
+
+                delete[] bools;
+                delete[] indices;
+            }
+
+            return count;
+        }
+    }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..6e24625 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -3,6 +3,8 @@
 namespace StreamCompaction {
 namespace Efficient {
     void scan(int n, int *odata, const int *idata);
+    float timeKernUpsweep(int n, const int *idata);
+    float timeKernDownsweep(int n, const int *idata);
     int compact(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..ffdc72c 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -2,18 +2,130 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "naive.h"
+#include <cstdio>
 
 namespace StreamCompaction {
 namespace Naive {
 
 // TODO: __global__
+
+#define blockSize 128
+
+    int* dev_scandata;
+    int* dev_scandata2;
+
+    // One naive scan step at depth d: elements past the offset add the
+    // element 2^(d-1) positions to their left (double-buffered).
+    __global__ void kernscan(int n, int d, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k >= 1 << (d - 1))
+        {
+            dev_scandata[k] = dev_scandata2[k] + dev_scandata2[k - (1 << (d - 1))];
+            dev_scandata2[k] = dev_scandata[k]; //swap
+        }
+    }
+
+    // Shift the inclusive scan right by one to make it exclusive.
+    __global__ void kerninc2exc(int n, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k == 0)
+            dev_scandata[0] = 0;
+        else
+            dev_scandata[k] = dev_scandata2[k-1];
+    }
+
+    float timeKernScan(int n, int *odata, const int *idata)
+    {
+        int* dev_odata;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_odata, n * sizeof(int));
+        cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+        cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+        int count = 0;
+        if (n % 2 == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+            {
+                // time the scan steps on the locally allocated buffers
+                kernscan<<<blocks, blockSize>>>(n, d, dev_odata, dev_idata);
+                count++;
+            }
+        }
+        cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+        //printf("\nELAPSED TIME = %f\n", milliseconds);
+
+        cudaEventDestroy(start); cudaEventDestroy(stop);
+
+        cudaFree(dev_odata);
+        cudaFree(dev_idata);
+
+        return milliseconds;
+    }
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
-void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    void scan(int n, int *odata, const int *idata) {
+        // TODO
+        //printf("TODO\n");
+
+        cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+        cudaMalloc((void**)&dev_scandata2, n * sizeof(int));
+
+        cudaMemcpy(dev_scandata2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_scandata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        int count = 0;
+        if (n % ilog2(n) == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+            {
+                kernscan<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+                count++;
+            }
+        }
+        else
+        {
+            for (int d = 1; d <= ilog2ceil(n)+1; d++)
+            {
+                kernscan<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+                count++;
+            }
+        }
+
+        // shift the inclusive result right to make the scan exclusive
+        if (n % ilog2(n) == 0)
+        {
+            kerninc2exc<<<blocks, blockSize>>>(n, dev_scandata2, dev_scandata);
+            cudaMemcpy(odata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        }
+        else
+        {
+            kerninc2exc<<<blocks, blockSize>>>(n, dev_scandata, dev_scandata2);
+            cudaMemcpy(odata, dev_scandata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        }
+
+        cudaFree(dev_scandata);
+        cudaFree(dev_scandata2);
 }
 
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h
index 21152d6..10a82d7 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.h
@@ -3,5 +3,6 @@
 namespace StreamCompaction {
 namespace Naive {
     void scan(int n, int *odata, const int *idata);
+    float timeKernScan(int n, int *odata, const int *idata);
 }
 }
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 0000000..a0f3e0c
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,285 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "naive.h"
+#include "radix.h"
+#include "efficient.h"
+#include "thrust.h"
+#include "cpu.h"
+#include <cmath>
+
+namespace StreamCompaction {
+namespace Radix {
+
+#define blockSize 128
+
+    // Returns decimal digit number i (1-based, counting from the least
+    // significant digit) of `number`, or -1 if the number has fewer digits.
+    int getDigit(int number, int i)
+    {
+        int d = 0;
+        if (i == 1)
+        {
+            d = number % 10;
+            return d;
+        }
+        else if (i > 1)
+        {
+            int currpower = (int)pow(10.0, i);
+            int prevpower = (int)pow(10.0, i - 1);
+
+            if (number < prevpower)
+                return -1;
+
+            int nextdigits = number % (prevpower);
+            d = (number % currpower - nextdigits) / prevpower;
+
+            return d;
+        }
+        return -1;
+    }
+
+    // Extracts decimal digit `sbit` (1-based from the least significant
+    // digit) of each element. Digits past the end of a number resolve
+    // to 0, i.e. a leading zero.
+    __global__ void kernDecimalsMap(int n, int sbit, int *decimals, const int *idata) {
+        // TODO
+        int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (index >= n)
+            return;
+
+        int number = idata[index];
+
+        // get digit
+        int d = 0;
+        if (sbit == 1)
+        {
+            d = number % 10;
+        }
+        else if (sbit > 1)
+        {
+            int currpower = (int)(pow(double(10), double(sbit)));
+            int prevpower = (int)(pow(double(10), double(sbit - 1)));
+
+            int nextdigits = number % (prevpower);
+            d = (number % currpower - nextdigits) / prevpower;
+        }
+        else
+            d = -1;
+        // end get digit
+
+        decimals[index] = d;
+    }
+
+    // Shift an exclusive scan left by one to make it inclusive.
+    __global__ void kernexc2inc(int n, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n-1)
+            return;
+
+        dev_scandata[k] = dev_scandata2[k+1];
+    }
+
+    void exc2inc(int n, int *odata, const int *idata)
+    {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+        int* dev_idata;
+        int* dev_odata;
+        cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+        cudaMalloc((void**)&dev_odata, sizeof(int) * n);
+        cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, odata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+        kernexc2inc<<<blocks, blockSize>>>(n, dev_odata, dev_idata);
+        cudaMemcpy(odata, dev_odata, sizeof(int)*n, cudaMemcpyDeviceToHost);
+
+        // the last element keeps its original count; adding the previous
+        // inclusive value completes the inclusive scan
+        if (n > 1)
+            odata[n - 1] += odata[n-2];
+
+        cudaFree(dev_idata);
+        cudaFree(dev_odata);
+    }
+
+    int findMax(int n, const int* nums)
+    {
+        int max = 0;
+        for (int i = 0; i < n; i++)
+            max = max < nums[i] ? nums[i] : max;
+        return max;
+    }
+
+    // Parallel max-reduction in the same double-buffered style as the
+    // naive scan; the running maximum ends up in the last element.
+    __global__ void kernFindMax(int n, int d, int* dev_scandata, int* dev_scandata2)
+    {
+        int k = (blockIdx.x * blockDim.x) + threadIdx.x;
+        if (k >= n)
+            return;
+
+        if (k >= 1 << (d - 1))
+        {
+            dev_scandata[k] = dev_scandata2[k] < dev_scandata2[k - (1 << (d - 1))] ? dev_scandata2[k - (1 << (d - 1))] : dev_scandata2[k];
+            dev_scandata2[k] = dev_scandata[k]; //swap
+        }
+    }
+
+    void findMaxGPU(int n, int *odata, const int *idata)
+    {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+        int* dev_scandata;
+        int* dev_scandata2;
+        cudaMalloc((void**)&dev_scandata, n * sizeof(int));
+        cudaMalloc((void**)&dev_scandata2, n * sizeof(int));
+
+        cudaMemcpy(dev_scandata2, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_scandata, dev_scandata2, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        if (n % 2 == 0)
+        {
+            for (int d = 1; d <= ilog2ceil(n); d++)
+                kernFindMax<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+        }
+        else
+        {
+            for (int d = 1; d <= ilog2ceil(n) + 1; d++)
+                kernFindMax<<<blocks, blockSize>>>(n, d, dev_scandata, dev_scandata2);
+        }
+
+        cudaMemcpy(odata, dev_scandata, sizeof(int) * n, cudaMemcpyDeviceToHost);
+        cudaFree(dev_scandata);
+        cudaFree(dev_scandata2);
+    }
+
+/**
+ * Sorts idata into odata with a base-10 radix sort on the CPU.
+ */
+    void radixCPU(int n, int *odata, const int *idata) {
+        int* mapping = new int[10];
+        int* idatacpy = new int[n];
+        memcpy(idatacpy, idata, sizeof(int)*n);
+
+        memset(odata, 0, sizeof(int)*n);
+
+        int sbit = 1;
+        bool notdone = true;
+
+        while (notdone)
+        {
+            memset(mapping, 0, sizeof(int) * 10);
+            notdone = false;
+            for (int i = 0; i < n; i++)
+            {
+                int number = idatacpy[i];
+                int val = getDigit(number, sbit);
+                if (val != -1)
+                {
+                    mapping[val]++;
+                    notdone = true;
+                }
+                else
+                    mapping[0]++;
+            }
+            if (notdone == false)
+                break;
+
+            // inclusive scan of the 10 digit counts
+            for (int i = 1; i < 10; i++)
+                mapping[i] += mapping[i - 1];
+
+            // stable back-to-front scatter
+            for (int i = n - 1; i >= 0; i--)
+            {
+                int number = idatacpy[i];
+                int digit = getDigit(number, sbit);
+                int mapidx = digit == -1 ? 0 : digit;
+                mapping[mapidx] -= 1;
+                int index = mapping[mapidx];
+                odata[index] = number;
+            }
+
+            memcpy(idatacpy, odata, sizeof(int)*n);
+            sbit += 1;
+        }
+
+        delete[] mapping;
+        delete[] idatacpy;
+    }
+
+    void radixGPU(int n, int *odata, const int *idata, gputesttype testtype) {
+        dim3 blocks((n + blockSize - 1) / blockSize);
+
+        int* dev_decmapping;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_decmapping, sizeof(int) * n);
+        cudaMalloc((void**)&dev_idata, sizeof(int) * n);
+        cudaMemset(dev_decmapping, 0, sizeof(int) * n);
+        cudaMemcpy(dev_idata, idata, sizeof(int)*n, cudaMemcpyHostToDevice);
+
+        int* idatacpy = new int[n];
+        int* decmapping = new int[n];
+        int* deccount = new int[10];
+        int* decscan = new int[10];
+        memcpy(idatacpy, idata, sizeof(int) * n);
+        memset(decmapping, 0, sizeof(int) * n);
+
+        memset(odata, 0, sizeof(int)*n);
+
+        // the running max ends up in odata[n - 1]
+        findMaxGPU(n, odata, idata);
+        int maxnum = odata[n - 1];
+
+        memset(odata, 0, sizeof(int)*n);
+
+        // number of decimal digits of the largest element
+        int maxbit = maxnum > 0 ? (int)log10((double)maxnum) + 1 : 1;
+
+        for (int sbit = 1; sbit <= maxbit; sbit++)
+        {
+            memset(deccount, 0, sizeof(int) * 10);
+            memset(decscan, 0, sizeof(int) * 10);
+
+            kernDecimalsMap<<<blocks, blockSize>>>(n, sbit, dev_decmapping, dev_idata);
+            cudaMemcpy(decmapping, dev_decmapping, sizeof(int)*n, cudaMemcpyDeviceToHost);
+            for (int i = 0; i < n; i++)
+                deccount[decmapping[i]]++;
+
+            // exclusive scan of the 10 digit counts with the chosen backend
+            if (testtype == 0)
+                CPU::scan(10, decscan, deccount);
+            else if (testtype == 1)
+                Naive::scan(10, decscan, deccount);
+            else if (testtype == 2)
+                Thrust::scan(10, decscan, deccount);
+
+            // radix needs an inclusive scan for the back-to-front scatter
+            exc2inc(10, deccount, decscan);
+
+            for (int i = n - 1; i >= 0; i--)
+            {
+                int number = idatacpy[i];
+                int digit = getDigit(number, sbit);
+                int mapidx = digit < 0 ? 0 : digit;
+                deccount[mapidx] -= 1;
+                int index = deccount[mapidx];
+                if (index >= 0)
+                    odata[index] = number;
+            }
+
+            memcpy(idatacpy, odata, sizeof(int) * n);
+        }
+
+        cudaFree(dev_decmapping);
+        cudaFree(dev_idata);
+
+        delete[] decmapping;
+        delete[] deccount;
+        delete[] decscan;
+        delete[] idatacpy;
+    }
+}
+}
diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h
new file mode 100644
index 0000000..5f65e70
--- /dev/null
+++ b/stream_compaction/radix.h
@@ -0,0 +1,11 @@
+#pragma once
+
+
+namespace StreamCompaction {
+namespace Radix {
+    enum gputesttype { CPU, NAIVE, THRUST };
+
+    void radixCPU(int n, int *odata, const int *idata);
+    void radixGPU(int n, int *odata, const int *idata, gputesttype testtype);
+}
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..1b0334d 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -9,6 +9,31 @@
 namespace StreamCompaction {
 namespace Thrust {
 
+    float timeThrust(int n, int *odata, const int *idata)
+    {
+        int* dev_odata;
+        int* dev_idata;
+        cudaMalloc((void**)&dev_odata, n * sizeof(int));
+        cudaMalloc((void**)&dev_idata, n * sizeof(int));
+
+        cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+        cudaMemcpy(dev_odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+
+        thrust::host_vector<int> dv_in(idata, idata + n);
+        thrust::host_vector<int> dv_out(odata, odata + n);
+
+        cudaEvent_t start, stop; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start);
+        thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+        cudaEventRecord(stop); cudaEventSynchronize(stop); float milliseconds = 0; cudaEventElapsedTime(&milliseconds, start, stop);
+        //printf("\nELAPSED TIME = %f\n", milliseconds);
+        cudaEventDestroy(start); cudaEventDestroy(stop);
+
+        cudaFree(dev_odata);
+        cudaFree(dev_idata);
+
+        return milliseconds;
+    }
+
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
@@ -16,6 +41,18 @@
 void scan(int n, int *odata, const int *idata) {
     // TODO use `thrust::exclusive_scan`
     // example: for device_vectors dv_in and dv_out:
     // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    thrust::host_vector<int> dv_in(idata, idata + n);
+    thrust::host_vector<int> dv_out(odata, odata + n);
+
+    thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+    //odata = &(dv_out.front());
+    for (int i = 0; i < n; i++)
+        odata[i] = dv_out[i];
 }
 
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h
index 06707f3..ccacb34 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.h
@@ -3,5 +3,6 @@
 namespace StreamCompaction {
 namespace Thrust {
     void scan(int n, int *odata, const int *idata);
+    float timeThrust(int n, int *odata, const int *idata);
 }
 }