From 86ad524241764175f30cc526576a122d3291f5e2 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Thu, 10 Sep 2015 19:11:10 -0400
Subject: [PATCH 01/20] Implement prefix sum CPU scan.

---
 stream_compaction/cpu.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index e600c29..636de70 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -8,8 +8,10 @@ namespace CPU {
  * CPU scan (prefix sum).
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    odata[0] = 0;
+    for (int i = 1; i < n; i++) {
+        odata[i] = odata[i - 1] + idata[i - 1];
+    }
 }
 
 /**

From 2da81a4c2b61a9cff9bcc188662810d3a46e8d18 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Thu, 10 Sep 2015 19:20:17 -0400
Subject: [PATCH 02/20] Implement CPU compaction without scan.

---
 stream_compaction/cpu.cu | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 636de70..53aa60d 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,8 +20,15 @@ void scan(int n, int *odata, const int *idata) {
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int j = 0;
+    for (int i = 0; i < n; i++) {
+        if (idata[i] != 0) {
+            odata[j] = idata[i];
+            j++;
+        }
+    }
+
+    return j;
 }
 
 /**

From 49ab51c84a3d578fecb9711bd35fc8caa0abdcf5 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Thu, 10 Sep 2015 20:53:23 -0400
Subject: [PATCH 03/20] Implemented CPU compacting using scan.

---
 stream_compaction/cpu.cu | 32 ++++++++++++++++++++++++++++++--
 1 file changed, 30 insertions(+), 2 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 53aa60d..ed31187 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,4 +1,5 @@
 #include <cstdio>
+#include <cstdlib>
 #include "cpu.h"
 
 namespace StreamCompaction {
@@ -31,14 +32,41 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
     return j;
 }
 
+void zeroArray(int n, int *a) {
+    for (int i = 0; i < n; i++) {
+        a[i] = 0;
+    }
+}
+
 /**
  * CPU stream compaction using scan and scatter, like the parallel version.
  *
  * @returns the number of elements remaining after compaction.
  */
 int compactWithScan(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int *temp = (int*)malloc(n * sizeof(int));
+    zeroArray(n, temp);
+    int *scan_output = (int*)malloc(n * sizeof(int));
+    zeroArray(n, scan_output);
+
+    // Compute temporary array
+    for (int i = 0; i < n; i++) {
+        if (idata[i] != 0) {
+            temp[i] = 1;
+        }
+    }
+
+    // Run exclusive scan on the temporary array
+    scan(n, scan_output, temp);
+
+    // Scatter
+    for (int i = 0; i < n; i++) {
+        if (temp[i] == 1) {
+            odata[scan_output[i]] = idata[i];
+        }
+    }
+
+    return scan_output[n - 1] + 1;
 }
 
 }

From 379d9e2e2da653f5870fd37a5b16f009dd6f48cd Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sat, 12 Sep 2015 14:45:45 -0400
Subject: [PATCH 04/20] Naive scan implemented.
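
Note on the pass structure below: each launch of `kern_scan` performs one
Hillis-Steele doubling pass. As a minimal host-side sketch, not part of this
commit (`naivePass` is a hypothetical helper, useful only for checking a
single pass against the kernel's output), the recurrence is:

```cpp
#include <vector>

// One naive-scan pass at depth d (1-based), mirroring kern_scan:
// out[k] = in[k - 2^(d-1)] + in[k] for k >= 2^(d-1), else out[k] = in[k].
std::vector<int> naivePass(const std::vector<int> &in, int d) {
    std::vector<int> out(in.size());
    int offset = 1 << (d - 1); // integer 2^(d-1); avoids double-precision pow
    for (int k = 0; k < (int)in.size(); k++) {
        out[k] = (k >= offset) ? in[k - offset] + in[k] : in[k];
    }
    return out;
}
```

Running `ilog2ceil(n)` such passes over a zero-padded power-of-two buffer
yields an inclusive scan, which the commit then shifts right by one element to
make exclusive.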
---
 stream_compaction/cpu.cu   |  1 +
 stream_compaction/naive.cu | 50 +++++++++++++++++++++++++++++++++++---
 2 files changed, 48 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index ed31187..6ce08af 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -59,6 +59,7 @@ int compactWithScan(int n, int *odata, const int *idata) {
     // Run exclusive scan on the temporary array
     scan(n, scan_output, temp);
 
+    // TODO: Make separate scatter function
     // Scatter
     for (int i = 0; i < n; i++) {
         if (temp[i] == 1) {

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3d86b60..c919ec3 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -6,15 +6,59 @@ namespace StreamCompaction {
 namespace Naive {
 
-// TODO: __global__
+__global__ void kern_scan(int d, int *idata, int *odata) {
+    int k = threadIdx.x;
+
+    if (k >= (int)pow(2.0, (double)(d - 1))) {
+        odata[k] = idata[k - (int)pow(2.0, (double)(d - 1))] + idata[k];
+    }
+    else {
+        odata[k] = idata[k];
+    }
+}
+
+void padArrayRange(int start, int end, int *a) {
+    for (int i = start; i < end; i++) {
+        a[i] = 0;
+    }
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    int m = pow(2, ilog2ceil(n));
+    int *new_idata = (int*)malloc(m * sizeof(int));
+
+    // Expand array to next power of 2 size
+    for (int i = 0; i < n; i++) {
+        new_idata[i] = idata[i];
+    }
+    padArrayRange(n, m, new_idata);
+
+    int *dev_idata;
+    int *dev_odata;
+
+    cudaMalloc((void**)&dev_idata, m * sizeof(int));
+    cudaMemcpy(dev_idata, new_idata, m * sizeof(int), cudaMemcpyHostToDevice);
+
+    cudaMalloc((void**)&dev_odata, m * sizeof(int));
+
+    // Execute scan on device
+    for (int d = 1; d <= ilog2ceil(n); d++) {
+        kern_scan<<<1, m>>>(d, dev_idata, dev_odata);
+        // Ping-pong the buffers: aliasing them (dev_idata = dev_odata) would
+        // let the next pass read elements it is overwriting, and would free
+        // the same pointer twice below.
+        int *swap = dev_idata;
+        dev_idata = dev_odata;
+        dev_odata = swap;
+    }
+
+    // The inclusive result now lives in dev_idata; shift right by one to make
+    // the scan exclusive, copying only the n - 1 elements that fit in odata.
+    odata[0] = 0;
+    cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
+    free(new_idata);
 }
 
 }
 }

From d7e30f218281f4f6735250604d6a507ec0c91680 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sat, 12 Sep 2015 15:35:04 -0400
Subject: [PATCH 05/20] Work efficient implementation of scan.

---
 stream_compaction/efficient.cu | 57 ++++++++++++++++++++++++++++++--
 1 file changed, 54 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b2f739b..eebfb0e 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,14 +6,65 @@ namespace StreamCompaction {
 namespace Efficient {
 
-// TODO: __global__
+__global__ void up_sweep(int d, int *data) {
+    int k = threadIdx.x;
+    int p2d = pow(2.0, (double)d);
+    int p2da1 = pow(2.0, (double)(d + 1));
+
+    if (k % p2da1 == 0) {
+        data[k + p2da1 - 1] += data[k + p2d - 1];
+    }
+}
+
+__global__ void down_sweep(int d, int *data) {
+    int k = threadIdx.x;
+    int p2d = pow(2.0, (double)d);
+    int p2da1 = pow(2.0, (double)(d + 1));
+
+    if (k % p2da1 == 0) {
+        int temp = data[k + p2d - 1];
+        data[k + p2d - 1] = data[k + p2da1 - 1];
+        data[k + p2da1 - 1] += temp;
+    }
+}
+
+void padArrayRange(int start, int end, int *a) {
+    for (int i = start; i < end; i++) {
+        a[i] = 0;
+    }
+}
 
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
 */
 void scan(int n, int *odata, const int *idata) {
-    // TODO
-    printf("TODO\n");
+    int m = pow(2, ilog2ceil(n));
+    int *new_idata = (int*)malloc(m * sizeof(int));
+
+    // Expand array to next power of 2 size
+    for (int i = 0; i < n; i++) {
+        new_idata[i] = idata[i];
+    }
+    padArrayRange(n, m, new_idata);
+
+    // Can use one array for input and output in this implementation
+    int *dev_data;
+    cudaMalloc((void**)&dev_data, m * sizeof(int));
+    cudaMemcpy(dev_data, new_idata, m * sizeof(int), cudaMemcpyHostToDevice);
+
+    // Execute scan on device
+    for (int d = 0; d < ilog2ceil(n); d++) {
+        up_sweep<<<1, m>>>(d, dev_data);
+    }
+
+    cudaMemset((void*)&dev_data[m - 1], 0, sizeof(int));
+    for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
+        down_sweep<<<1, m>>>(d, dev_data);
+    }
+
+    cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dev_data);
+    free(new_idata);
 }
 
 /**

From bc40c7076fb34eaf2ba9d302a50c7dbeace53eef Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sat, 12 Sep 2015 16:18:56 -0400
Subject: [PATCH 06/20] Implement GPU compaction. Add an optimization to naive
 implementation to only deal with the valid data, not the padded data.

---
 stream_compaction/common.cu    | 14 ++++++-
 stream_compaction/efficient.cu | 70 +++++++++++++++++++++++++---------
 stream_compaction/naive.cu     | 16 ++++----
 3 files changed, 74 insertions(+), 26 deletions(-)

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fe872d4..fc71379 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace Common {
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    // TODO
+    int k = threadIdx.x;
+
+    if (k < n) {
+        bools[k] = !!idata[k];
+    }
 }
 
 /**
@@ -32,7 +36,13 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
  */
 __global__ void kernScatter(int n, int *odata, const int *idata,
         const int *bools, const int *indices) {
-    // TODO
+    int k = threadIdx.x;
+
+    if (k < n) {
+        if (bools[k] == 1) {
+            odata[indices[k]] = idata[k];
+        }
+    }
 }
 
 }

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index eebfb0e..a4d4812 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -6,25 +6,31 @@ namespace StreamCompaction {
 namespace Efficient {
 
-__global__ void up_sweep(int d, int *data) {
+__global__ void up_sweep(int n, int d, int *data) {
     int k = threadIdx.x;
 
-    int p2d = pow(2.0, (double)d);
-    int p2da1 = pow(2.0, (double)(d + 1));
-
-    if (k % p2da1 == 0) {
-        data[k + p2da1 - 1] += data[k + p2d - 1];
-    }
+    if (k < n) {
+        int p2d = pow(2.0, (double)d);
+        int p2da1 = pow(2.0, (double)(d + 1));
+
+        if (k % p2da1 == 0) {
+            data[k + p2da1 - 1] += data[k + p2d - 1];
+        }
+    }
 }
 
-__global__ void down_sweep(int d, int *data) {
+__global__ void down_sweep(int n, int d, int *data) {
     int k = threadIdx.x;
 
-    int p2d = pow(2.0, (double)d);
-    int p2da1 = pow(2.0, (double)(d + 1));
-
-    if (k % p2da1 == 0) {
-        int temp = data[k + p2d - 1];
-        data[k + p2d - 1] = data[k + p2da1 - 1];
-        data[k + p2da1 - 1] += temp;
-    }
+    if (k < n) {
+        int p2d = pow(2.0, (double)d);
+        int p2da1 = pow(2.0, (double)(d + 1));
+
+        if (k % p2da1 == 0) {
+            int temp = data[k + p2d - 1];
+            data[k + p2d - 1] = data[k + p2da1 - 1];
+            data[k + p2da1 - 1] += temp;
+        }
+    }
 }
 
@@ -53,12 +59,12 @@ void scan(int n, int *odata, const int *idata) {
 
     // Execute scan on device
     for (int d = 0; d < ilog2ceil(n); d++) {
-        up_sweep<<<1, m>>>(d, dev_data);
+        up_sweep<<<1, m>>>(n, d, dev_data);
     }
 
     cudaMemset((void*)&dev_data[m - 1], 0, sizeof(int));
     for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
-        down_sweep<<<1, m>>>(d, dev_data);
+        down_sweep<<<1, m>>>(n, d, dev_data);
     }
 
     cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
@@ -77,8 +83,38 @@ void scan(int n, int *odata, const int *idata) {
  * @returns The number of elements remaining after compaction.
  */
 int compact(int n, int *odata, const int *idata) {
-    // TODO
-    return -1;
+    int *bools = (int*)malloc(n * sizeof(int));
+    int *scan_data = (int*)malloc(n * sizeof(int));
+    int num_remaining = -1;
+
+    int *dev_bools;
+    int *dev_idata;
+    int *dev_odata;
+    int *dev_scan_data;
+
+    cudaMalloc((void**)&dev_bools, n * sizeof(int));
+    cudaMalloc((void**)&dev_idata, n * sizeof(int));
+    cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+    cudaMalloc((void**)&dev_odata, n * sizeof(int));
+    cudaMalloc((void**)&dev_scan_data, n * sizeof(int));
+
+    // Map to boolean
+    StreamCompaction::Common::kernMapToBoolean<<<1, n>>>(n, dev_bools, dev_idata);
+
+    cudaMemcpy(bools, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    // Execute the scan
+    scan(n, scan_data, bools);
+    num_remaining = scan_data[n - 1] + bools[n - 1];
+
+    // Execute the scatter
+    cudaMemcpy(dev_scan_data, scan_data, n * sizeof(int), cudaMemcpyHostToDevice);
+    StreamCompaction::Common::kernScatter<<<1, n>>>(n, dev_odata, dev_idata, dev_bools, dev_scan_data);
+
+    cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    return num_remaining;
 }
 
 }

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index c919ec3..d79a41e 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -6,14 +6,16 @@ namespace StreamCompaction {
 namespace Naive {
 
-__global__ void kern_scan(int d, int *idata, int *odata) {
+__global__ void kern_scan(int n, int d, int *idata, int *odata) {
     int k = threadIdx.x;
 
-    if (k >= (int)pow(2.0, (double)(d - 1))) {
-        odata[k] = idata[k - (int)pow(2.0, (double)(d - 1))] + idata[k];
-    }
-    else {
-        odata[k] = idata[k];
+    if (k < n) {
+        if (k >= (int)pow(2.0, (double)(d - 1))) {
+            odata[k] = idata[k - (int)pow(2.0, (double)(d - 1))] + idata[k];
+        }
+        else {
+            odata[k] = idata[k];
+        }
     }
 }
 
@@ -46,7 +48,7 @@ void scan(int n, int *odata, const int *idata) {
 
     // Execute scan on device
     for (int d = 1; d <= ilog2ceil(n); d++) {
-        kern_scan<<<1, m>>>(d, dev_idata, dev_odata);
+        kern_scan<<<1, m>>>(n, d, dev_idata, dev_odata);
         int *swap = dev_idata;
         dev_idata = dev_odata;
         dev_odata = swap;
     }

From 73d141bb4bcb27a0b78c7ea10d86387f8e65ab8b Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sat, 12 Sep 2015 16:33:59 -0400
Subject: [PATCH 07/20] Thrust implementation, added memory cleanup to the
 work efficient implementation of compaction.
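
Note on the Thrust scan below: it round-trips through a host_vector and a
device_vector. As a hedged sketch, not part of this commit (assuming the data
already lives on the device, as it would inside a path tracer), the same call
can run directly on raw device pointers, which keeps the memory copies out of
any timing:

```cpp
#include <thrust/device_ptr.h>
#include <thrust/scan.h>

// Hypothetical variant: exclusive-scan n ints already resident on the GPU.
void scanDevice(int n, int *dev_out, const int *dev_in) {
    thrust::device_ptr<const int> in(dev_in);
    thrust::device_ptr<int> out(dev_out);
    thrust::exclusive_scan(in, in + n, out);
}
```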
---
 stream_compaction/efficient.cu |  7 +++++++
 stream_compaction/thrust.cu    | 13 ++++++++++---
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index a4d4812..45ca8d3 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -114,6 +114,13 @@ int compact(int n, int *odata, const int *idata) {
 
     cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
 
+    cudaFree(dev_bools);
+    cudaFree(dev_idata);
+    cudaFree(dev_odata);
+    cudaFree(dev_scan_data);
+    free(bools);
+    free(scan_data);
+
     return num_remaining;
 }
 
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index d8dbb32..6d16f5f 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -13,9 +13,16 @@ namespace Thrust {
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
  */
 void scan(int n, int *odata, const int *idata) {
-    // TODO use `thrust::exclusive_scan`
-    // example: for device_vectors dv_in and dv_out:
-    // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+    thrust::host_vector<int> hst_in(idata, idata + n);
+    thrust::device_vector<int> dev_in = hst_in;
+    thrust::device_vector<int> dev_out(n);
+
+    thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
+    thrust::host_vector<int> hst_out = dev_out;
+
+    for (int i = 0; i < n; i++) {
+        odata[i] = hst_out[i];
+    }
 }
 
 }

From 4cc2da234bbee7dcca881600b425294b1086c280 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 15:00:12 -0400
Subject: [PATCH 08/20] Update README.md

---
 README.md | 156 +-----------------------------------------------------
 1 file changed, 2 insertions(+), 154 deletions(-)

diff --git a/README.md b/README.md
index a82ea0f..307fd6f 100644
--- a/README.md
+++ b/README.md
@@ -3,160 +3,8 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
-
-### (TODO: Your README)
-
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
-
-Instructions (delete me)
-========================
-
-This is due Sunday, September 13 at midnight.
-
-**Summary:** In this project, you'll implement GPU stream compaction in CUDA,
-from scratch. This algorithm is widely used, and will be important for
-accelerating your path tracer project.
-
-Your stream compaction implementations in this project will simply remove `0`s
-from an array of `int`s. In the path tracer, you will remove terminated paths
-from an array of rays.
-
-In addition to being useful for your path tracer, this project is meant to
-reorient your algorithmic thinking to the way of the GPU. On GPUs, many
-algorithms can benefit from massive parallelism and, in particular, data
-parallelism: executing the same code many times simultaneously with different
-data.
-
-You'll implement a few different versions of the *Scan* (*Prefix Sum*)
-algorithm. First, you'll implement a CPU version of the algorithm to reinforce
-your understanding. Then, you'll write a few GPU implementations: "naive" and
-"work-efficient." Finally, you'll use some of these to implement GPU stream
-compaction.
-
-**Algorithm overview & details:** There are two primary references for details
-on the implementation of scan and stream compaction.
-
-* The [slides on Parallel Algorithms](https://github.com/CIS565-Fall-2015/cis565-fall-2015.github.io/raw/master/lectures/2-Parallel-Algorithms.pptx)
-  for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
-* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](http://http.developer.nvidia.com/GPUGems3/gpugems3_ch39.html).
-
-Your GPU stream compaction implementation will live inside of the
-`stream_compaction` subproject. This way, you will be able to easily copy it
-over for use in your GPU path tracer.
-
-
-## Part 0: The Usual
-
-This project (and all other CUDA projects in this course) requires an NVIDIA
-graphics card with CUDA capability. Any card with Compute Capability 2.0
-(`sm_20`) or greater will work. Check your GPU on this
-[compatibility table](https://developer.nvidia.com/cuda-gpus).
-If you do not have a personal machine with these specs, you may use those
-computers in the Moore 100B/C which have supported GPUs.
-
-**HOWEVER**: If you need to use the lab computer for your development, you will
-not presently be able to do GPU performance profiling. This will be very
-important for debugging performance bottlenecks in your program.
-
-### Useful existing code
-
-* `stream_compaction/common.h`
-  * `checkCUDAError` macro: checks for CUDA errors and exits if there were any.
-  * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer.
-* `main.cpp`
-  * Some testing code for your implementations.
-
-
-## Part 1: CPU Scan & Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/cpu.cu`, implement:
-
-* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum.
-* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using
-  the `scan` function.
-* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan`
-  function. Map the input array to an array of 0s and 1s, scan it, and use
-  scatter to produce the output. You will need a **CPU** scatter implementation
-  for this (see slides or GPU Gems chapter for an explanation).
-
-These implementations should only be a few lines long.
-
-
-## Part 2: Naive GPU Scan Algorithm
-
-In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan`
-
-This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. We haven't yet
-taught shared memory, and you **shouldn't use it yet**. Example 39-1 uses
-shared memory, but is limited to operating on very small arrays! Instead, write
-this using global memory only. As a result of this, you will have to do
-`ilog2ceil(n)` separate kernel invocations.
-
-Beware of errors in Example 39-1 in the book; both the pseudocode and the CUDA
-code in the online version of Chapter 39 are known to have a few small errors
-(in superscripting, missing braces, bad indentation, etc.)
-
-Since the parallel scan algorithm operates on a binary tree structure, it works
-best with arrays with power-of-two length. Make sure your implementation works
-on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory
-- your intermediate array sizes will need to be rounded to the next power of
-two.
-
-
-## Part 3: Work-Efficient GPU Scan & Stream Compaction
-
-### 3.1. Scan
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::scan`
-
-All of the text in Part 2 applies.
-
-* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2.
-* Beware of errors in Example 39-2.
-* Test non-power-of-two sized arrays.
-
-### 3.2. Stream Compaction
-
-This stream compaction method will remove `0`s from an array of `int`s.
-
-In `stream_compaction/efficient.cu`, implement
-`StreamCompaction::Efficient::compact`
-
-For compaction, you will also need to implement the scatter algorithm presented
-in the slides and the GPU Gems chapter.
-
-In `stream_compaction/common.cu`, implement these for use in `compact`:
-
-* `StreamCompaction::Common::kernMapToBoolean`
-* `StreamCompaction::Common::kernScatter`
-
-
-## Part 4: Using Thrust's Implementation
-
-In `stream_compaction/thrust.cu`, implement:
-
-* `StreamCompaction::Thrust::scan`
-
-This should be a very short function which wraps a call to the Thrust library
-function `thrust::exclusive_scan(first, last, result)`.
-
-To measure timing, be sure to exclude memory operations by passing
-`exclusive_scan` a `thrust::device_vector` (which is already allocated on the
-GPU). You can create a `thrust::device_vector` by creating a
-`thrust::host_vector` from the given pointer, then casting it.
-
-
-## Part 5: Radix Sort (Extra Credit) (+10)
-
-Add an additional module to the `stream_compaction` subproject. Implement radix
-sort using one of your scan implementations. Add tests to check its correctness.
-
+* Bradley Crusco
+* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
 
 ## Write-up

From 6df7a99c7129bd637a79c2658e71f71c6c3df0e0 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 15:32:53 -0400
Subject: [PATCH 09/20] Better grid and block sizes than before. Change the
 calculation of the returned count for the CPU compaction with scan.

---
 stream_compaction/common.cu    |  4 ++--
 stream_compaction/common.h     |  1 +
 stream_compaction/cpu.cu       |  2 +-
 stream_compaction/efficient.cu | 16 ++++++++++------
 stream_compaction/naive.cu     |  7 +++++--
 5 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index fc71379..41b537c 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,7 @@ namespace Common {
  * which map to 0 will be removed, and elements which map to 1 will be kept.
  */
 __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-    int k = threadIdx.x;
+    int k = threadIdx.x + (blockIdx.x * blockDim.x);
 
     if (k < n) {
         bools[k] = !!idata[k];
@@ -36,7 +36,7 @@ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
  */
 __global__ void kernScatter(int n, int *odata, const int *idata,
         const int *bools, const int *indices) {
-    int k = threadIdx.x;
+    int k = threadIdx.x + (blockIdx.x * blockDim.x);
 
     if (k < n) {
         if (bools[k] == 1) {

diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 4f52663..5ce9f04 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -6,6 +6,7 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 128
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 6ce08af..6ee93bc 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -67,7 +67,7 @@ int compactWithScan(int n, int *odata, const int *idata) {
         }
     }
 
-    return scan_output[n - 1] + 1;
+    return scan_output[n - 1] + temp[n - 1];
 }
 
 }

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 45ca8d3..ad32251 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -7,7 +7,7 @@ namespace StreamCompaction {
 namespace Efficient {
 
 __global__ void up_sweep(int n, int d, int *data) {
-    int k = threadIdx.x;
+    int k = threadIdx.x + (blockIdx.x * blockDim.x);
 
     if (k < n) {
         int p2d = pow(2.0, (double)d);
@@ -20,7 +20,7 @@ __global__ void up_sweep(int n, int d, int *data) {
 
 __global__ void down_sweep(int n, int d, int *data) {
-    int k = threadIdx.x;
+    int k = threadIdx.x + (blockIdx.x * blockDim.x);
 
     if (k < n) {
         int p2d = pow(2.0, (double)d);
         int p2da1 = pow(2.0, (double)(d + 1));
@@ -45,6 +45,8 @@ void padArrayRange(int start, int end, int *a) {
 void scan(int n, int *odata, const int *idata) {
     int m = pow(2, ilog2ceil(n));
     int *new_idata = (int*)malloc(m * sizeof(int));
+    dim3 fullBlocksPerGrid((m + blockSize - 1) / blockSize);
+    dim3 threadsPerBlock(blockSize);
 
     // Expand array to next power of 2 size
     for (int i = 0; i < n; i++) {
@@ -59,12 +61,12 @@ void scan(int n, int *odata, const int *idata) {
 
     // Execute scan on device
     for (int d = 0; d < ilog2ceil(n); d++) {
-        up_sweep<<<1, m>>>(n, d, dev_data);
+        up_sweep<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_data);
     }
 
     cudaMemset((void*)&dev_data[m - 1], 0, sizeof(int));
     for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
-        down_sweep<<<1, m>>>(n, d, dev_data);
+        down_sweep<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_data);
     }
 
     cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
@@ -86,6 +88,8 @@ int compact(int n, int *odata, const int *idata) {
     int *bools = (int*)malloc(n * sizeof(int));
     int *scan_data = (int*)malloc(n * sizeof(int));
     int num_remaining = -1;
+    dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+    dim3 threadsPerBlock(blockSize);
 
     int *dev_bools;
     int *dev_idata;
@@ -100,7 +104,7 @@ int compact(int n, int *odata, const int *idata) {
     cudaMalloc((void**)&dev_scan_data, n * sizeof(int));
 
     // Map to boolean
-    StreamCompaction::Common::kernMapToBoolean<<<1, n>>>(n, dev_bools, dev_idata);
+    StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_bools, dev_idata);
 
     cudaMemcpy(bools, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
 
@@ -110,7 +114,7 @@ int compact(int n, int *odata, const int *idata) {
 
     // Execute the scatter
     cudaMemcpy(dev_scan_data, scan_data, n * sizeof(int), cudaMemcpyHostToDevice);
-    StreamCompaction::Common::kernScatter<<<1, n>>>(n, dev_odata, dev_idata, dev_bools, dev_scan_data);
+    StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_odata, dev_idata, dev_bools, dev_scan_data);
 
     cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index d79a41e..2a15f2b 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -7,7 +7,7 @@ namespace Naive {
 
 __global__ void kern_scan(int n, int d, int *idata, int *odata) {
-    int k = threadIdx.x;
+    int k = threadIdx.x + (blockIdx.x * blockDim.x);
 
     if (k < n) {
         if (k >= (int)pow(2.0, (double)(d - 1))) {
@@ -31,6 +31,8 @@ void scan(int n, int *odata, const int *idata) {
     int m = pow(2, ilog2ceil(n));
     int *new_idata = (int*)malloc(m * sizeof(int));
+    dim3 fullBlocksPerGrid((m + blockSize - 1) / blockSize);
+    dim3 threadsPerBlock(blockSize);
 
     // Expand array to next power of 2 size
     for (int i = 0; i < n; i++) {
@@ -48,7 +50,8 @@ void scan(int n, int *odata, const int *idata) {
 
     // Execute scan on device
     for (int d = 1; d <= ilog2ceil(n); d++) {
-        kern_scan<<<1, m>>>(n, d, dev_idata, dev_odata);
+
+        kern_scan<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_idata, dev_odata);
         int *swap = dev_idata;
         dev_idata = dev_odata;
         dev_odata = swap;
     }

From 86f1f6f6a961aea835ba44a7240de2e5ba7ac8e4 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 16:05:58 -0400
Subject: [PATCH 10/20] Added timing recording for CUDA execution.

---
 src/main.cpp                   | 11 +++++----
 stream_compaction/efficient.cu | 41 ++++++++++++++++++++++++++++++--
 stream_compaction/efficient.h  |  2 +-
 stream_compaction/naive.cu     | 15 +++++++++++--
 4 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index efc8c06..7671783 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,7 +16,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 8;
+    const int SIZE = 1 << 10;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
@@ -52,16 +52,19 @@ int main(int argc, char* argv[]) {
     StreamCompaction::Naive::scan(NPOT, c, a);
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
-
+
+    float ms_time = 0.0f;
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
+    ms_time = StreamCompaction::Efficient::scan(SIZE, c, a);
+    printf("CUDA execution time for work efficient scan: %.5fms\n", ms_time);
     //printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
+    ms_time = StreamCompaction::Efficient::scan(NPOT, c, a);
+    printf("CUDA execution time for work efficient scan: %.5fms\n", ms_time);
     //printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index ad32251..2cf8008 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -42,12 +42,18 @@ void padArrayRange(int start, int end, int *a) {
 /**
  * Performs prefix-sum (aka scan) on idata, storing the result into odata.
 */
-void scan(int n, int *odata, const int *idata) {
+float scan(int n, int *odata, const int *idata) {
     int m = pow(2, ilog2ceil(n));
     int *new_idata = (int*)malloc(m * sizeof(int));
     dim3 fullBlocksPerGrid((m + blockSize - 1) / blockSize);
     dim3 threadsPerBlock(blockSize);
 
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    float ms_total_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
     // Expand array to next power of 2 size
     for (int i = 0; i < n; i++) {
         new_idata[i] = idata[i];
@@ -60,19 +66,32 @@ void scan(int n, int *odata, const int *idata) {
 
     // Execute scan on device
+    cudaEventRecord(start);
     for (int d = 0; d < ilog2ceil(n); d++) {
         up_sweep<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_data);
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    ms_total_time += ms_time;
+    ms_time = 0.0f;
 
     cudaMemset((void*)&dev_data[m - 1], 0, sizeof(int));
+    cudaEventRecord(start);
     for (int d = ilog2ceil(n) - 1; d >= 0; d--) {
         down_sweep<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_data);
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    ms_total_time += ms_time;
 
     cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
 
     cudaFree(dev_data);
     free(new_idata);
+
+    return ms_total_time;
 }
 
 /**
@@ -91,6 +110,12 @@ int compact(int n, int *odata, const int *idata) {
     dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
     dim3 threadsPerBlock(blockSize);
 
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    float ms_total_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
     int *dev_bools;
     int *dev_idata;
     int *dev_odata;
     int *dev_scan_data;
@@ -104,17 +129,29 @@ int compact(int n, int *odata, const int *idata) {
     cudaMalloc((void**)&dev_scan_data, n * sizeof(int));
 
     // Map to boolean
+    cudaEventRecord(start);
     StreamCompaction::Common::kernMapToBoolean<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_bools, dev_idata);
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    ms_total_time += ms_time;
+    ms_time = 0.0f;
 
     cudaMemcpy(bools, dev_bools, n * sizeof(int), cudaMemcpyDeviceToHost);
 
     // Execute the scan
-    scan(n, scan_data, bools);
+    ms_total_time += scan(n, scan_data, bools);
     num_remaining = scan_data[n - 1] + bools[n - 1];
 
     // Execute the scatter
     cudaMemcpy(dev_scan_data, scan_data, n * sizeof(int), cudaMemcpyHostToDevice);
+    cudaEventRecord(start);
     StreamCompaction::Common::kernScatter<<<fullBlocksPerGrid, threadsPerBlock>>>(n, dev_odata, dev_idata, dev_bools, dev_scan_data);
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    ms_total_time += ms_time;
+    printf("CUDA execution time for stream compaction: %.5fms\n", ms_total_time);
 
     cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
 
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 395ba10..57afdf6 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -2,7 +2,7 @@
 
 namespace StreamCompaction {
 namespace Efficient {
-    void scan(int n, int *odata, const int *idata);
+    float scan(int n, int *odata, const int *idata);
 
     int compact(int n, int *odata, const int *idata);
 }

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 2a15f2b..38bacab 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -34,6 +34,11 @@ void scan(int n, int *odata, const int *idata) {
     dim3 fullBlocksPerGrid((m + blockSize - 1) / blockSize);
     dim3 threadsPerBlock(blockSize);
 
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
     // Expand array to next power of 2 size
     for (int i = 0; i < n; i++) {
         new_idata[i] = idata[i];
@@ -47,13 +52,19 @@ void scan(int n, int *odata, const int *idata) {
     cudaMemcpy(dev_idata, new_idata, m * sizeof(int), cudaMemcpyHostToDevice);
 
     cudaMalloc((void**)&dev_odata, m * sizeof(int));
-
+
     // Execute scan on device
+    cudaEventRecord(start);
     for (int d = 1; d <= ilog2ceil(n); d++) {
-
         kern_scan<<<fullBlocksPerGrid, threadsPerBlock>>>(n, d, dev_idata, dev_odata);
         int *swap = dev_idata;
         dev_idata = dev_odata;
         dev_odata = swap;
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+
+    cudaEventElapsedTime(&ms_time, start, stop);
+    printf("CUDA execution time for naive scan: %.5fms\n", ms_time);
 
     odata[0] = 0;
     cudaMemcpy(odata + 1, dev_idata, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);

From b85f3d505f1f16d557f57e708e50ac813aae502d Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 16:10:35 -0400
Subject: [PATCH 11/20] Increased block size.

---
 src/main.cpp               | 2 +-
 stream_compaction/common.h | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 7671783..83cfd6d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -16,7 +16,7 @@
 #include "testing_helpers.hpp"
 
 int main(int argc, char* argv[]) {
-    const int SIZE = 1 << 10;
+    const int SIZE = 1 << 8;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
 
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index 5ce9f04..23aafae 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -6,7 +6,7 @@
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
-#define blockSize 128
+#define blockSize 512
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
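
Note ahead of the next patch: the CPU timings added there reuse CUDA events,
which sample the GPU's timeline rather than the host clock. A hedged host-side
alternative (a sketch, not what the commit does; `timeHostMs` is a hypothetical
helper) is std::chrono:

```cpp
#include <chrono>

// Times a host-side callable and returns the elapsed milliseconds.
template <typename F>
float timeHostMs(F &&work) {
    auto t0 = std::chrono::high_resolution_clock::now();
    work(); // run the host-side code being measured
    auto t1 = std::chrono::high_resolution_clock::now();
    return std::chrono::duration<float, std::milli>(t1 - t0).count();
}
```

At the sub-0.01 ms scales the CPU scan reports later in this series, either
timer is near its resolution limit, so those absolute numbers should be read
cautiously.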
From 1c8ae8523759b1fc047e5f23b4c94927f4d9e25c Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 16:48:06 -0400
Subject: [PATCH 12/20] Added logging for CPU

---
 src/main.cpp             |  8 +++++---
 stream_compaction/cpu.cu | 42 +++++++++++++++++++++++++++++++++++---
 stream_compaction/cpu.h  |  2 +-
 3 files changed, 45 insertions(+), 7 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 83cfd6d..395bcbb 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -19,6 +19,7 @@ int main(int argc, char* argv[]) {
     const int SIZE = 1 << 8;
     const int NPOT = SIZE - 3;
     int a[SIZE], b[SIZE], c[SIZE];
+    float ms_time = 0.0f;
 
     // Scan tests
 
@@ -32,12 +33,14 @@ int main(int argc, char* argv[]) {
     zeroArray(SIZE, b);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
+    ms_time = StreamCompaction::CPU::scan(SIZE, b, a);
+    printf("CPU execution time for scan: %.5fms\n", ms_time);
     printArray(SIZE, b, true);
 
     zeroArray(SIZE, c);
     printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
+    ms_time = StreamCompaction::CPU::scan(NPOT, c, a);
+    printf("CPU execution time for scan: %.5fms\n", ms_time);
     printArray(NPOT, b, true);
     printCmpResult(NPOT, b, c);
 
@@ -53,7 +56,6 @@ int main(int argc, char* argv[]) {
     //printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
-    float ms_time = 0.0f;
     zeroArray(SIZE, c);
 
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 6ee93bc..a94d063 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,6 +1,9 @@
 #include <cstdio>
 #include <cstdlib>
 #include "cpu.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
 
 namespace StreamCompaction {
 namespace CPU {
 
 /**
  * CPU scan (prefix sum).
  */
-void scan(int n, int *odata, const int *idata) {
+float scan(int n, int *odata, const int *idata) {
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
     odata[0] = 0;
     for (int i = 1; i < n; i++) {
         odata[i] = odata[i - 1] + idata[i - 1];
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    return ms_time;
 }
 
 /**
  * CPU stream compaction without using the scan function.
  *
  * @returns the number of elements remaining after compaction.
  */
 int compactWithoutScan(int n, int *odata, const int *idata) {
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
+    cudaEventRecord(start);
     int j = 0;
     for (int i = 0; i < n; i++) {
         if (idata[i] != 0) {
             odata[j] = idata[i];
             j++;
         }
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    printf("CPU execution time for compact without scan: %.5fms\n", ms_time);
 
     return j;
 }
 
@@ -49,6 +72,12 @@ int compactWithScan(int n, int *odata, const int *idata) {
     int *scan_output = (int*)malloc(n * sizeof(int));
     zeroArray(n, scan_output);
 
+    cudaEvent_t start, stop;
+    float ms_time = 0.0f;
+    float ms_total_time = 0.0f;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+
     // Compute temporary array
     for (int i = 0; i < n; i++) {
         if (idata[i] != 0) {
@@ -57,15 +86,22 @@ int compactWithScan(int n, int *odata, const int *idata) {
         }
     }
 
     // Run exclusive scan on the temporary array
-    scan(n, scan_output, temp);
+    ms_time = scan(n, scan_output, temp);
+    ms_total_time += ms_time;
+    ms_time = 0.0f;
 
-    // TODO: Make separate scatter function
     // Scatter
+    cudaEventRecord(start);
     for (int i = 0; i < n; i++) {
         if (temp[i] == 1) {
             odata[scan_output[i]] = idata[i];
         }
     }
+    cudaEventRecord(stop);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&ms_time, start, stop);
+    ms_total_time += ms_time;
+    printf("CPU execution time for compact with scan: %.5fms\n", ms_total_time);
 
+    free(temp);
+    free(scan_output);
     return scan_output[n - 1] + temp[n - 1];
 }
 
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 6348bf3..8f32b0d 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -2,7 +2,7 @@
 
 namespace StreamCompaction {
 namespace CPU {
-    void scan(int n, int *odata, const int *idata);
+    float scan(int n, int *odata, const int *idata);
 
     int compactWithoutScan(int n, int *odata, const int *idata);

From 863b67d723471bb1a5327eb6e21a3f78f61bcfff Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 17:46:38 -0400
Subject: [PATCH 13/20] Add analysis chart.
---
 images/Project 2 Analysis.png | Bin 0 -> 31901 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 images/Project 2 Analysis.png

diff --git a/images/Project 2 Analysis.png b/images/Project 2 Analysis.png
new file mode 100644
index 0000000000000000000000000000000000000000..5b27cf41a3d21a42094ad35ca9f28de7dfb4a9df
GIT binary patch
literal 31901
[31901 bytes of binary PNG data omitted: the performance-analysis chart added as images/Project 2 Analysis.png]

literal 0
HcmV?d00001

From: Bradley Crusco
Date: Sun, 13 Sep 2015 17:59:02 -0400
Subject: [PATCH 14/20] Update README.md

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 307fd6f..793081d 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,8 @@ CUDA Stream Compaction
 * Bradley Crusco
 * Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
 
+![](images/Project 2 Analysis.png "Performance Analysis")
+
 ## Write-up
 
 1. Update all of the TODOs at the top of this README.
From d327c874d694846a80ce3d11944fb92bf40dd37c Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 17:59:56 -0400
Subject: [PATCH 15/20] Update README.md

---
 README.md | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 53 insertions(+)

diff --git a/README.md b/README.md
index 793081d..e548f24 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,59 @@ CUDA Stream Compaction
 
 ![](images/Project 2 Analysis.png "Performance Analysis")
 
+```
+****************
+** SCAN TESTS **
+****************
+    [   3  29  33  19   0  16  10  40  39  50  44  30   9 ...   4   0 ]
+==== cpu scan, power-of-two ====
+CPU execution time for scan: 0.00109ms
+    [   0   3  32  65  84  84 100 110 150 189 239 283 313 ... 6684 6688 ]
+==== cpu scan, non-power-of-two ====
+CPU execution time for scan: 0.00106ms
+    [   0   3  32  65  84  84 100 110 150 189 239 283 313 ... 6613 6626 ]
+    passed
+==== naive scan, power-of-two ====
+CUDA execution time for naive scan: 0.07440ms
+    passed
+==== naive scan, non-power-of-two ====
+CUDA execution time for naive scan: 0.07222ms
+    passed
+==== work-efficient scan, power-of-two ====
+CUDA execution time for work efficient scan: 0.21798ms
+    passed
+==== work-efficient scan, non-power-of-two ====
+CUDA execution time for work efficient scan: 0.21632ms
+    passed
+==== thrust scan, power-of-two ====
+    passed
+==== thrust scan, non-power-of-two ====
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   4   3   0   3   4   2   3   2   3   1   1   1   4 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+CPU execution time for compact without scan: 0.00106ms
+    [   4   3   3   4   2   3   2   3   1   1   1   4   3 ...   3   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+CPU execution time for compact without scan: 0.00106ms
+    [   4   3   3   4   2   3   2   3   1   1   1   4   3 ...   4   4 ]
+    passed
+==== cpu compact with scan ====
+CPU execution time for compact with scan: 0.00109ms
+    [   4   3   3   4   2   3   2   3   1   1   1   4   3 ...   3   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+CUDA execution time for stream compaction: 0.22755ms
+    passed
+==== work-efficient compact, non-power-of-two ====
+CUDA execution time for stream compaction: 0.22557ms
+    passed
+```
+
 ## Write-up
 
 1. Update all of the TODOs at the top of this README.

From 2994521f6b877e83445b14dabb2db62e3558e43b Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 18:07:16 -0400
Subject: [PATCH 16/20] Update README.md

---
 README.md | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/README.md b/README.md
index e548f24..94becc5 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,9 @@ CUDA Stream Compaction
 * Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
 
 ![](images/Project 2 Analysis.png "Performance Analysis")
+Unfortunately, the results from testing are not very impressive. The sequential CPU implementation easily outperforms everything but the Thrust implementation, and the worst performer by far is the work-efficient implementation, which we'd expect to outperform the naive scan. So why is this? I am not 100% sure, but I had difficulty determining how to configure the grid and block sizes optimally, and as a result all the GPU implementations use the same ratio, with 512 threads per block. A better understanding of how to configure this might bring performance more in line with what we'd expect to see.
+
+The other possible cause is that our arrays are not very large; the largest array I tested had 1024 elements. That may not have been enough data for the GPU to make up for the overhead of the parallel algorithm versus the sequential one.
 
 ```
 ****************

From 49105a09e414b5ec91267daf8ffc84a4fc054b1d Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 18:11:43 -0400
Subject: [PATCH 17/20] Update README.md

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 94becc5..77f3616 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,9 @@ CUDA Stream Compaction
 ![](images/Project 2 Analysis.png "Performance Analysis")
 Unfortunately, the results from testing are not very impressive. The sequential CPU implementation easily outperforms everything but the Thrust implementation, and the worst performer by far is the work-efficient implementation, which we'd expect to outperform the naive scan. So why is this? I am not 100% sure, but I had difficulty determining how to configure the grid and block sizes optimally, and as a result all the GPU implementations use the same ratio, with 512 threads per block. A better understanding of how to configure this might bring performance more in line with what we'd expect to see.
 
-The other possible cause is that our arrays are not very large; the largest array I tested had 1024 elements. That may not have been enough data for the GPU to make up for the overhead of the parallel algorithm versus the sequential one.
+The other possible cause is that our arrays are not very large; the largest array I tested had 1024 elements. That may not have been enough data for the GPU to make up for the overhead of the parallel algorithm versus the sequential one. The parallel implementations may ultimately be bottlenecked by memory I/O.
 
+### Test Program Output (Array Size 256)
 ```
 ****************
 ** SCAN TESTS **

From 6a1fa596a39519db50d26b553baeec1f27b84c68 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 18:12:12 -0400
Subject: [PATCH 18/20] Update README.md

---
 README.md | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 77f3616..29b8fa6 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,9 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
 * Bradley Crusco
-* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
+* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
+
+### Performance Analysis
 
 ![](images/Project 2 Analysis.png "Performance Analysis")
 Unfortunately, the results from testing are not very impressive. The sequential CPU implementation easily outperforms everything but the Thrust implementation, and the worst performer by far is the work-efficient implementation, which we'd expect to outperform the naive scan. So why is this? I am not 100% sure, but I had difficulty determining how to configure the grid and block sizes optimally, and as a result all the GPU implementations use the same ratio, with 512 threads per block. A better understanding of how to configure this might bring performance more in line with what we'd expect to see.
@@ -12,6 +14,7 @@
 The other possible cause is that our arrays are not very large; the largest array I tested had 1024 elements. That may not have been enough data for the GPU to make up for the overhead of the parallel algorithm versus the sequential one. The parallel implementations may ultimately be bottlenecked by memory I/O.
 
 ### Test Program Output (Array Size 256)
+
 ```
 ****************
 ** SCAN TESTS **
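The grid- and block-size difficulty described in the analysis above is one place the CUDA occupancy API could help: instead of a fixed 512 threads per block, the runtime can suggest a block size per kernel. The sketch below is an illustration under stated assumptions, not code from these patches; `kernExample` and `launchWithSuggestedBlockSize` are hypothetical, and the kernel is written against a global thread index so that a multi-block launch is valid:

```
#include <cuda_runtime.h>

// Hypothetical stand-in kernel: derives a global index from block and thread
// IDs, so the launch can span however many blocks the occupancy API suggests.
__global__ void kernExample(int n, int *data) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        data[i] += 1;
    }
}

// Sketch: ask CUDA for a block size instead of hard-coding 512 threads.
void launchWithSuggestedBlockSize(int n, int *dev_data) {
    int minGridSize = 0, blockSize = 0;
    cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, kernExample, 0, 0);
    int gridSize = (n + blockSize - 1) / blockSize;  // enough blocks to cover n
    kernExample<<<gridSize, blockSize>>>(n, dev_data);
}
```

A kernel that takes its index from `threadIdx.x` alone would need the global-index change first, since any block size below the array length would otherwise leave elements unprocessed.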
From 01a0ed8d8484ee82f42b6f0d6db982c9948cd9b4 Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 18:15:27 -0400
Subject: [PATCH 19/20] Update README.md

---
 README.md | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 29b8fa6..dc1777c 100644
--- a/README.md
+++ b/README.md
@@ -4,10 +4,15 @@ CUDA Stream Compaction
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
 * Bradley Crusco
-* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
+* Tested on: Windows 10, i7-3770K @ 3.50GHz 16GB, 2 x GTX 980 4096MB (Personal Computer)
+
+### Description
+
+
 
 ### Performance Analysis
 
+**Scan Implementation Execution Time vs. Array Size**
 ![](images/Project 2 Analysis.png "Performance Analysis")
 Unfortunately, the results from testing are not very impressive. The sequential CPU implementation easily outperforms everything but the Thrust implementation, and the worst performer by far is the work-efficient implementation, which we'd expect to outperform the naive scan. So why is this? I am not 100% sure, but I had difficulty determining how to configure the grid and block sizes optimally, and as a result all the GPU implementations use the same ratio, with 512 threads per block. A better understanding of how to configure this might bring performance more in line with what we'd expect to see.
 

From 00baf138f16c4b257937a9c125580d7848d1128d Mon Sep 17 00:00:00 2001
From: Bradley Crusco
Date: Sun, 13 Sep 2015 18:21:46 -0400
Subject: [PATCH 20/20] Update README.md

---
 README.md | 64 +++++++------------------------------------------------
 1 file changed, 8 insertions(+), 56 deletions(-)

diff --git a/README.md b/README.md
index dc1777c..d737f6a 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,14 @@ CUDA Stream Compaction
 
 ### Description
 
-
+This project implements a series of scan and stream compaction algorithms.
+Features:
+ * CPU Exclusive Prefix Sum Scan
+ * CPU Stream Compaction Without Scan
+ * CPU Stream Compaction Using Exclusive Prefix Sum Scan
+ * Naive GPU Exclusive Prefix Sum Scan
+ * Work-Efficient GPU Exclusive Prefix Sum Scan
+ * GPU Stream Compaction Using Work-Efficient GPU Exclusive Prefix Sum Scan
 
 ### Performance Analysis
@@ -71,58 +78,3 @@ CUDA execution time for stream compaction: 0.22755ms
 ==== work-efficient compact, non-power-of-two ====
 CUDA execution time for stream compaction: 0.22557ms
     passed
-```
-
-## Write-up
-
-1. Update all of the TODOs at the top of this README.
-2. Add a description of this project including a list of its features.
-3. Add your performance analysis (see below).
-
-All extra credit features must be documented in your README, explaining their
-value (with performance comparison, if applicable!) and showing an example of
-how it works. For radix sort, show how it is called and an example of its output.
-
-Always profile with Release mode builds and run without debugging.
-
-### Questions
-
-* Roughly optimize the block sizes of each of your implementations for minimal
-  run time on your GPU.
-  * (You shouldn't compare unoptimized implementations to each other!)
-
-* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
-  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
-  (with array size on the independent axis).
-  * You should use CUDA events for timing. Be sure **not** to include any
-    explicit memory operations in your performance measurements, for
-    comparability.
-  * To guess at what might be happening inside the Thrust implementation, take
-    a look at the Nsight timeline for its execution.
-
-* Write a brief explanation of the phenomena you see here.
-  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
-    it different for each implementation?
-
-* Paste the output of the test program into a triple-backtick block in your
-  README.
-  * If you add your own tests (e.g. for radix sort or to test additional corner
-    cases), be sure to mention it explicitly.
-
-These questions should help guide you in performance analysis on future
-assignments, as well.
-
-## Submit
-
-If you have modified any of the `CMakeLists.txt` files at all (aside from the
-list of `SOURCE_FILES`), you must test that your project can build in Moore
-100B/C. Beware of any build issues discussed on the Google Group.
-
-1. Open a GitHub pull request so that we can see that you have finished.
-   The title should be "Submission: YOUR NAME".
-2. Send an email to the TA (gmail: kainino1+cis565@) with:
-   * **Subject**: in the form of `[CIS565] Project 2: PENNKEY`
-   * Direct link to your pull request on GitHub
-   * In the form of a grade (0-100+) with comments, evaluate your own
-     performance on the project.
-   * Feedback on the project itself, if any.
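Finally, for comparison with the "thrust scan" tests in the output above, an exclusive scan through Thrust can be sketched as below. This is an illustrative guess at such a wrapper, assuming a `thrustScan` entry point; the repository's actual Thrust code may be structured differently:

```
#include <thrust/copy.h>
#include <thrust/device_vector.h>
#include <thrust/scan.h>

// Illustrative sketch of an exclusive prefix sum via Thrust, approximating
// what the "thrust scan" tests exercise; the repository's wrapper may differ.
void thrustScan(int n, int *odata, const int *idata) {
    thrust::device_vector<int> dev_in(idata, idata + n);  // host -> device copy
    thrust::device_vector<int> dev_out(n);

    thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());

    thrust::copy(dev_out.begin(), dev_out.end(), odata);  // device -> host copy
}
```

Note that the host-to-device and device-to-host copies here fall outside what a fair kernel-only timing should measure, which is consistent with the write-up's advice to exclude explicit memory operations from the measurements.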