Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Project 2 Yaoyi Bai #6

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 0 additions & 13 deletions README.md

This file was deleted.

2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,5 +13,5 @@ set(SOURCE_FILES

# Build the stream_compaction library from SOURCE_FILES.
# Exactly one -arch flag is passed; listing both sm_20 and sm_30 would hand
# nvcc the same option twice.
cuda_add_library(stream_compaction
    ${SOURCE_FILES}
    OPTIONS -arch=sm_30
    )
39 changes: 0 additions & 39 deletions stream_compaction/common.cu

This file was deleted.

7 changes: 7 additions & 0 deletions stream_compaction/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,14 @@
#include <cstdio>
#include <cstring>
#include <cmath>
#include <stdio.h>
#include <time.h>

#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)

#define BLOCK_SIZE 16

/**
* Check for CUDA errors; print and exit if there was a problem.
*/
Expand All @@ -31,5 +35,8 @@ namespace Common {

__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices);

__global__ void inclusiveToExclusive(int n, int *idata, int *odata);

}
}
58 changes: 54 additions & 4 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#include <cstdio>
#include <cstdio>
#include "cpu.h"

namespace StreamCompaction {
Expand All @@ -9,7 +9,14 @@ namespace CPU {
*/
void scan(int n, int *odata, const int *idata) {
    // Exclusive prefix sum computed sequentially on the host:
    //   odata[0] = 0, odata[i] = idata[0] + ... + idata[i - 1].
    // n     - number of elements in idata and odata
    // odata - output buffer, must hold at least n ints
    // idata - input buffer, must hold at least n ints

    // An empty range has no odata[0] to write, so leave the buffer untouched.
    if (n <= 0) {
        return;
    }

    odata[0] = 0;
    printf("The output array is:\n");
    for (int i = 1; i < n; i++) {
        // Each output extends the previous partial sum by one more input.
        odata[i] = odata[i - 1] + idata[i - 1];
        printf("%5d", odata[i]);
    }
    printf("\n");
}

/**
Expand All @@ -19,7 +26,19 @@ void scan(int n, int *odata, const int *idata) {
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
    // Copies every non-zero element of idata to the front of odata, keeping
    // the original order, and reports the elapsed CPU time.
    // n     - number of elements in idata
    // odata - output buffer, must hold at least as many ints as idata has
    //         non-zero elements (n is always enough)
    // idata - input buffer of n ints
    // Returns the number of elements written to odata.
    clock_t start = clock();

    int kept = 0;
    // Visit all n elements: stopping at n - 1 would silently drop the last one.
    for (int i = 0; i < n; i++) {
        if (idata[i] != 0) {
            odata[kept] = idata[i];
            kept++;
        }
    }

    clock_t end = clock();
    printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
    return kept;
}

/**
Expand All @@ -29,7 +48,38 @@ int compactWithoutScan(int n, int *odata, const int *idata) {
*/
int compactWithScan(int n, int *odata, const int *idata) {
    // Stream compaction in three steps: map each element to a keep flag,
    // exclusive-scan the flags to get each kept element's destination index,
    // then scatter the kept elements to those indices.
    // n     - number of elements in idata
    // odata - output buffer, must hold at least as many ints as idata has
    //         non-zero elements (n is always enough)
    // idata - input buffer of n ints
    // Returns the number of elements written to odata.
    if (n <= 0) {
        return 0;
    }

    clock_t start = clock();

    // Scratch buffers are owned here; they must not alias memory past the end
    // of odata, which only has to hold the compacted result.
    int *keepFlags = new int[n];
    int *targetIndex = new int[n];

    // Map: 1 for elements that survive, 0 for elements that are dropped, so
    // the scanned flags count the survivors that precede each position.
    for (int i = 0; i < n; i++) {
        keepFlags[i] = (idata[i] != 0) ? 1 : 0;
    }

    scan(n, targetIndex, keepFlags);

    // Scatter: every element, including the last, is examined.
    int kept = 0;
    for (int i = 0; i < n; i++) {
        if (keepFlags[i] != 0) {
            odata[targetIndex[i]] = idata[i];
            printf("%5d", odata[targetIndex[i]]);
            kept++;
        }
    }
    printf("\n");

    delete[] keepFlags;
    delete[] targetIndex;

    clock_t end = clock();
    printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
    return kept;
}

}
Expand Down
61 changes: 58 additions & 3 deletions stream_compaction/efficient.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,57 @@
namespace StreamCompaction {
namespace Efficient {

// TODO: __global__
// TODO:
/**
 * Work-efficient exclusive scan of one block's worth of data: an up-sweep
 * (reduction) followed by a down-sweep over a tree stored in shared memory.
 *
 * Launch requirements:
 *  - a single block with at least n / 2 threads (each thread loads and
 *    stores two elements);
 *  - n * sizeof(int) bytes of dynamic shared memory;
 *  - n must be a power of two (pad the input with zeros otherwise).
 *
 * @param n      number of elements in idata and odata
 * @param idata  device input buffer of n ints
 * @param odata  device output buffer of n ints
 */
__global__ void EfficientSorting(int n, int *idata, int *odata)
{
    extern __shared__ int temp[];
    int tid = threadIdx.x;
    int offset = 1;

    // Uniform across the block, so returning before the barriers is safe.
    if (n <= 0)
    {
        return;
    }

    // Stage the input in shared memory; guard threads beyond the data.
    if (2 * tid < n)
    {
        temp[2 * tid] = idata[2 * tid];
    }
    if (2 * tid + 1 < n)
    {
        temp[2 * tid + 1] = idata[2 * tid + 1];
    }

    // Up-sweep: build partial sums in place, halving the active threads and
    // doubling the stride at every tree level.
    for (int active = n >> 1; active > 0; active >>= 1)
    {
        // Barrier is outside the branch so every thread reaches it.
        __syncthreads();
        if (tid < active)
        {
            int left = offset * (2 * tid + 1) - 1;
            int right = offset * (2 * tid + 2) - 1;
            temp[right] += temp[left];
        }
        offset *= 2;
    }

    // Clear the root so the down-sweep produces an exclusive scan.
    if (tid == 0)
    {
        temp[n - 1] = 0;
    }

    // Down-sweep: walk back down the tree, swapping and accumulating.
    for (int active = 1; active < n; active *= 2)
    {
        offset >>= 1;
        __syncthreads();
        if (tid < active)
        {
            int left = offset * (2 * tid + 1) - 1;
            int right = offset * (2 * tid + 2) - 1;
            int saved = temp[left];
            temp[left] = temp[right];
            temp[right] += saved;
        }
    }

    // Make the final tree level visible before copying results out.
    __syncthreads();
    if (2 * tid < n)
    {
        odata[2 * tid] = temp[2 * tid];
    }
    if (2 * tid + 1 < n)
    {
        odata[2 * tid + 1] = temp[2 * tid + 1];
    }
}


/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
    // Exclusive prefix sum computed sequentially on the host (this function
    // does not launch a kernel):
    //   odata[0] = 0, odata[i] = idata[0] + ... + idata[i - 1].
    // n     - number of elements in idata and odata
    // odata - host output buffer, must hold at least n ints
    // idata - host input buffer of n ints

    // An empty range has no odata[0] to write.
    if (n <= 0) {
        return;
    }

    odata[0] = 0;
    for (int i = 0; i < n - 1; i++) {
        odata[i + 1] = odata[i] + idata[i];
        // Print the value itself; passing its address to %d is undefined.
        printf("%5d", odata[i]);
    }
}

/**
Expand All @@ -27,7 +70,19 @@ void scan(int n, int *odata, const int *idata) {
*/
int compact(int n, int *odata, const int *idata) {
    // Removes the zero elements of idata, writing the survivors to odata in
    // their original order, and reports the elapsed CPU time.
    //
    // This runs on the host. idata and odata are host buffers, so they cannot
    // be handed to a kernel directly; a device path would first have to copy
    // them to device memory.
    //
    // n     - number of elements in idata
    // odata - host output buffer, must hold at least as many ints as idata
    //         has non-zero elements (n is always enough)
    // idata - host input buffer of n ints
    // Returns the number of elements written to odata.
    clock_t start = clock();

    // writeIndex is the exclusive scan of the keep flags evaluated on the
    // fly: at element i it equals the number of survivors before i, which is
    // exactly that element's destination slot.
    int writeIndex = 0;
    for (int i = 0; i < n; i++) {
        if (idata[i] != 0) {
            odata[writeIndex] = idata[i];
            writeIndex++;
        }
    }

    clock_t end = clock();
    printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
    return writeIndex;
}

}
Expand Down
52 changes: 50 additions & 2 deletions stream_compaction/naive.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,61 @@ namespace StreamCompaction {
namespace Naive {

// TODO: __global__
/**
 * One pass of the naive parallel inclusive scan.
 *
 * Launch with a 1D grid of 1D blocks covering at least n threads; surplus
 * threads exit immediately. idata and odata must be distinct device buffers
 * because every thread reads a neighbour's element from idata.
 *
 * @param n      number of elements
 * @param odata  device output buffer for this pass
 * @param idata  device input buffer (result of the previous pass)
 * @param step   distance to the partner element for this pass
 */
__global__ void NaiveGPUScan(int n, int *odata, const int *idata, int step)
{
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    if (index >= n)
    {
        return;
    }

    if (index >= step)
    {
        odata[index] = idata[index - step] + idata[index];
    }
    else
    {
        // Elements with no partner are already final; carry them over so the
        // output buffer is complete when the buffers are swapped.
        odata[index] = idata[index];
    }
}

/**
 * Performs prefix-sum (aka scan) on idata, storing the result into odata.
 *
 * Runs ilog2ceil(n) passes of NaiveGPUScan on two ping-pong device buffers to
 * build an inclusive scan, then produces the exclusive result by copying the
 * inclusive values back shifted right by one element with odata[0] = 0.
 *
 * @param n      number of elements; nothing is done when n <= 0
 * @param odata  host output buffer of n ints
 * @param idata  host input buffer of n ints
 */
void scan(int n, int *odata, const int *idata) {
    if (n <= 0) {
        return;
    }

    clock_t start = clock();

    const size_t bytes = (size_t)n * sizeof(int);
    int *devIn = NULL;
    int *devOut = NULL;

    if (cudaMalloc((void**)&devIn, bytes) != cudaSuccess ||
        cudaMalloc((void**)&devOut, bytes) != cudaSuccess) {
        fprintf(stderr, "Naive::scan: cudaMalloc failed: %s\n",
                cudaGetErrorString(cudaGetLastError()));
        cudaFree(devIn);
        cudaFree(devOut);
        return;
    }

    cudaError_t err = cudaMemcpy(devIn, idata, bytes, cudaMemcpyHostToDevice);

    // Enough blocks to cover n threads, rounding up.
    const int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
    const int passes = ilog2ceil(n);

    for (int pass = 1; err == cudaSuccess && pass <= passes; ++pass) {
        // 2^(pass-1) as a shift; '^' is bitwise XOR in C++.
        const int step = 1 << (pass - 1);

        NaiveGPUScan<<<gridSize, BLOCK_SIZE>>>(n, devOut, devIn, step);
        err = cudaGetLastError();

        // This pass's output is the next pass's input.
        int *swapTmp = devIn;
        devIn = devOut;
        devOut = swapTmp;
    }

    // devIn now holds the inclusive scan. Shifting it by one element during
    // the copy-back yields the exclusive scan; the blocking copy also waits
    // for the kernels to finish.
    if (err == cudaSuccess) {
        odata[0] = 0;
        if (n > 1) {
            err = cudaMemcpy(odata + 1, devIn, (size_t)(n - 1) * sizeof(int),
                             cudaMemcpyDeviceToHost);
        }
    }

    if (err != cudaSuccess) {
        fprintf(stderr, "Naive::scan: CUDA error: %s\n", cudaGetErrorString(err));
    }

    clock_t end = clock();
    printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
    cudaFree(devIn);
    cudaFree(devOut);
}

}
Expand Down
13 changes: 13 additions & 0 deletions stream_compaction/thrust.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,19 @@ void scan(int n, int *odata, const int *idata) {
// TODO use `thrust::exclusive_scan`
// example: for device_vectors dv_in and dv_out:
// thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());

time_t start = clock();

thrust::device_vector<int> dev_idata(idata, idata + n);
thrust::device_vector<int> dev_odata(odata, odata + n);

thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin());

thrust::host_vector<int> host_odata = dev_odata;
cudaMemcpy(odata, host_odata.data(), n * sizeof(int), cudaMemcpyHostToHost);

time_t end = clock();
printf("The running time is: %f ms. \n", double(end-start)*1000/CLOCKS_PER_SEC);
}

}
Expand Down