diff --git a/.README.md.un~ b/.README.md.un~ new file mode 100644 index 0000000..1adcd84 Binary files /dev/null and b/.README.md.un~ differ diff --git a/.vs/Project2-Number-Algorithms/v15/.suo b/.vs/Project2-Number-Algorithms/v15/.suo new file mode 100644 index 0000000..c9fddab Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/.suo differ diff --git a/.vs/Project2-Number-Algorithms/v15/Browse.VC.db b/.vs/Project2-Number-Algorithms/v15/Browse.VC.db new file mode 100644 index 0000000..f530e72 Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/Browse.VC.db differ diff --git a/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch new file mode 100644 index 0000000..91bf6ae Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/69bca779ad2e221c/MAIN.ipch differ diff --git a/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch new file mode 100644 index 0000000..428bfa1 Binary files /dev/null and b/.vs/Project2-Number-Algorithms/v15/ipch/AutoPCH/f38087074acb9f8b/CPU.ipch differ diff --git a/.vs/ProjectSettings.json b/.vs/ProjectSettings.json new file mode 100644 index 0000000..2b7b33d --- /dev/null +++ b/.vs/ProjectSettings.json @@ -0,0 +1,3 @@ +{ + "CurrentProjectSetting": "x64-Debug (default)" +} \ No newline at end of file diff --git a/.vs/VSWorkspaceState.json b/.vs/VSWorkspaceState.json new file mode 100644 index 0000000..4fbcda8 --- /dev/null +++ b/.vs/VSWorkspaceState.json @@ -0,0 +1,10 @@ +{ + "ExpandedNodes": [ + "", + "\\Project2-Stream-Compaction", + "\\Project2-Stream-Compaction\\src", + "\\Project2-Stream-Compaction\\stream_compaction" + ], + "SelectedNode": "\\Project2-Stream-Compaction\\stream_compaction\\CMakeLists.txt", + "PreviewInSolutionExplorer": false +} \ No newline at end of file diff --git a/.vs/slnx.sqlite b/.vs/slnx.sqlite new file mode 100644 index 0000000..c33bf1f Binary files /dev/null and b/.vs/slnx.sqlite differ diff --git a/Project2-Character-Recognition/CMakeLists.txt b/Project2-Character-Recognition/CMakeLists.txt index 09e9198..548fa85 100644 --- a/Project2-Character-Recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/CMakeLists.txt @@ -22,6 +22,7 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin") endif() include_directories(.) +link_directories(${CUDA_TOOLKIT_ROOT_DIR}/lib/x64) add_subdirectory(character_recognition) cuda_add_executable(${CMAKE_PROJECT_NAME} @@ -30,6 +31,8 @@ cuda_add_executable(${CMAKE_PROJECT_NAME} ) target_link_libraries(${CMAKE_PROJECT_NAME} + cublas + curand character_recognition ${CORELIBS} ) diff --git a/Project2-Character-Recognition/README.md b/Project2-Character-Recognition/README.md index 4503fac..d97aa45 100644 --- a/Project2-Character-Recognition/README.md +++ b/Project2-Character-Recognition/README.md @@ -1,14 +1,4 @@ CUDA Character Recognition ====================== - -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** - -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) - -### (TODO: Your README) - -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +attempted to implement, never truly understood what I am doing and could not figure out what I am suppose to do diff --git a/Project2-Character-Recognition/character_recognition/CMakeLists.txt b/Project2-Character-Recognition/character_recognition/CMakeLists.txt index 7446175..9e834c1 100644 --- a/Project2-Character-Recognition/character_recognition/CMakeLists.txt +++ b/Project2-Character-Recognition/character_recognition/CMakeLists.txt @@ -7,5 +7,5 @@ set(SOURCE_FILES cuda_add_library(character_recognition ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_61 ) diff --git a/Project2-Character-Recognition/character_recognition/common.cu b/Project2-Character-Recognition/character_recognition/common.cu index 2a754d4..6521b75 100644 --- a/Project2-Character-Recognition/character_recognition/common.cu +++ b/Project2-Character-Recognition/character_recognition/common.cu @@ -1,5 +1,5 @@ #include "common.h" - +#include "cublas.h" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); if (cudaSuccess == err) { diff --git a/Project2-Character-Recognition/character_recognition/mlp.cu b/Project2-Character-Recognition/character_recognition/mlp.cu index 5a3ed7f..4aefc20 100644 --- a/Project2-Character-Recognition/character_recognition/mlp.cu +++ b/Project2-Character-Recognition/character_recognition/mlp.cu @@ -2,6 +2,90 @@ #include #include "common.h" #include "mlp.h" +#include +#define blockSize 32 +//https://solarianprogrammer.com/2012/05/31/matrix-multiplication-cuda-cublas-curand-thrust/ +// m = number of rows in A +// k = number of columns in A +// n = number of columns in B +void gpu_blas_mmul(const float *A, const float *B, float *C, const int m, const int k, const int n) { + int lda = m, ldb = k, ldc = m; + const float alf = 1; + const float bet = 0; + const float *alpha = &alf; + const float *beta = &bet; + // Create a handle for CUBLAS + cublasHandle_t handle; + cublasCreate(&handle); + + // Do the actual multiplication + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, m, n, k, alpha, A, lda, B, ldb, beta, C, ldc); + + // Destroy the handle + cublasDestroy(handle); + +} + +__global__ void activation_rule(int n, float const *idata, float* odata) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) { + return; + } + + odata[index] = 1.0 / (1.0 + std::exp(-idata[index])); +} + +__global__ void activation_rule_d(int n, float const *idat, float* odata) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = idat[index] * (1 - idat[index]); + +} +__global__ void matrixSub(int n, const float* a, const float* b, float* odata) { + + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = a[index] - b[index]; +} + + +void train(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, float* predicted_output, + float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n, float* post_activation_layer) { + float *dev_input, *dev_weights_0, *dev_weights_1, *dev_error, *dev_layer, *dev_output, *dev_activation_layer, *dev_expected_output; + cudaMalloc((void**)&dev_input, sizeof(float) * A_m * A_n); + cudaMemcpy(dev_input, A_input, A_m * A_n * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_weights_0, A_m * layer_n * sizeof(float)); + cudaMemcpy(dev_weights_0, weights_into_layer, A_m * layer_n * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_weights_1, layer_n * sizeof(float)); + cudaMemcpy(dev_weights_1, weights_out_layer, layer_n * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_layer, sizeof(float) * layer_n); + cudaMemcpy(dev_layer, layer, layer_n * sizeof(float), cudaMemcpyHostToDevice); + cudaMalloc((void**)&dev_activation_layer, sizeof(float) * layer_n); + cudaMalloc((void**)&dev_error, expected_output_n * sizeof(float)); + cudaMalloc((void**)&dev_output, expected_output_n * sizeof(float)); + cudaMalloc((void**)&dev_expected_output, expected_output_n * sizeof(float)); + cudaMemcpy(dev_expected_output, expected_output, expected_output_n* sizeof(float), cudaMemcpyHostToDevice); + float* dev_subtraction, *dev_derivative, *dev_output_d; + cudaMalloc((void**)&dev_subtraction, expected_output_n * sizeof(float)); + cudaMalloc((void**)&dev_derivative, expected_output_n * sizeof(float)); + cudaMalloc((void**)&dev_output_d, expected_output_n * sizeof(float)); + for (int i = 0; i < 50; i++) { + gpu_blas_mmul(dev_input, dev_weights_1, dev_layer, 1, A_n, layer_n); + activation_rule<<<((layer_n + blockSize - 1) / blockSize), blockSize>>>(layer_n, dev_layer, dev_activation_layer); + gpu_blas_mmul(dev_activation_layer, dev_weights_1, dev_output, 1, layer_n, A_n); + //compute derivate. derivative of sigmoid is out * expected-out + matrixSub << <((layer_n + blockSize - 1) / blockSize), blockSize >> > (dev_expected_output, dev_output, dev_subtraction); + activation_rule_d << <((layer_n + blockSize - 1) / blockSize), blockSize >> > (layer_n, dev_output, dev_derivative); + gpu_blas_mmul(dev_subtraction, dev_derivative, dev_output_d, 1, expected_output_n, expected_output_n); + + } + +} +__global__ void runTrain(const float *A_input, const int A_m, const int A_n, float* weights_into_layer, float* weights_out_layer, const float* predicted_output, float* err, float* layer, const int layer_n, const float* expected_output, const int expected_output_n) { namespace CharacterRecognition { using Common::PerformanceTimer; diff --git a/Project2-Character-Recognition/src/main.cpp b/Project2-Character-Recognition/src/main.cpp index 11dd534..e4e54a0 100644 --- a/Project2-Character-Recognition/src/main.cpp +++ b/Project2-Character-Recognition/src/main.cpp @@ -11,142 +11,5 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, b, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; } diff --git a/Project2-Stream-Compaction/img/blocksizeopt.PNG b/Project2-Stream-Compaction/img/blocksizeopt.PNG new file mode 100644 index 0000000..72c3adb Binary files /dev/null and b/Project2-Stream-Compaction/img/blocksizeopt.PNG differ diff --git a/Project2-Stream-Compaction/img/outputConsole.PNG b/Project2-Stream-Compaction/img/outputConsole.PNG new file mode 100644 index 0000000..33fe2fd Binary files /dev/null and b/Project2-Stream-Compaction/img/outputConsole.PNG differ diff --git a/Project2-Stream-Compaction/img/sizevstime.PNG b/Project2-Stream-Compaction/img/sizevstime.PNG new file mode 100644 index 0000000..b25873f Binary files /dev/null and b/Project2-Stream-Compaction/img/sizevstime.PNG differ diff --git a/Project2-Stream-Compaction/src/main.cpp b/Project2-Stream-Compaction/src/main.cpp index d016553..f2f6783 100644 --- a/Project2-Stream-Compaction/src/main.cpp +++ b/Project2-Stream-Compaction/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 15; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; @@ -51,7 +51,7 @@ int main(int argc, char* argv[]) { printDesc("naive scan, power-of-two"); StreamCompaction::Naive::scan(SIZE, c, a); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); + printArray(SIZE, c, true); printCmpResult(SIZE, b, c); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan diff --git a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt index cdbef77..6975dd9 100644 --- a/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt +++ b/Project2-Stream-Compaction/stream_compaction/CMakeLists.txt @@ -13,5 +13,4 @@ set(SOURCE_FILES cuda_add_library(stream_compaction ${SOURCE_FILES} - OPTIONS -arch=sm_20 - ) + OPTIONS -arch=sm_61 ) diff --git a/Project2-Stream-Compaction/stream_compaction/common.cu b/Project2-Stream-Compaction/stream_compaction/common.cu index 2ed6d63..932abab 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.cu +++ b/Project2-Stream-Compaction/stream_compaction/common.cu @@ -24,6 +24,12 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= n) { + return; + } + + bools[index] = (idata[index] != 0) ? 1 : 0; } /** @@ -33,6 +39,14 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int idx = (blockDim.x * blockIdx.x) + threadIdx.x; + if (idx >= n) { + return; + } + + if (bools[idx]) { + odata[indices[idx]] = idata[idx]; + } } } diff --git a/Project2-Stream-Compaction/stream_compaction/common.h b/Project2-Stream-Compaction/stream_compaction/common.h index 996997e..84bae7e 100644 --- a/Project2-Stream-Compaction/stream_compaction/common.h +++ b/Project2-Stream-Compaction/stream_compaction/common.h @@ -9,7 +9,6 @@ #include #include #include - #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) @@ -92,6 +91,8 @@ namespace StreamCompaction { cudaEventRecord(event_end); cudaEventSynchronize(event_end); +#include "common.h" + if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); diff --git a/Project2-Stream-Compaction/stream_compaction/cpu.cu b/Project2-Stream-Compaction/stream_compaction/cpu.cu index a2d3e6c..5feec13 100644 --- a/Project2-Stream-Compaction/stream_compaction/cpu.cu +++ b/Project2-Stream-Compaction/stream_compaction/cpu.cu @@ -20,9 +20,26 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + if(n > 0){ + int sum = 0; + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } + } timer().endCpuTimer(); } + void scan_notimer(int n, int *odata, const int *idata) { + if(n > 0){ + int sum = 0; + for (int i = 0; i < n; i++) { + odata[i] = sum; + sum += idata[i]; + } + } + } + /** * CPU stream compaction without using the scan function. * @@ -31,8 +48,14 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int count = 0; + for (int i = 0; i < n; i++) { + if (idata[i] != 0) { + odata[count++] = idata[i]; + } + } timer().endCpuTimer(); - return -1; + return count; } /** @@ -42,9 +65,28 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + if (n < 1) { + return 0; + } + int* tmp = new int [n]; + + for (int i = 0; i < n; i++) { + tmp[i] = (idata[i] == 0) ? 0 : 1; + } + int * scan_o = new int[n]; + + scan_notimer(n, scan_o, tmp); + int scan_counter = 0; + for (int i = 1; i < n; i++) { + if (scan_o[i] != scan_o[i - 1]) { + odata[scan_counter++] = idata[i - 1]; + } + } + timer().endCpuTimer(); - return -1; + delete[] scan_o; + delete[] tmp; + return scan_counter; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/efficient.cu b/Project2-Stream-Compaction/stream_compaction/efficient.cu index 2db346e..694e543 100644 --- a/Project2-Stream-Compaction/stream_compaction/efficient.cu +++ b/Project2-Stream-Compaction/stream_compaction/efficient.cu @@ -2,7 +2,7 @@ #include #include "common.h" #include "efficient.h" - +#define blockSize 512 namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,15 +12,136 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpsweep(int n, int d, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + int offset = (1 << (d + 1)); // implementing 2^d+1 incrementing + int k = index * offset; + if (k >= n) { + return; + } + + idata[k + offset - 1] += idata[k + (1 << d) - 1]; + } + + __global__ void kernDownsweep(int n, int d, int* idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + int offset = (1 << (d + 1)); // implementing 2^d+1 incrementing + int k = index * offset; + if (k >= n) { + return; + } + + int t = idata[k + (1 << d) - 1]; + idata[k + (1 << d) - 1] = idata[k + offset - 1]; + idata[k + offset - 1] += t; + } + + + void printDeviceArr(int n, int* device_arr) { + int* arr = (int*)malloc(sizeof(int)*n); + cudaMemcpy(arr, device_arr, sizeof(int) *n, cudaMemcpyDeviceToHost); + printf("\n ["); + for (int i = 0; i < n; i++) { + printf("%d, ", arr[i]); + } + printf("]\n"); + free(arr); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + dim3 threadsPerBlock(blockSize); + int npt = 1 << ilog2ceil(n); + + int * dev_idata; + cudaMalloc((void **)&dev_idata, npt * sizeof(int)); + checkCUDAError("Error: Cuda Malloc for input data on device"); + if (npt != n) { + int *new_cpu_arr = (int*) malloc(sizeof(int) * npt); + memset(new_cpu_arr, 0, sizeof(int) * npt); + memcpy(new_cpu_arr, idata, sizeof(int) * n); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Cuda Error on non-power of two array"); + free(new_cpu_arr); + } else { + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Cuda Error on power of two array"); + } + timer().startGpuTimer(); + for (int d = 0; d < ilog2ceil(npt); d++) { + int updateThreadUtil = npt / (1 << (npt + 1)); + dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize); + kernUpsweep << > > (npt, d, dev_idata); + } + + int z = 0; + cudaMemcpy(&dev_idata[npt - 1], &z, sizeof(int), cudaMemcpyHostToDevice); + for (int d = ilog2(npt) - 1; d >= 0; d--) { + int updateThreadUtil = npt / (1 << (npt + 1)); + dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize); + kernDownsweep << > > (npt, d, dev_idata); + } // TODO timer().endGpuTimer(); + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); + } + + void compactScan(int n, int *odata, const int *idata) { + + dim3 threadsPerBlock(blockSize); + int npt = 1 << ilog2ceil(n); + + int * dev_idata; + cudaMalloc((void **)&dev_idata, npt * sizeof(int)); + checkCUDAError("Error: Cuda Malloc for input data on device"); + if (npt != n) { + int *new_cpu_arr = (int*) malloc(sizeof(int) * npt); + memset(new_cpu_arr, 0, sizeof(int) * npt); + memcpy(new_cpu_arr, idata, sizeof(int) * n); + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Cuda Error on non-power of two array"); + } else { + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Cuda Error on power of two array"); + } + + for (int d = 0; d < ilog2ceil(npt); d++) { + int updateThreadUtil = npt / (1 << (npt + 1)); + dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize); + kernUpsweep << > > (npt, d, dev_idata); + } + + int z = 0; + cudaMemcpy(&dev_idata[npt - 1], &z, sizeof(int), cudaMemcpyHostToDevice); + for (int d = ilog2(npt) - 1; d >= 0; d--) { + int updateThreadUtil = npt / (1 << (npt + 1)); + dim3 blocksPerGrid((updateThreadUtil + blockSize - 1) / blockSize); + kernDownsweep << > > (npt, d, dev_idata); + } + // TODO + cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(dev_idata); } + /** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +152,53 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + int *dev_idata, *dev_odata, *dev_bool; + cudaMalloc((void**)&dev_idata, sizeof(int) * n); + checkCUDAError("Malloc Error dev_idata"); + cudaMalloc((void**)&dev_odata, sizeof(int) * n); + checkCUDAError("Malloc Error dev_odata"); + cudaMalloc((void**)&dev_bool, sizeof(int) * n); + checkCUDAError("Malloc Error dev_bool"); + + cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + checkCUDAError("memcpy error dev_idata", __LINE__); + + timer().startGpuTimer(); + Common::kernMapToBoolean << > > (n, dev_bool, dev_idata); + checkCUDAError("map to boolean error", __LINE__); + int* host_bool = new int[n]; + cudaMemcpy(host_bool, dev_bool, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("memcpy erroyr host_bool", __LINE__); + + int* indices = new int[n]; + + compactScan(n, indices, host_bool); + int* dev_indices; + cudaMalloc((void**)&dev_indices, sizeof(int) * n); + checkCUDAError("malloc error dev_indices", __LINE__); + cudaMemcpy(dev_indices, indices, sizeof(int)*n, cudaMemcpyHostToDevice); + checkCUDAError("memcpy error dev_indices", __LINE__); + + Common::kernScatter << > > (n, dev_odata, dev_idata, dev_bool, dev_indices); + checkCUDAError("kernScatter Error", __LINE__); + + cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost); + checkCUDAError("memcpy error dev_odata to odata", __LINE__); + // TODO timer().endGpuTimer(); - return -1; + int count = (host_bool[n - 1]) ? indices[n - 1] + 1 : indices[n-1]; + delete[] host_bool; + delete[] indices; + cudaFree(dev_bool); + cudaFree(dev_idata); + cudaFree(dev_indices); + cudaFree(dev_odata); + return count; } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.cu b/Project2-Stream-Compaction/stream_compaction/naive.cu index 4308876..1ec3875 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.cu +++ b/Project2-Stream-Compaction/stream_compaction/naive.cu @@ -2,7 +2,7 @@ #include #include "common.h" #include "naive.h" - +#define blockSize 128 namespace StreamCompaction { namespace Naive { using StreamCompaction::Common::PerformanceTimer; @@ -11,15 +11,75 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + __global__ void kernNaiveScan(int n, int offset, int *odata, const int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + + /* if (index < offset) { + odata[index] = idata[index]; + } + else { + odata[index] = idata[index - offset] + idata[index]; + }*/ + + odata[index] = (index < offset) ? idata[index] : idata[index - offset] + idata[index]; + } + + + __global__ void kernMakeExclusive(int n, int *odata, const int *idata) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= n) { + return; + } + odata[index] = (index == 0) ? 0 : idata[index - 1]; + } + void printDeviceArr(int n, int* device_arr) { + int* arr = (int*)malloc(sizeof(int)*n); + cudaMemcpy(arr, device_arr, sizeof(int) *n, cudaMemcpyDeviceToHost); + printf("\n ["); + for (int i = -1; i < n; i++) { + printf("%d, ", arr[i]); + } + printf("]\n"); + free(arr); + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + dim3 threadsPerBlock(blockSize); + dim3 blocksPerGrid((n + blockSize - 1) / blockSize); + + int* A; + int* B; + + cudaMalloc((void**)&A, sizeof(int) * n); + checkCUDAError("cuda Error Allocating A"); + cudaMalloc((void**)&B, sizeof(int) * n); + checkCUDAError("cuda Error Allocating B"); + cudaMemcpy(A, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("Error Copying data to A"); + timer().startGpuTimer(); - // TODO + for (int offset = 1; offset < n; offset *= 2) { + kernNaiveScan<<>>(n, offset, B, A); + + cudaMemcpy(A, B, n * sizeof(int), cudaMemcpyDeviceToDevice); + checkCUDAError("Error Copying data to A"); + + } + kernMakeExclusive << > > (n, B, A); + checkCUDAError("ERror in kernMakeExclusive"); + cudaMemcpy(odata, B, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("ERror in Copying back to host"); + // TODO timer().endGpuTimer(); + + cudaFree(A); + cudaFree(B); } } } diff --git a/Project2-Stream-Compaction/stream_compaction/naive.h b/Project2-Stream-Compaction/stream_compaction/naive.h index 37dcb06..1d82526 100644 --- a/Project2-Stream-Compaction/stream_compaction/naive.h +++ b/Project2-Stream-Compaction/stream_compaction/naive.h @@ -1,7 +1,6 @@ #pragma once #include "common.h" - namespace StreamCompaction { namespace Naive { StreamCompaction::Common::PerformanceTimer& timer(); diff --git a/Project2-Stream-Compaction/stream_compaction/thrust.cu b/Project2-Stream-Compaction/stream_compaction/thrust.cu index 1def45e..f6b386b 100644 --- a/Project2-Stream-Compaction/stream_compaction/thrust.cu +++ b/Project2-Stream-Compaction/stream_compaction/thrust.cu @@ -18,11 +18,21 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + + thrust::host_vector dv_in(idata, idata + n); + thrust::host_vector dv_out(odata, odata + n); + + thrust::device_vector dev_in(dv_in); + thrust::device_vector dev_out(dv_out); + timer().startGpuTimer(); // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin()); timer().endGpuTimer(); + thrust::copy(dev_out.begin(), dev_out.end(), dv_out.begin()); + cudaMemcpy(odata, &dv_out[0], sizeof(int) * n, cudaMemcpyHostToHost); + checkCUDAError("memcopy error in thryust scan"); } } } diff --git a/README.md b/README.md index 3a0b2fe..363d1f0 100644 --- a/README.md +++ b/README.md @@ -3,14 +3,27 @@ CUDA Number Algorithms **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Davis Polito +* [https://github.com/davispolito/Project0-Getting-Started/blob/master]() +* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060 +#Optimizing Block Size Per Algorithm -### (TODO: Your README) +First we must optimize block size per algorithm +![block optimization graph](/Project2-Stream-Compaction/img/blocksizeopt.PNG) +After creating this graph I chose blocksize to be 128 and 512 for naive and work efficient scan + +#Scan comparison +![size vs. time graph](/Project2-Stream-Compaction/img/sizevstime.PNG) + + +##Questions +#Explanations for each result + +######cpu This method has O(n) runtime and is only affected by size of the aray +######naivei this method has O(nlogn) runtime +######work-efficient This has a possible runtime of O(n) but do to memory access and non ideal uses of threads and warps (i.e. warp branching) We see a slower runtime than cpu +#####thrust +![Console Output From Steam compaction](/Project2-Stream-Compaction/img/outputConsole.PNG) -Link to the readmes of the other two subprojects. -Add anything else you think is relevant up to this point. -(Remember, this is public, so don't put anything here that you don't want to share with the world.) diff --git a/README.md~ b/README.md~ new file mode 100644 index 0000000..676fcb0 --- /dev/null +++ b/README.md~ @@ -0,0 +1,26 @@ +CUDA Number Algorithms +====================== + +**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** + +Davis Polito +* [https://github.com/davispolito/Project0-Getting-Started/blob/master]() +* Tested on: Windows 10, i7-8750H @ 2.20GHz 16GB, GTX 1060 +#Optimizing Block Size Per Algorithm + +First we must optimize block size per algorithm +![block optimization graph](/Project2-Character-Recognition/img/blocksizeopt.png) +After creating this graph I chose blocksize to be 128 and 512 for naive and work efficient scan + +#Scan comparison +![size vs. time graph](/Project2-Character-Recognition/img/sizevstime.png) + + +##Questions +#Explanations for each result + +######cpu +######naive +######work-efficient +#####thrust +