From e81f6633865e3ae876c8211448fe43677e836e11 Mon Sep 17 00:00:00 2001 From: rhasler1 Date: Wed, 24 Jul 2024 13:03:52 -0500 Subject: [PATCH 1/5] updates: see below 1. No longer using host accessor to compute sequential. 2. Backing adjacency matrix on host w/ 2-D vector. 3. Linearizing adjacency matrix for parallel execution. 4. Adding verify results functionality. 5. Adding function to generate simple directed graph. --- transitive_closure/CMakeLists.txt | 2 +- transitive_closure/graph.cpp | 88 ++++++++ transitive_closure/graph.h | 29 +++ transitive_closure/test.txt | 1 - transitive_closure/transitive_closure.cpp | 261 ++++++++-------------- 5 files changed, 207 insertions(+), 174 deletions(-) create mode 100644 transitive_closure/graph.cpp create mode 100644 transitive_closure/graph.h delete mode 100644 transitive_closure/test.txt diff --git a/transitive_closure/CMakeLists.txt b/transitive_closure/CMakeLists.txt index ebf5b0f..60b2abd 100644 --- a/transitive_closure/CMakeLists.txt +++ b/transitive_closure/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(transitive_closure transitive_closure.cpp timestamps.cpp) +add_executable(transitive_closure transitive_closure.cpp timestamps.cpp graph.cpp) target_link_libraries(transitive_closure fmt::fmt spdlog::spdlog CLI11::CLI11) \ No newline at end of file diff --git a/transitive_closure/graph.cpp b/transitive_closure/graph.cpp new file mode 100644 index 0000000..e634f43 --- /dev/null +++ b/transitive_closure/graph.cpp @@ -0,0 +1,88 @@ +// Inspired by: https://www.geeksforgeeks.org/how-to-create-a-random-graph-in-c/ + +// Why to linearize: https://community.khronos.org/t/matrix-multiplication-using-2d-std-vector/106457#:~:text=On%20a%20more,be%20known%20statically. + +#include +#include "graph.h" + +// This function generates a directed graph that has no loops and no duplicate edges. The resulting graph +// is stored in adj_matrix. 
+int generate_simple_directed_graph(std::vector> & adj_matrix, int num_edge, int num_vertex) +{ + // Maximum number of edges = n x (n - 1) + if (num_edge > num_vertex * (num_vertex - 1)) { + std::cout << "Too many edges." << std::endl; + return 1; + } + + // Resizing adj_matrix (num_vertex by num_vertex). + adj_matrix.resize(num_vertex, std::vector(num_vertex, 0)); + + int edge[num_edge][2]; + int i = 0; + // Generate directed graph. + while (i < num_edge) { + // Generate the vertices connected by edge. + edge[i][0] = rand() % num_vertex; + edge[i][1] = rand() % num_vertex; + + // Discard loop. + if (edge[i][0] == edge[i][1]) { + continue; + } + // Discard duplicate edge. + else { + for (int j = 0; j < i; j++) { + if (( + edge[i][0] == edge[j][0] && + edge[i][1] == edge[j][1] + )) + { + i--; + } + } + } + i++; + } + + // Initialize adjacency matrix. + int vertex_one, vertex_two; + for (int j = 0; j < num_edge; j++) { + // Set entries with connecting edges to 1. + vertex_one = edge[j][0]; + vertex_two = edge[j][1]; + adj_matrix[vertex_one][vertex_two] = 1; + } + return 0; +} + +// This function linearizes parameter adj_matrix and stores in linear_adj_matrix. 
+void linearize(std::vector> & adj_matrix, std::vector & linear_adj_matrix, int num_vertex) +{ + linear_adj_matrix.resize(num_vertex * num_vertex); + for (int i = 0; i < num_vertex; i++) { + for (int j = 0; j < num_vertex; j++) { + linear_adj_matrix[i * num_vertex + j] = adj_matrix[i][j]; + } + } +} + +void print_adj_matrix(std::vector> & adj_matrix, int num_vertex) +{ + for (int i = 0; i < num_vertex; i++) { + for (int j = 0; j < num_vertex; j++) { + std::cout << adj_matrix[i][j] << " "; + } + std::cout << std::endl; + } +} + +void print_linear_adj_matrix(std::vector & linear_adj_matrix, int num_vertex) +{ + for (int i = 0; i < num_vertex * num_vertex; i++) { + std::cout << linear_adj_matrix[i] << " "; + if ((i+1)%num_vertex==0) { + std::cout << std::endl; + } + } +} \ No newline at end of file diff --git a/transitive_closure/graph.h b/transitive_closure/graph.h new file mode 100644 index 0000000..a70bdc1 --- /dev/null +++ b/transitive_closure/graph.h @@ -0,0 +1,29 @@ +#ifndef GRAPH_H +#define GRAPH_H + +#include +#include + +int generate_simple_directed_graph( + std::vector> & adj_matrix, + int num_edge, + int num_vertex + ); + +void linearize( + std::vector> & adj_matrix, + std::vector & linear_adj_matrix, + int num_vertex + ); + +void print_adj_matrix( + std::vector> & adj_matrix, + int num_vertex + ); + +void print_linear_adj_matrix( + std::vector & linear_adj_matrix, + int num_vertex + ); + +#endif \ No newline at end of file diff --git a/transitive_closure/test.txt b/transitive_closure/test.txt deleted file mode 100644 index b1cfde7..0000000 --- a/transitive_closure/test.txt +++ /dev/null @@ -1 +0,0 @@ -4 4 1 1 0 1 0 1 1 0 0 0 1 1 0 0 0 1 \ No newline at end of file diff --git a/transitive_closure/transitive_closure.cpp b/transitive_closure/transitive_closure.cpp index dc01b0a..b720600 100644 --- a/transitive_closure/transitive_closure.cpp +++ b/transitive_closure/transitive_closure.cpp @@ -8,196 +8,101 @@ #include "timestamps.h" #include "plf_nanotimer.h" 
+#include "graph.h" -int verify_matrix_squared(std::string & filename, int & n) -{ - int rows, columns; - std::ifstream input(filename); - - if (!input.is_open()) { - std::cout << "The file could not be opened."; - return 1; - } - - input >> rows >> columns; - n = rows; - - if (rows != columns) { return 1; } - - input.close(); - return 0; -} - -int populate_matrix(std::string & filename, sycl::buffer & C_buf) -{ - const sycl::host_accessor C(C_buf); - std::ifstream input(filename); - - if (!input.is_open()) { - std::cout << "The file could not be opened."; - return 1; - } - - int rows, columns; - input >> rows >> columns; - - if (rows != columns) { return 1; } - - for (int i = 0; i < rows; i++) { - for (int j = 0; j < columns; j++) { - input >> C[i][j]; - } - } - - input.close(); - return 0; - -} - -void print_matrix(const sycl::host_accessor & A, int n) -{ - std::cout << "Transitive Closure A of C: " << std::endl; - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - std::cout << A[i][j] << " "; - } - std::cout << std::endl; - } -} +// This function is meant to populate a two-dimensional vector with adjacency matrix values. +// I had the idea to use a host_accessor to perform the Warshall procedure sequentially +// on the CPU. I came across the article referenced below outlining why this might be a +// bad idea. This function is the result. +// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2024-2/performance-impact-of-usm-and-buffers.html#:~:text=The%20same%20is,should%20be%20avoided int main(const int argc, const char *const argv[]) { - // main declarations::begin - std::string input; // file to adjacency matrix - int n; // squared matrix dimension + // Main declarations. 
+ std::vector> adj_matrix; + std::vector linear_adj_matrix; + std::vector> sequential_result; std::string device_name; std::vector> timestamps; double time_result; - // main declarations::end - // main inits::begin - bool run_sequentially{false}; + // Main inits. + int num_edges{380}; // Default + int num_vertices{200}; // Default + bool run_sequential{false}; + bool run_parallel{false}; bool run_cpuonly{false}; bool print{false}; - // main inits::end - // cli setup and parse::begin + // CLI setup and parse. CLI::App app{"Transitive Closure"}; app.option_defaults()->always_capture_default(true); - app.add_option("-i, --input", input); - app.add_flag("-s, --sequential", run_sequentially); + app.add_option("-e, --edges", num_edges); + app.add_option("-v, --vertices", num_vertices); + app.add_flag("-s, --sequential", run_sequential); + app.add_flag("-p, --parallel", run_parallel); app.add_flag("-c, --cpu-only", run_cpuonly); - app.add_flag("-p, --print", print); + //app.add_flag("-p, --print", print); CLI11_PARSE(app, argc, argv); - // cli setup and parse::end - - plf::nanotimer time_total; - time_total.start(); - // get dimensions of adjacency matrix::begin - spdlog::info("attempting to verify square-ness of adjacency matrix using input: {}", input); - if (verify_matrix_squared(input, n) == 1) { - spdlog::error("error when verifying square-ness of adjacency matrix using input: {}", input); - return 1; - } - // get dimensions of adjacency matrix::end - - // declare sycl buffers::begin - plf::nanotimer time_buffer_dec; - time_buffer_dec.start(); - sycl::buffer C_buf(sycl::range(n, n)); - sycl::buffer A_buf(sycl::range(n, n)); - time_result = time_buffer_dec.get_elapsed_ns(); - mark_time(timestamps, time_result, "declare sycl buffers"); - // declare sycl buffers::end - - // init sycl buffers::begin - plf::nanotimer time_buffer_init; - time_buffer_init.start(); - spdlog::info("attempting to populate adjacency matrix using input: {}", input); - if (populate_matrix(input, 
C_buf) == 1) { - spdlog::error("error when populating adjacency matrix using input: {}", input); + // Init parallel and sequential result. + spdlog::info("Initializing result structures"); + sycl::buffer parallel_result(sycl::range(num_vertices, num_vertices)); + sequential_result.resize(num_vertices, std::vector(num_vertices, 0)); + + // Initialize adjacency matrix. + spdlog::info("Generating simple directed graph with edges: {} and vertices: {}", num_edges, num_vertices); + if (generate_simple_directed_graph(adj_matrix, num_edges, num_vertices) == 1) { return 1; } - time_result = time_buffer_init.get_elapsed_ns(); - mark_time(timestamps, time_result, "populate sycl buffer with adj matrix"); - // init sycl buffers::end + + // Initialize linear adjacency matrix. + spdlog::info("Linearizing adjacency matrix"); + linearize(adj_matrix, linear_adj_matrix, num_vertices); // run sequential::begin - if (run_sequentially) { - spdlog::info("starting sequential execution block"); - plf::nanotimer time_sequential; // time sequential block::begin + if (run_sequential) { + spdlog::info("Run sequential begin"); + plf::nanotimer time_sequential; time_sequential.start(); - - // init host accessors::begin - spdlog::info("initializing host accessors"); - plf::nanotimer time_init_host; - time_init_host.start(); - const sycl::host_accessor C(C_buf); - const sycl::host_accessor A(A_buf); - time_result = time_init_host.get_elapsed_ns(); - mark_time(timestamps, time_result, "host accessor creation"); - // init host accessors::end - - // Warshall procedure step 1::begin - spdlog::info("starting Warshall procedure step 1"); - plf::nanotimer time_warshall_p1; - time_warshall_p1.start(); - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - A[i][j] = C[i][j]; + + // Warshall procedure step 1 + for (int i = 0; i < num_vertices; i++) { + for (int j = 0; j < num_vertices; j++) { + sequential_result[i][j] = adj_matrix[i][j]; } } - time_result = time_warshall_p1.get_elapsed_ns(); - 
mark_time(timestamps, time_result, "Warshall procedure step 1"); - // Warshall procedure step 1::end - // Warshall procedure step 2::begin - spdlog::info("starting Warshall procedure step 2"); - plf::nanotimer time_warshall_p2; - time_warshall_p2.start(); - for (int k = 0; k < n; k++) { - for (int i = 0; i < n; i++) { - for (int j = 0; j < n; j++) { - A[i][j] = A[i][j] || A[i][k] * A[k][j]; + // Warshall procedure step 2 + for (int k = 0; k < num_vertices; k++) { + for (int i = 0; i < num_vertices; i++) { + for (int j = 0; j < num_vertices; j++) { + sequential_result[i][j] = sequential_result[i][j] || sequential_result[i][k] && sequential_result[k][j]; } } } - time_result = time_warshall_p2.get_elapsed_ns(); - mark_time(timestamps, time_result, "Warshall procedure step 2"); - // Warshall procedure step 2::end - - time_result = time_sequential.get_elapsed_ns(); // time sequential block::end - mark_time(timestamps, time_result, "sequential block"); - - if (print) { - print_matrix(A, n); - } + time_result = time_sequential.get_elapsed_ns(); + mark_time(timestamps, time_result, "Run sequential"); + spdlog::info("Run sequential end"); } // run sequential::end // run parallel::begin - else { - spdlog::info("starting parallel execution block"); - // flag buffer for timing parallel execution - spdlog::info("initializing flag"); - plf::nanotimer time_flag_init; - time_flag_init.start(); + if (run_parallel) { + spdlog::info("Run parallel begin"); + plf::nanotimer time_parallel; + time_parallel.start(); + + // Loading linear adjacency matrix into buffer. + sycl::buffer C_buf(linear_adj_matrix.data(), sycl::range<1>{linear_adj_matrix.size()}); sycl::buffer flag_buf{sycl::range<1>{1}}; - time_result = time_flag_init.get_elapsed_ns(); - mark_time(timestamps, time_result, "init flag buf"); - // sycl queue creation::begin + // sycl queue creation: spdlog::info("setting up queue"); - plf::nanotimer time_device_init; - time_device_init.start(); sycl::device device{run_cpuonly ? 
sycl::cpu_selector_v : sycl::default_selector_v}; sycl::queue q{device, dpc_common::exception_handler, sycl::property::queue::in_order()}; device_name = q.get_device().get_info(); spdlog::info("device: {}", device_name); - time_result = time_device_init.get_elapsed_ns(); - mark_time(timestamps, time_result, "queue creation"); - // sycl queue creation::end // Warshall procedure // step 1: copy adjacency matrix C into matrix A @@ -205,12 +110,14 @@ int main(const int argc, const char *const argv[]) // for j := 1 to n do // A[i, j] := C[i, j] // + + // Copying Linear C into 2-D A. q.submit([&](auto &h) { const sycl::accessor C(C_buf, h, sycl::read_only); - const sycl::accessor A(A_buf, h, sycl::write_only); + const sycl::accessor A(parallel_result, h, sycl::write_only); - h.parallel_for(sycl::range(n, n), [=](auto index) { - A[index] = C[index]; + h.parallel_for(sycl::range(num_vertices, num_vertices), [=](auto index) { + A[index] = C[index[0] * num_vertices + index[1]]; }); }); @@ -226,35 +133,45 @@ int main(const int argc, const char *const argv[]) // // note: A[index] = A[i][j] // - for (int k = 0; k < n; k++) { - + for (int k = 0; k < num_vertices; k++) { q.submit([&](auto &h) { - const sycl::accessor A(A_buf, h, sycl::write_only); + const sycl::accessor A(parallel_result, h, sycl::write_only); - h.parallel_for(sycl::range(n, n), [=](auto index) { - A[index] = A[index] || A[index[0]][k] * A[k][index[1]]; + h.parallel_for(sycl::range(num_vertices, num_vertices), [=](auto index) { + A[index] = A[index] || A[index[0]][k] && A[k][index[1]]; }); }); } - // access flag buff to initiate work on device::begin + // Access flag buff to initiate work on device. 
spdlog::info("preparing flag access"); - plf::nanotimer time_parallel; - time_parallel.start(); const sycl::host_accessor flag{flag_buf}; time_result = time_parallel.get_elapsed_ns(); - mark_time(timestamps, time_result, "Warshall procedure step 1 & 2"); - // access flag buff to initiate work on device::end + mark_time(timestamps, time_result, "Run parallel"); + spdlog::info("Run parallel end"); + } + // run parallel::end + + if (run_sequential && run_parallel) { + // Compare outputs. + spdlog::info("Comparing results begin"); + plf::nanotimer time_compare; + time_compare.start(); - if (print) { - const sycl::host_accessor A{A_buf}; - print_matrix(A, n); + const sycl::host_accessor A_back{parallel_result}; + for (int i = 0; i < num_vertices; i++) { + for (int j = 0; j < num_vertices; j++) { + if (A_back[i][j] != sequential_result[i][j]) { + spdlog::info("Results do not match!"); + return 1; + } + } } + time_result = time_compare.get_elapsed_ns(); + mark_time(timestamps, time_result, "Comparing results"); + spdlog::info("Results match!"); + spdlog::info("Comparing results end"); } - // run parallel::end - - time_result = time_total.get_elapsed_ns(); - mark_time(timestamps, time_result, "total time"); print_timestamps(timestamps); spdlog::info("all done"); return 0; From ec9f7ff4110452387ee3fd487a11cf96b0ff0ce4 Mon Sep 17 00:00:00 2001 From: rhasler1 Date: Wed, 24 Jul 2024 13:46:51 -0500 Subject: [PATCH 2/5] removing flag buf --- transitive_closure/transitive_closure.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/transitive_closure/transitive_closure.cpp b/transitive_closure/transitive_closure.cpp index b720600..e1e1eaf 100644 --- a/transitive_closure/transitive_closure.cpp +++ b/transitive_closure/transitive_closure.cpp @@ -45,6 +45,8 @@ int main(const int argc, const char *const argv[]) //app.add_flag("-p, --print", print); CLI11_PARSE(app, argc, argv); + plf::nanotimer time_total; + time_total.start(); // Init parallel and 
sequential result. spdlog::info("Initializing result structures"); sycl::buffer parallel_result(sycl::range(num_vertices, num_vertices)); @@ -95,7 +97,6 @@ int main(const int argc, const char *const argv[]) // Loading linear adjacency matrix into buffer. sycl::buffer C_buf(linear_adj_matrix.data(), sycl::range<1>{linear_adj_matrix.size()}); - sycl::buffer flag_buf{sycl::range<1>{1}}; // sycl queue creation: spdlog::info("setting up queue"); @@ -143,9 +144,9 @@ int main(const int argc, const char *const argv[]) }); }); } - // Access flag buff to initiate work on device. - spdlog::info("preparing flag access"); - const sycl::host_accessor flag{flag_buf}; + // Init work on device. + spdlog::info("Preparing host access"); + const sycl::host_accessor host_access{parallel_result}; time_result = time_parallel.get_elapsed_ns(); mark_time(timestamps, time_result, "Run parallel"); spdlog::info("Run parallel end"); @@ -172,6 +173,8 @@ int main(const int argc, const char *const argv[]) spdlog::info("Results match!"); spdlog::info("Comparing results end"); } + time_result = time_total.get_elapsed_ns(); + mark_time(timestamps, time_result, "Time Total"); print_timestamps(timestamps); spdlog::info("all done"); return 0; From 9b524c0330dece575a248af9794c2728e02e4df1 Mon Sep 17 00:00:00 2001 From: rhasler1 Date: Wed, 24 Jul 2024 13:51:59 -0500 Subject: [PATCH 3/5] removing comment --- transitive_closure/transitive_closure.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/transitive_closure/transitive_closure.cpp b/transitive_closure/transitive_closure.cpp index e1e1eaf..56a5a91 100644 --- a/transitive_closure/transitive_closure.cpp +++ b/transitive_closure/transitive_closure.cpp @@ -10,12 +10,6 @@ #include "plf_nanotimer.h" #include "graph.h" -// This function is meant to populate a two-dimensional vector with adjacency matrix values. -// I had the idea to use a host_accessor to perform the Warshall procedure sequentially -// on the CPU. 
I came across the article referenced below outlining why this might be a -// bad idea. This function is the result. -// https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2024-2/performance-impact-of-usm-and-buffers.html#:~:text=The%20same%20is,should%20be%20avoided - int main(const int argc, const char *const argv[]) { // Main declarations. From b8833655c0096316dec18656aeac77f23971f233 Mon Sep 17 00:00:00 2001 From: rhasler1 Date: Thu, 5 Sep 2024 14:22:36 -0500 Subject: [PATCH 4/5] working on ptr rep --- transitive_closure/CMakeLists.txt | 2 +- transitive_closure/graph.cpp | 110 ++++++- transitive_closure/graph.h | 12 + transitive_closure/main.cpp | 377 ++++++++++++++++++++++ transitive_closure/transitive_closure.cpp | 175 ---------- 5 files changed, 497 insertions(+), 179 deletions(-) create mode 100644 transitive_closure/main.cpp delete mode 100644 transitive_closure/transitive_closure.cpp diff --git a/transitive_closure/CMakeLists.txt b/transitive_closure/CMakeLists.txt index 60b2abd..a410232 100644 --- a/transitive_closure/CMakeLists.txt +++ b/transitive_closure/CMakeLists.txt @@ -1,2 +1,2 @@ -add_executable(transitive_closure transitive_closure.cpp timestamps.cpp graph.cpp) +add_executable(transitive_closure main.cpp timestamps.cpp graph.cpp) target_link_libraries(transitive_closure fmt::fmt spdlog::spdlog CLI11::CLI11) \ No newline at end of file diff --git a/transitive_closure/graph.cpp b/transitive_closure/graph.cpp index e634f43..b4af0c6 100644 --- a/transitive_closure/graph.cpp +++ b/transitive_closure/graph.cpp @@ -3,10 +3,113 @@ // Why to linearize: https://community.khronos.org/t/matrix-multiplication-using-2d-std-vector/106457#:~:text=On%20a%20more,be%20known%20statically. #include +#include +#include #include "graph.h" +// Function to generate a directed graph that has no loops and no duplicate edges. The resulting graph is +// stored in a 1-D vector. +// Resizing graph is done in main. 
+int generate_simple_directed_linear(std::vector & graph, int edges, int vertices) +{ + if (edges > vertices * (vertices - 1)) { + spdlog::warn("Too many edges."); + return -1; + } + + std::vector> edge(edges, std::vector(2)); + int i = 0; + + // Generate directed graph. + while (i < edges) { + edge[i][0] = rand() % vertices; + edge[i][1] = rand() % vertices; + + // Discard loop. + if (edge[i][0] == edge[i][1]) { + continue; + } + // Discard duplicate edge. + else { + for (int j = 0; j < i; j++) { + if (( + edge[i][0] == edge[j][0] && + edge[i][1] == edge[j][1] + )) + { + i--; + } + } + } + i++; + } + + // Initialize linear adjacency matrix. + int vertex_one, vertex_two; + for (int j = 0; j < edges; j++) { + vertex_one = edge[j][0]; + vertex_two = edge[j][1]; + graph[vertex_one * vertices + vertex_two] = 1; + } + + return 0; +} + +// Generate directed graph provided pointer. +int generate_simple_directed_linear(int *graph, int edges, int vertices) +{ + if (edges > vertices * (vertices - 1)) { + spdlog::warn("Too many edges."); + return -1; + } + + std::vector> edge(edges, std::vector(2)); + int i = 0; + + // Generate directed graph. + while (i < edges) { + edge[i][0] = rand() % vertices; + edge[i][1] = rand() % vertices; + + // Discard loop. + if (edge[i][0] == edge[i][1]) { + continue; + } + // Discard duplicate edge. + else { + for (int j = 0; j < i; j++) { + if (( + edge[i][0] == edge[j][0] && + edge[i][1] == edge[j][1] + )) + { + i--; + } + } + } + i++; + } + + // Initialize linear adjacency matrix. + int vertex_one, vertex_two; + for (int j = 0; j < edges; j++) { + vertex_one = edge[j][0]; + vertex_two = edge[j][1]; + graph[vertex_one * vertices + vertex_two] = 1; + } + + return 0; +} + + + + + + +// #### DEPRECATED ##### +// // This function generates a directed graph that has no loops and no duplicate edges. The resulting graph -// is stored in adj_matrix. +// is stored in a 2-D vector. 
int generate_simple_directed_graph(std::vector> & adj_matrix, int num_edge, int num_vertex) { // Maximum number of edges = n x (n - 1) @@ -15,8 +118,8 @@ int generate_simple_directed_graph(std::vector> & adj_matrix, i return 1; } - // Resizing adj_matrix (num_vertex by num_vertex). - adj_matrix.resize(num_vertex, std::vector(num_vertex, 0)); + // Resizing adj_matrix (num_vertex by num_vertex) is now done in main. + //adj_matrix.resize(num_vertex, std::vector(num_vertex, 0)); int edge[num_edge][2]; int i = 0; @@ -53,6 +156,7 @@ int generate_simple_directed_graph(std::vector> & adj_matrix, i vertex_two = edge[j][1]; adj_matrix[vertex_one][vertex_two] = 1; } + return 0; } diff --git a/transitive_closure/graph.h b/transitive_closure/graph.h index a70bdc1..0b3884e 100644 --- a/transitive_closure/graph.h +++ b/transitive_closure/graph.h @@ -4,6 +4,18 @@ #include #include +int generate_simple_directed_linear( + std::vector & graph, + int edges, + int vertices + ); + +int generate_simple_directed_linear( + int *graph, + int edges, + int vertices + ); + int generate_simple_directed_graph( std::vector> & adj_matrix, int num_edge, diff --git a/transitive_closure/main.cpp b/transitive_closure/main.cpp new file mode 100644 index 0000000..58e8769 --- /dev/null +++ b/transitive_closure/main.cpp @@ -0,0 +1,377 @@ +// MAIN +// 1. Parse CLI. +// 2. Set up device. +// 3. Allocate USM memory. +// 4. Allocate Buffer memory. +// 5. Run sequential ? Use USM else use Buffer. +// 6. Run parallel ? use USM else use Buffer. +// +// +// For buffer execution: +// 1. Allocate and initialize graph vector on host. +// 2. Shove graph vector into sycl buffer C. +// 3. Copy sycl buffer C to sycl buffer A (Warshall P1). +// 4. Do computation on sycl buffer A (Warshall P1). +// 5. Access sycl buffer A on host. +// 6. Complete. +// +// +// For USM execution: +// 1. Allocate and initialize graph pointer on host. +// 2. Copy contents of graph into USM C. +// 3. Copy USM C to USM A (Warshall P1). +// 4. 
Do computation on USM A (Warshall P2). +// 5. Access USM on host. +// 6. Complete. +// +// TODO: look more into malloc_shared + +#include +#include + +#include +#include +#include +#include + +#include "timestamps.h" +#include "plf_nanotimer.h" +#include "graph.h" + +int verify_results(sycl::buffer &a_parallel, std::vector A_sequential, int vertices); +int verify_results(int *A_parallel, int *A_sequential, int vertices); +void warshall_parallel_p1(sycl::queue &q, sycl::buffer &a, std::vector &c, int vertices); +void warshall_parallel_p2(sycl::queue &q, sycl::buffer &a, int vertices); +void warshall_sequential_p1(std::vector &A, std::vector &C, int vertices); +void warshall_sequential_p2(std::vector &A, int vertices); +void warshall_parallel_p1(sycl::queue &q, int *A, int *C, int vertices); +void warshall_parallel_p2(sycl::queue &q, int *A, int vertices); +void warshall_sequential_p1(int *A, int *C, int vertices); +void warshall_sequential_p2(int *A, int vertices); + + +int main(const int argc, const char *const argv[]) +{ + // Main declarations. + std::vector> timestamps; + double time_result; + + // Default values. + int edges{380}; + int vertices{200}; + bool run_sequential{false}; + bool run_parallel{false}; + bool run_cpuonly{false}; + bool use_usm{false}; + bool print{false}; + + // CLI setup and parse. + CLI::App app{"Transitive Closure"}; + app.option_defaults()->always_capture_default(true); + app.add_option("-e, --edges", edges); + app.add_option("-v, --vertices", vertices); + app.add_flag("-s, --sequential", run_sequential); + app.add_flag("-p, --parallel", run_parallel); + app.add_flag("-c, --cpu-only", run_cpuonly); + app.add_flag("-u, --use-usm", use_usm); + CLI11_PARSE(app, argc, argv); + + plf::nanotimer time_total; + time_total.start(); + + // Sycl queue creation + spdlog::info("setting up queue"); + plf::nanotimer time_queue_creation; + time_queue_creation.start(); + sycl::device device{run_cpuonly ? 
sycl::cpu_selector_v : sycl::default_selector_v}; + sycl::queue q{device, dpc_common::exception_handler}; + std::string device_name = q.get_device().get_info(); + uint device_max_compute_units = q.get_device().get_info(); + spdlog::info("device name: {}, device max compute units: {}", device_name, device_max_compute_units); + time_result = time_queue_creation.get_elapsed_ns(); + mark_time(timestamps, time_result, "Sycl queue creation"); + + + + // Allocate memory using USM and pointers + plf::nanotimer time_memory_alloc_usm; + time_memory_alloc_usm.start(); + int *graph_usm = sycl::malloc_shared(vertices * vertices, q); + int *sequential_usm = sycl::malloc_shared(vertices * vertices, q); + int *parallel_usm = sycl::malloc_shared(vertices * vertices, q); + + if ((graph_usm == nullptr) || (sequential_usm == nullptr) || + (parallel_usm == nullptr)) { + if (graph_usm != nullptr) free(graph_usm, q); + if (sequential_usm != nullptr) free(sequential_usm, q); + if (parallel_usm != nullptr) free(parallel_usm, q); + + spdlog::error("USM memory allocation failure"); + return -1; + } + time_result = time_memory_alloc_usm.get_elapsed_ns(); + mark_time(timestamps, time_result, "USM memory allocation"); + + plf::nanotimer time_init_buffer; + time_init_buffer.start(); + + if (!generate_simple_directed_linear(graph_usm, edges, vertices)) { + spdlog::info("(USM) Failed to generate simple directed graph with edges: {} and vertices: {}", edges, vertices); + if (graph_usm != nullptr) free(graph_usm, q); + if (sequential_usm != nullptr) free(sequential_usm, q); + if (parallel_usm != nullptr) free(parallel_usm, q); + return -1; + } + time_result = time_init_buffer.get_elapsed_ns(); + mark_time(timestamps, time_result, "USM graph initialization"); + + + + // Allocate memory for parallel and sequential execution. 
+ plf::nanotimer time_memory_alloc; + time_memory_alloc.start(); + spdlog::info("Allocating memory for graph data."); + std::vector graph_vec(vertices * vertices); + spdlog::info("Allocating memory for sequential result."); + std::vector sequential_vec(vertices * vertices); + spdlog::info("Allocating memory for parallel result."); + sycl::buffer parallel_buf(sycl::range(vertices, vertices)); + time_result = time_memory_alloc.get_elapsed_ns(); + mark_time(timestamps, time_result, "Buffer memory allocation"); + + // Initialize adjacency matrix. + spdlog::info("Generating simple directed graph with edges: {} and vertices: {}", edges, vertices); + if (!generate_simple_directed_linear(graph_vec, edges, vertices)) { + spdlog::info("(Buffer) Failed to generate simple directed graph with edges: {} and vertices: {}", edges, vertices); + return 1; + } + + if (run_sequential) { + plf::nanotimer time_sequential; + if (use_usm) { + // TODO implement overload function for + spdlog::info("Warshall sequential with USM begin"); + time_sequential.start(); + warshall_sequential_p1(sequential_usm, graph_usm, vertices); + warshall_sequential_p2(sequential_usm, vertices); + time_result = time_sequential.get_elapsed_ns(); + mark_time(timestamps, time_result, "Warshall USM sequential"); + spdlog::info("Warshall USM sequential end"); + } + else { // Use buffers. + spdlog::info("Warshall sequential with buffers begin"); + time_sequential.start(); + warshall_sequential_p1(sequential_vec, graph_vec, vertices); + warshall_sequential_p2(sequential_vec, vertices); + time_result = time_sequential.get_elapsed_ns(); + mark_time(timestamps, time_result, "Warshall sequential"); + spdlog::info("Warshall sequential end"); + } + } + + if (run_parallel) { + plf::nanotimer time_parallel; + if (use_usm) { + //TODO - implement overloaded function for warshall parallel w/ pointers. 
+ spdlog::info("Warshall parallel with USM begin"); + time_parallel.start(); + warshall_parallel_p1(q, parallel_usm, graph_usm, vertices); + warshall_parallel_p2(q, parallel_usm, vertices); + time_result = time_parallel.get_elapsed_ns(); + mark_time(timestamps, time_result, "Warshall USM parallel"); + spdlog::info("Warshall USM parallel end"); + } + else { // Use buffers. + spdlog::info("Warshall parallel with Buffers begin"); + time_parallel.start(); + warshall_parallel_p1(q, parallel_buf, graph_vec, vertices); + warshall_parallel_p2(q, parallel_buf, vertices); + const sycl::host_accessor trigger_work{parallel_buf}; + time_result = time_parallel.get_elapsed_ns(); + mark_time(timestamps, time_result, "Warshall Buffers parallel"); + spdlog::info("Warshall Buffers parallel end"); + } + } + + if (run_sequential && run_parallel) { + spdlog::info("Verify results begin"); + plf::nanotimer time_verify; + time_verify.start(); + if (use_usm) { + if (!verify_results(parallel_usm, sequential_usm, vertices)) { + spdlog::warn("Results do not match!"); + } + else { + spdlog::info("Results match!"); + } + } + else { + if (!verify_results(parallel_buf, sequential_vec, vertices)) { + spdlog::warn("Results do not match!"); + } + else { + spdlog::info("Results match!"); + } + } + time_result = time_verify.get_elapsed_ns(); + mark_time(timestamps, time_result, "Verify results"); + spdlog::info("Verify results end"); + } + + if (graph_usm != nullptr) { free(graph_usm, q); } + if (sequential_usm != nullptr) { free(sequential_usm, q); } + if (parallel_usm != nullptr) { free(parallel_usm, q); } + + time_result = time_total.get_elapsed_ns(); + mark_time(timestamps, time_result, "Total time"); + print_timestamps(timestamps); + spdlog::info("All done"); + return 0; +} + +// Function compares sequential and parallel result. 
+int verify_results(sycl::buffer &a_parallel, std::vector A_sequential, int vertices) +{ + const sycl::host_accessor A_parallel{a_parallel}; + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + if (A_parallel[i][j] != A_sequential[i * vertices + j]) { + return -1; + } + } + } + return 1; +} + +int verify_results(int *A_parallel, int *A_sequential, int vertices) +{ + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + if (A_parallel[i * vertices + j] != A_sequential[i * vertices + j]) { + return -1; + } + } + } + return 1; +} + +// Warshall procedure +// Step 1: copy adjacency matrix C into matrix A +// for i := 1 to n do +// for j := 1 to n do +// A[i, j] := C[i, j] +// +// +// Parallel part 1 w/ Buffers. +void warshall_parallel_p1(sycl::queue &q, sycl::buffer &a, std::vector &c, int vertices) +{ + sycl::buffer c_buf(c.data(), sycl::range<1>{c.size()}); + q.submit([&](auto &h) { + const sycl::accessor C(c_buf, h, sycl::read_only); + const sycl::accessor A(a, h, sycl::write_only); + + h.parallel_for(sycl::range(vertices, vertices), [=](auto index) { + int i = index[0]; + int j = index[1]; + A[i][j] = C[i * vertices + j]; + }); + }); +} + +// Parallel part 1 w/ USM. +//TODO +void warshall_parallel_p1(sycl::queue &q, int *A, int *C, int vertices) +{ + q.submit([&](auto &h) { + h.parallel_for(sycl::range(vertices * vertices), [=](auto index) { + A[index] = C[index]; + }); + }); +} + +// Parallel part 2 w/ USM. 
+//TODO +void warshall_parallel_p2(sycl::queue &q, int *A, int vertices) +{ + for (int k = 0; k < vertices; k++) { + q.submit([&](auto &h) { + h.parallel_for(sycl::range(vertices * vertices), [=](auto index) { + int i = index / vertices; + int j = index % vertices; + A[i * vertices + j] = A[i * vertices + j] || A[i * vertices + k] && A[k * vertices + j]; + }); + }); + } +} + +// Warshall procedure +// Step 2: compute the transitive closure of C as A +// +// for k := 1 to n do +// for i := 1 to n do +// for j := 1 to n do +// if A[i, j] = false then +// A[i, j] := A[i, k] and A[k, j] +// +// note: A[index] = A[i][j] +// +// +// Parallel part 2 w/ Buffers. +void warshall_parallel_p2(sycl::queue &q, sycl::buffer &a, int vertices) +{ + for (int k = 0; k < vertices; k++) { + q.submit([&](auto &h) { + const sycl::accessor A(a, h, sycl::read_write); + + h.parallel_for(sycl::range(vertices, vertices), [=](auto index) { + int i = index[0]; + int j = index[1]; + A[i][j] = A[i][j] || A[i][k] && A[k][j]; + }); + }); + } +} + +// Sequential part 1 w/ Buffers. +void warshall_sequential_p1(std::vector &A, std::vector &C, int vertices) +{ + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + A[i * vertices + j] = C[i * vertices + j]; + } + } +} + +// Sequential part 1 w/ USM. +void warshall_sequential_p1(int *A, int *C, int vertices) +{ + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + A[i * vertices + j] = C[i * vertices + j]; + } + } +} + +// Sequential part 2 w/ Buffers. +void warshall_sequential_p2(std::vector &A, int vertices) +{ + for (int k = 0; k < vertices; k++) { + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + A[i * vertices + j] = A[i * vertices + j] || A[i * vertices + k] && A[k * vertices + j]; + } + } + } +} + +// Sequential part 2 w/ USM. 
+void warshall_sequential_p2(int *A, int vertices) +{ + for (int k = 0; k < vertices; k++) { + for (int i = 0; i < vertices; i++) { + for (int j = 0; j < vertices; j++) { + A[i * vertices + j] = A[i * vertices + j] || A[i * vertices + k] && A[k * vertices + j]; + } + } + } +} \ No newline at end of file diff --git a/transitive_closure/transitive_closure.cpp b/transitive_closure/transitive_closure.cpp deleted file mode 100644 index 56a5a91..0000000 --- a/transitive_closure/transitive_closure.cpp +++ /dev/null @@ -1,175 +0,0 @@ -#include -#include - -#include -#include -#include -#include - -#include "timestamps.h" -#include "plf_nanotimer.h" -#include "graph.h" - -int main(const int argc, const char *const argv[]) -{ - // Main declarations. - std::vector> adj_matrix; - std::vector linear_adj_matrix; - std::vector> sequential_result; - std::string device_name; - std::vector> timestamps; - double time_result; - - // Main inits. - int num_edges{380}; // Default - int num_vertices{200}; // Default - bool run_sequential{false}; - bool run_parallel{false}; - bool run_cpuonly{false}; - bool print{false}; - - // CLI setup and parse. - CLI::App app{"Transitive Closure"}; - app.option_defaults()->always_capture_default(true); - app.add_option("-e, --edges", num_edges); - app.add_option("-v, --vertices", num_vertices); - app.add_flag("-s, --sequential", run_sequential); - app.add_flag("-p, --parallel", run_parallel); - app.add_flag("-c, --cpu-only", run_cpuonly); - //app.add_flag("-p, --print", print); - CLI11_PARSE(app, argc, argv); - - plf::nanotimer time_total; - time_total.start(); - // Init parallel and sequential result. - spdlog::info("Initializing result structures"); - sycl::buffer parallel_result(sycl::range(num_vertices, num_vertices)); - sequential_result.resize(num_vertices, std::vector(num_vertices, 0)); - - // Initialize adjacency matrix. 
- spdlog::info("Generating simple directed graph with edges: {} and vertices: {}", num_edges, num_vertices); - if (generate_simple_directed_graph(adj_matrix, num_edges, num_vertices) == 1) { - return 1; - } - - // Initialize linear adjacency matrix. - spdlog::info("Linearizing adjacency matrix"); - linearize(adj_matrix, linear_adj_matrix, num_vertices); - - // run sequential::begin - if (run_sequential) { - spdlog::info("Run sequential begin"); - plf::nanotimer time_sequential; - time_sequential.start(); - - // Warshall procedure step 1 - for (int i = 0; i < num_vertices; i++) { - for (int j = 0; j < num_vertices; j++) { - sequential_result[i][j] = adj_matrix[i][j]; - } - } - - // Warshall procedure step 2 - for (int k = 0; k < num_vertices; k++) { - for (int i = 0; i < num_vertices; i++) { - for (int j = 0; j < num_vertices; j++) { - sequential_result[i][j] = sequential_result[i][j] || sequential_result[i][k] && sequential_result[k][j]; - } - } - } - time_result = time_sequential.get_elapsed_ns(); - mark_time(timestamps, time_result, "Run sequential"); - spdlog::info("Run sequential end"); - } - // run sequential::end - - // run parallel::begin - if (run_parallel) { - spdlog::info("Run parallel begin"); - plf::nanotimer time_parallel; - time_parallel.start(); - - // Loading linear adjacency matrix into buffer. - sycl::buffer C_buf(linear_adj_matrix.data(), sycl::range<1>{linear_adj_matrix.size()}); - - // sycl queue creation: - spdlog::info("setting up queue"); - sycl::device device{run_cpuonly ? sycl::cpu_selector_v : sycl::default_selector_v}; - sycl::queue q{device, dpc_common::exception_handler, sycl::property::queue::in_order()}; - device_name = q.get_device().get_info(); - spdlog::info("device: {}", device_name); - - // Warshall procedure - // step 1: copy adjacency matrix C into matrix A - // for i := 1 to n do - // for j := 1 to n do - // A[i, j] := C[i, j] - // - - // Copying Linear C into 2-D A. 
- q.submit([&](auto &h) { - const sycl::accessor C(C_buf, h, sycl::read_only); - const sycl::accessor A(parallel_result, h, sycl::write_only); - - h.parallel_for(sycl::range(num_vertices, num_vertices), [=](auto index) { - A[index] = C[index[0] * num_vertices + index[1]]; - - }); - }); - - // Warshall procedure - // step 2: compute the transitive closure of C as A - // - // for k := 1 to n do - // for i := 1 to n do - // for j := 1 to n do - // if A[i, j] = false then - // A[i, j] := A[i, k] and A[k, j] - // - // note: A[index] = A[i][j] - // - for (int k = 0; k < num_vertices; k++) { - q.submit([&](auto &h) { - const sycl::accessor A(parallel_result, h, sycl::write_only); - - h.parallel_for(sycl::range(num_vertices, num_vertices), [=](auto index) { - A[index] = A[index] || A[index[0]][k] && A[k][index[1]]; - - }); - }); - } - // Init work on device. - spdlog::info("Preparing host access"); - const sycl::host_accessor host_access{parallel_result}; - time_result = time_parallel.get_elapsed_ns(); - mark_time(timestamps, time_result, "Run parallel"); - spdlog::info("Run parallel end"); - } - // run parallel::end - - if (run_sequential && run_parallel) { - // Compare outputs. 
- spdlog::info("Comparing results begin"); - plf::nanotimer time_compare; - time_compare.start(); - - const sycl::host_accessor A_back{parallel_result}; - for (int i = 0; i < num_vertices; i++) { - for (int j = 0; j < num_vertices; j++) { - if (A_back[i][j] != sequential_result[i][j]) { - spdlog::info("Results do not match!"); - return 1; - } - } - } - time_result = time_compare.get_elapsed_ns(); - mark_time(timestamps, time_result, "Comparing results"); - spdlog::info("Results match!"); - spdlog::info("Comparing results end"); - } - time_result = time_total.get_elapsed_ns(); - mark_time(timestamps, time_result, "Time Total"); - print_timestamps(timestamps); - spdlog::info("all done"); - return 0; -} \ No newline at end of file From 655277da6da7f3b91e99b3f2e66181b5269c43b0 Mon Sep 17 00:00:00 2001 From: rhasler1 Date: Mon, 9 Sep 2024 17:40:46 -0500 Subject: [PATCH 5/5] fixing return values --- transitive_closure/graph.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transitive_closure/graph.cpp b/transitive_closure/graph.cpp index b4af0c6..9ea7b48 100644 --- a/transitive_closure/graph.cpp +++ b/transitive_closure/graph.cpp @@ -52,7 +52,7 @@ int generate_simple_directed_linear(std::vector & graph, int edges, int ver graph[vertex_one * vertices + vertex_two] = 1; } - return 0; + return 1; } // Generate directed graph provided pointer. @@ -98,7 +98,7 @@ int generate_simple_directed_linear(int *graph, int edges, int vertices) graph[vertex_one * vertices + vertex_two] = 1; } - return 0; + return 1; } @@ -115,7 +115,7 @@ int generate_simple_directed_graph(std::vector> & adj_matrix, i // Maximum number of edges = n x (n - 1) if (num_edge > num_vertex * (num_vertex - 1)) { std::cout << "Too many edges." << std::endl; - return 1; + return -1; } // Resizing adj_matrix (num_vertex by num_vertex) is now done in main. 
@@ -157,7 +157,7 @@ int generate_simple_directed_graph(std::vector> & adj_matrix, i adj_matrix[vertex_one][vertex_two] = 1; } - return 0; + return 1; } // This function linearizes parameter adj_matrix and stores in linear_adj_matrix.