Change implementation of acl_hal_mmd_get_timestamp() to use std::chro…#41
Draft
huangbri wants to merge 1 commit intointel:mainfrom
Draft
Change implementation of acl_hal_mmd_get_timestamp() to use std::chro…#41huangbri wants to merge 1 commit intointel:mainfrom
huangbri wants to merge 1 commit intointel:mainfrom
Conversation
…no instead of clock_gettime.
Contributor
Author
|
Testing difference in kernel runtime using following design #include <iostream>
#include <chrono>
#include "ocl.h"
using namespace aocl_utils;
#ifndef ITERATION_TIME
#define ITERATION_TIME 5
#endif
// ACL runtime configuration
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_int status;
static cl_program program;
static cl_kernel kernel_dummy;
static cl_mem kernel_buf;
static Timer timer;
static cl_event evt;
struct BenchmarkTime get_exec_time_us(cl_event evt);
// free the resources allocated during initialization
static void freeResources() {
if (queue) clReleaseCommandQueue(queue);
if (context) clReleaseContext(context);
if (kernel_dummy) clReleaseKernel(kernel_dummy);
if (program) clReleaseProgram(program);
if(kernel_buf) clReleaseMemObject(kernel_buf);
}
static void dump_error(const char* str, cl_int status) {
printf("%s\n", str);
printf("Error code: %d\n", status);
freeResources();
exit(-1);
}
/******************************************************************************************/
/*************************************** MAIN LOGIC ***************************************/
/******************************************************************************************/
// Initiate the platform, device, context, queue, and other global variables
void device_init()
{
char buf[1000];
cl_uint num_devices;
size_t bin_file_len = 0;
const unsigned char* bin_file;
cl_int bin_status = 0;
printf("Initializing OpenCL\n");
#ifdef EMULATOR_FAST
platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
#else
platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
#endif
if(platform == NULL) {
printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
exit(-1);
}
// get the device ID
status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
if(status != CL_SUCCESS) dump_error("Failed clGetDeviceIDs.", status);
if(num_devices < 1) {
printf("Found %d devices!\n", num_devices);
freeResources();
exit(-1);
}
status = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(buf), (void*)&buf, NULL);
printf("Using Device with name: %s\n",buf);
status = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(buf), (void*)&buf, NULL);
printf("Using Device from vendor: %s\n",buf);
// create a context
context = clCreateContext(0, 1, &device, NULL, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateContext.", status);
// create a command queue
queue = clCreateCommandQueue(context, device, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateCommandQueue.", status);
// load the aocx
printf("Loading dummy.aocx...\n");
bin_file = load_file("dummy.aocx", &bin_file_len);
// create the program
program = clCreateProgramWithBinary(context, 1, &device, &bin_file_len, &bin_file, &bin_status, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateProgramWithBinary.", status);
// build the program
printf("build dummy_k program\n"); fflush(stdout);
status = clBuildProgram(program, 1, &device, "", 0, 0);
if(status != CL_SUCCESS) dump_error("Failed clBuildProgram.", status);
// create the kernel
printf("create kernel dummy_k\n"); fflush(stdout);
kernel_dummy = clCreateKernel(program, "dummy_k", &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateKernel.", status);
// create the input buffer
kernel_buf = clCreateBuffer(context, CL_MEM_READ_WRITE, 1000, NULL, &status);
if(status != CL_SUCCESS) dump_error("Failed clCreateBuffer.", status);
printf("\n\n");
}
// Repeatedly run get_enqueueTask_time
void run_test()
{
// Empty queue just in case
clFinish(queue);
for (int i = 0; i < 100; i++) {
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
}
const auto tstart = std::chrono::high_resolution_clock::now();
for (size_t i = 0; i < 100000; i++) {
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
}
const auto tend = std::chrono::high_resolution_clock::now();
// compute the average kernel latency across the measured iterations
const std::chrono::duration<double, std::milli> time_span = tend - tstart;
std::cout << "Overall Time: " << time_span.count() << "\n";
}
/******************************************************************************************/
/*********************************** MAIN LOGIC HELPER ************************************/
/******************************************************************************************/
// I can set the status and do error check inside the timer, but that extra time might skew the result.
// Since I don't know how fast these operations will be, I'll not do that. All operations are assumed to execute correctly
struct BenchmarkTime get_enqueueTask_time(bool verbose)
{
timer.start();
// Special case: Will not pass event in. As a result, we can't measure queue-submit-start time
#ifdef UNRELEASED_AT_QUEUE_START
clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
clFinish(queue);
timer.stop();
struct BenchmarkTime enqueueTask_time;
#ifndef UNRELEASED_AT_QUEUE_START
enqueueTask_time = get_exec_time_us(evt);
#endif
enqueueTask_time.execution_time = timer.get_time_s() * 1000000;
if (verbose)
print_benchmark_time(enqueueTask_time);
return enqueueTask_time;
}
unsigned char* load_file(const char* filename, size_t* size_ret)
{
FILE* fp = fopen(filename,"rb");
long size;
unsigned char *buffer;
if (!fp) {
return NULL;
}
fseek(fp, 0, SEEK_END);
size = ftell(fp);
fseek(fp, 0, SEEK_SET);
buffer = (unsigned char *) malloc(size);
fread(buffer, size, 1, fp);
fclose(fp);
*size_ret = size;
return buffer;
}
/******************************************************************************************/
/******************************** BENCHMARK_TIME UTILITY **********************************/
/******************************************************************************************/
struct BenchmarkTime get_exec_time_us(cl_event evt)
{
struct BenchmarkTime res;
cl_ulong kernelEventQueued;
cl_ulong kernelEventSubmit;
cl_ulong kernelEventStart;
cl_ulong kernelEventEnd;
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(unsigned long long), &kernelEventQueued, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(unsigned long long), &kernelEventSubmit, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(unsigned long long), &kernelEventStart, NULL);
clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(unsigned long long), &kernelEventEnd, NULL);
res.queued_submit_time = (float)(kernelEventSubmit - kernelEventQueued) / 1000;
res.submit_start_time = (float)(kernelEventStart - kernelEventSubmit) / 1000;
res.start_end_time = (float)(kernelEventEnd - kernelEventStart) / 1000;
return res;
}
void print_benchmark_time(struct BenchmarkTime benchmark_time) {
float cumul_time = 0.0f;
cumul_time += benchmark_time.queued_submit_time;
printf(" Queued-Submit time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.queued_submit_time, cumul_time);
cumul_time += benchmark_time.submit_start_time;
printf(" Submit-Start time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.submit_start_time, cumul_time);
cumul_time += benchmark_time.start_end_time;
printf(" Start-End time: %9.3lf us, cumulative time: %9.3lf us\n", benchmark_time.start_end_time, cumul_time);
printf(" Execution time: %9.3lf us\n", benchmark_time.execution_time);
printf("\n\n");
}
/******************************************************************************************/
/************************************* OTHER UTILITY **************************************/
/******************************************************************************************/
// Searches all platforms for the first platform whose name
// contains the search string (case-insensitive).
cl_platform_id findPlatform(const char *platform_name_search) {
cl_int status;
std::string search = platform_name_search;
std::transform(search.begin(), search.end(), search.begin(), tolower);
// Get number of platforms.
cl_uint num_platforms;
status = clGetPlatformIDs(0, NULL, &num_platforms);
if(status != CL_SUCCESS) dump_error("Query for number of platforms failed", status);
// Get a list of all platform ids.
scoped_array<cl_platform_id> pids(num_platforms);
status = clGetPlatformIDs(num_platforms, pids, NULL);
if(status != CL_SUCCESS) dump_error("Query for all platform ids failed", status);
// For each platform, get name and compare against the search string.
for(unsigned i = 0; i < num_platforms; ++i) {
std::string name = getPlatformName(pids[i]);
// Convert to lower case.
std::transform(name.begin(), name.end(), name.begin(), tolower);
if(name.find(search) != std::string::npos) {
// Found!
return pids[i];
}
}
// No platform found.
return NULL;
}
// Returns the platform name.
std::string getPlatformName(cl_platform_id pid) {
cl_int status;
size_t sz;
status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
if(status != CL_SUCCESS) dump_error("Query for platform name size failed", status);
scoped_array<char> name(sz);
status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
if(status != CL_SUCCESS) dump_error("Query for platform name failed", status);
return name.get();
} |
Contributor
Author
|
Current benchmarks: old: 3278.61 ms old (no profiling): 3359.73 ms Looks like there is random variation. Going to do multiple runs and average out. Variation on regtest runs is in the 100s of ms. The improvement cannot be measured with this method |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.
…no instead of clock_gettime.