Change implementation of acl_hal_mmd_get_timestamp() to use std::chro… by huangbri · Pull Request #41 · intel/fpga-runtime-for-opencl

huangbri · 2021-12-10T23:02:59Z

…no instead of clock_gettime.

huangbri · 2022-03-18T06:23:02Z

Testing difference in kernel runtime using following design

#include <iostream>
#include <chrono>
#include "ocl.h"

using namespace aocl_utils;

#ifndef ITERATION_TIME
#define ITERATION_TIME 5
#endif

// ACL runtime configuration
static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_int status;
static cl_program program;
static cl_kernel kernel_dummy;
static cl_mem kernel_buf;

static Timer timer;
static cl_event evt;

struct BenchmarkTime get_exec_time_us(cl_event evt);

// free the resources allocated during initialization
static void freeResources() {
    if (queue)          clReleaseCommandQueue(queue);
    if (context)        clReleaseContext(context);
    if (kernel_dummy)   clReleaseKernel(kernel_dummy);
    if (program)        clReleaseProgram(program);
    if(kernel_buf)      clReleaseMemObject(kernel_buf);
}


static void dump_error(const char* str, cl_int status) {
    printf("%s\n", str);
    printf("Error code: %d\n", status);
    freeResources();
    exit(-1);
}


/******************************************************************************************/
/*************************************** MAIN LOGIC ***************************************/
/******************************************************************************************/

// Initiate the platform, device, context, queue, and other global variables
void device_init()
{
    char buf[1000];
    cl_uint num_devices;
    size_t bin_file_len = 0;
    const unsigned char* bin_file;
    cl_int bin_status = 0;

    printf("Initializing OpenCL\n");

#ifdef EMULATOR_FAST
    platform = findPlatform("Intel(R) FPGA Emulation Platform for OpenCL(TM)");
#else
    platform = findPlatform("Intel(R) FPGA SDK for OpenCL(TM)");
#endif
    if(platform == NULL) {
        printf("ERROR: Unable to find Intel(R) FPGA OpenCL platform.\n");
        exit(-1);
    }

    // get the device ID
    status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, &num_devices);
    if(status != CL_SUCCESS) dump_error("Failed clGetDeviceIDs.", status);
    if(num_devices < 1) {
        printf("Found %d devices!\n", num_devices);
        freeResources();
        exit(-1);
    }
    status = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(buf), (void*)&buf, NULL);
    printf("Using Device with name: %s\n",buf);
    status = clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(buf), (void*)&buf, NULL);
    printf("Using Device from vendor: %s\n",buf);

    // create a context
    context = clCreateContext(0, 1, &device, NULL, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateContext.", status);

    // create a command queue
    queue = clCreateCommandQueue(context, device, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateCommandQueue.", status);

    // load the aocx
    printf("Loading dummy.aocx...\n");
    bin_file = load_file("dummy.aocx", &bin_file_len);

    // create the program
    program = clCreateProgramWithBinary(context, 1, &device, &bin_file_len, &bin_file, &bin_status, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateProgramWithBinary.", status);

    // build the program
    printf("build dummy_k program\n"); fflush(stdout);
    status = clBuildProgram(program, 1, &device, "", 0, 0);
    if(status != CL_SUCCESS) dump_error("Failed clBuildProgram.", status);

    // create the kernel
    printf("create kernel dummy_k\n"); fflush(stdout);
    kernel_dummy = clCreateKernel(program, "dummy_k", &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateKernel.", status);

    // create the input buffer
    kernel_buf = clCreateBuffer(context, CL_MEM_READ_WRITE, 1000, NULL, &status);
    if(status != CL_SUCCESS) dump_error("Failed clCreateBuffer.", status);
    printf("\n\n");
}


// Repeatedly run get_enqueueTask_time
void run_test()
{
    // Empty queue just in case
    clFinish(queue);
    for (int i = 0; i < 100; i++) {
    #ifdef UNRELEASED_AT_QUEUE_START
        clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
    #else
        clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
    #endif
        clFinish(queue);
    }
    

    const auto tstart = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < 100000; i++) {
    #ifdef UNRELEASED_AT_QUEUE_START
      clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
    #else
      clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
    #endif
      clFinish(queue);
    }
    const auto tend = std::chrono::high_resolution_clock::now();

    // compute the average kernel latency across the measured iterations
    const std::chrono::duration<double, std::milli> time_span = tend - tstart;


    std::cout << "Overall Time: " << time_span.count() << "\n";
}





/******************************************************************************************/
/*********************************** MAIN LOGIC HELPER ************************************/
/******************************************************************************************/

// I can set the status and do error check inside the timer, but that extra time might skew the result.
// Since I don't know how fast these operations will be, I'll not do that. All operations are assumed to execute correctly
struct BenchmarkTime get_enqueueTask_time(bool verbose)
{
    timer.start();
// Special case: Will not pass event in. As a result, we can't measure queue-submit-start time
#ifdef UNRELEASED_AT_QUEUE_START
    clEnqueueTask(queue, kernel_dummy, 0, NULL, NULL);
#else
    clEnqueueTask(queue, kernel_dummy, 0, NULL, &evt);
#endif
    clFinish(queue);
    timer.stop();

    struct BenchmarkTime enqueueTask_time;
#ifndef UNRELEASED_AT_QUEUE_START
    enqueueTask_time = get_exec_time_us(evt);
#endif
    enqueueTask_time.execution_time = timer.get_time_s() * 1000000;

    if (verbose)
        print_benchmark_time(enqueueTask_time);
    return enqueueTask_time;
}


unsigned char* load_file(const char* filename, size_t* size_ret)
{
    FILE* fp = fopen(filename,"rb");
    long size;
    unsigned char *buffer;

    if (!fp) {
      return NULL;
    }
    fseek(fp, 0, SEEK_END);
    size = ftell(fp);
    fseek(fp, 0, SEEK_SET);
    buffer = (unsigned char *) malloc(size);
    fread(buffer, size, 1, fp);
    fclose(fp);
    *size_ret = size;
    return buffer;
}

/******************************************************************************************/
/******************************** BENCHMARK_TIME UTILITY **********************************/
/******************************************************************************************/
struct BenchmarkTime get_exec_time_us(cl_event evt)
{
    struct BenchmarkTime res;
    cl_ulong kernelEventQueued;
    cl_ulong kernelEventSubmit;
    cl_ulong kernelEventStart;
    cl_ulong kernelEventEnd;
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_QUEUED, sizeof(unsigned long long), &kernelEventQueued, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_SUBMIT, sizeof(unsigned long long), &kernelEventSubmit, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_START, sizeof(unsigned long long), &kernelEventStart, NULL);
    clGetEventProfilingInfo(evt, CL_PROFILING_COMMAND_END, sizeof(unsigned long long), &kernelEventEnd, NULL);

    res.queued_submit_time = (float)(kernelEventSubmit - kernelEventQueued) / 1000;
    res.submit_start_time = (float)(kernelEventStart - kernelEventSubmit) / 1000;
    res.start_end_time = (float)(kernelEventEnd - kernelEventStart) / 1000;

    return res;
}


void print_benchmark_time(struct BenchmarkTime benchmark_time) {
    float cumul_time = 0.0f;
    cumul_time += benchmark_time.queued_submit_time;
    printf("        Queued-Submit time: %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.queued_submit_time, cumul_time);
    cumul_time += benchmark_time.submit_start_time;
    printf("        Submit-Start time:  %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.submit_start_time, cumul_time);
    cumul_time += benchmark_time.start_end_time;
    printf("        Start-End time:     %9.3lf us,  cumulative time: %9.3lf us\n", benchmark_time.start_end_time, cumul_time);

    printf("        Execution time:     %9.3lf us\n", benchmark_time.execution_time);
    printf("\n\n");
}


/******************************************************************************************/
/************************************* OTHER UTILITY **************************************/
/******************************************************************************************/

// Searches all platforms for the first platform whose name
// contains the search string (case-insensitive).
cl_platform_id findPlatform(const char *platform_name_search) {
    cl_int status;

    std::string search = platform_name_search;
    std::transform(search.begin(), search.end(), search.begin(), tolower);

    // Get number of platforms.
    cl_uint num_platforms;
    status = clGetPlatformIDs(0, NULL, &num_platforms);
    if(status != CL_SUCCESS) dump_error("Query for number of platforms failed", status);

    // Get a list of all platform ids.
    scoped_array<cl_platform_id> pids(num_platforms);
    status = clGetPlatformIDs(num_platforms, pids, NULL);
    if(status != CL_SUCCESS) dump_error("Query for all platform ids failed", status);

    // For each platform, get name and compare against the search string.
    for(unsigned i = 0; i < num_platforms; ++i) {
        std::string name = getPlatformName(pids[i]);

        // Convert to lower case.
        std::transform(name.begin(), name.end(), name.begin(), tolower);

        if(name.find(search) != std::string::npos) {
        // Found!
        return pids[i];
        }
    }

    // No platform found.
    return NULL;
}


// Returns the platform name.
std::string getPlatformName(cl_platform_id pid) {
    cl_int status;

    size_t sz;
    status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, 0, NULL, &sz);
    if(status != CL_SUCCESS) dump_error("Query for platform name size failed", status);

    scoped_array<char> name(sz);
    status = clGetPlatformInfo(pid, CL_PLATFORM_NAME, sz, name, NULL);
    if(status != CL_SUCCESS) dump_error("Query for platform name failed", status);

    return name.get();
}

huangbri · 2022-03-18T06:28:04Z

Current benchmarks:

old: 3278.61 ms
new: 3457.42 ms

old (no profiling): 3359.73 ms
new (no profiling): 3171.21 ms

Looks like there is random variation. Going to do multiple runs and average out.

Variation on regtest runs is in the 100s of ms.
Change makes a 10^-4 ms improvement per enqueue, 10^1 ms improvement over 10^5 ms loops.

The improvement cannot be measured with this method

Change implementation of acl_hal_mmd_get_timestamp() to use std::chro…

5a57b05

…no instead of clock_gettime.

huangbri self-assigned this Dec 10, 2021

huangbri added this to the 2022.2 milestone Dec 10, 2021

pcolberg modified the milestones: 2022.2, 2022.3 Jan 20, 2022

pcolberg assigned pcolberg and unassigned huangbri Apr 30, 2022

pcolberg modified the milestones: 2022.3, 2022.4 Jun 4, 2022

pcolberg removed this from the 2023.0 milestone Oct 19, 2022

pcolberg added the enhancement New feature or request label Dec 8, 2022

pcolberg removed their assignment Mar 15, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Change implementation of acl_hal_mmd_get_timestamp() to use std::chro…#41

Change implementation of acl_hal_mmd_get_timestamp() to use std::chro…#41
huangbri wants to merge 1 commit intointel:mainfrom
huangbri:consistent-timer

huangbri commented Dec 10, 2021

Uh oh!

huangbri commented Mar 18, 2022 •

edited by pcolberg

Loading

Uh oh!

huangbri commented Mar 18, 2022 •

edited by pcolberg

Loading

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

Conversation

huangbri commented Dec 10, 2021

Uh oh!

huangbri commented Mar 18, 2022 • edited by pcolberg Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

huangbri commented Mar 18, 2022 • edited by pcolberg Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

2 participants

huangbri commented Mar 18, 2022 •

edited by pcolberg

Loading

huangbri commented Mar 18, 2022 •

edited by pcolberg

Loading