From 84c3b759cf6ff7dee68efb692079139f0f65aecc Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 13:53:09 +0300 Subject: [PATCH 01/12] initial --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 70 ++++++++++++++- .../oneapi/internal/execution_context_sycl.h | 86 +++++++------------ include/oneapi/internal/types_utils_cxx11.h | 45 +++++----- 3 files changed, 121 insertions(+), 80 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 0101646e60e..899b987bab3 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -228,6 +228,7 @@ services::Status SGDKernelOneAPI::compute(HostA const IndicesStatus indicesStatus = (batchIndices ? user : (batchSize < nTerms ? random : all)); services::SharedPtr > ntBatchIndices; + services::SharedPtr > ntBatchIndices2; if (indicesStatus == user || indicesStatus == random) { @@ -235,8 +236,9 @@ services::Status SGDKernelOneAPI::compute(HostA ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); } - NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - function->sumOfFunctionsParameter->batchIndices = ntBatchIndices; + NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; + auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, AllocationFlag::doAllocate); + auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, AllocationFlag::doAllocate); const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); @@ -288,23 +290,83 @@ services::Status SGDKernelOneAPI::compute(HostA *nProceededIterations = static_cast(nIter); + bool isSync = false; + bool isFirst = false; + bool isFirstInitialized = false; + bool isSecondInitialized = false; + services::internal::HostAppHelper host(pHost, 10); for (size_t epoch = startIteration; epoch < (startIteration + nIter); epoch++) { - if (epoch % L == 0 || epoch == startIteration) + if (epoch % (L << 1) == 0 || epoch == startIteration) { learningRate = learningRateArray[(epoch / L) % learningRateLength]; consCoeff = consCoeffsArray[(epoch / L) % consCoeffsLength]; if (indicesStatus == user || indicesStatus == random) { DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); + const int * pValues = nullptr; DAAL_CHECK_STATUS(status, rngTask.get(pValues)); ntBatchIndices->setArray(const_cast(pValues), ntBatchIndices->getNumberOfRows()); } + if (indicesStatus == user || indicesStatus == random) + { + DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); + + const int * pValues2 = nullptr; + DAAL_CHECK_STATUS(status, rngTask.get(pValues2)); + ntBatchIndices2->setArray(const_cast(pValues2), ntBatchIndices2->getNumberOfRows()); + } + + BlockDescriptor ntBatchIndicesBD; + DAAL_CHECK_STATUS(status, ntBatchIndices->getBlockOfRows(0, 1, ReadWriteMode::readOnly, ntBatchIndicesBD)); + const services::Buffer ntBatchIndicesBuffer = ntBatchIndicesBD.getBuffer(); + + BlockDescriptor ntBatchIndicesBDSycl; + DAAL_CHECK_STATUS(status, ntBatchIndicesSycl->getBlockOfRows(0, 1, ReadWriteMode::writeOnly, ntBatchIndicesBDSycl)); + const services::Buffer ntBatchIndicesBufferSycl = ntBatchIndicesBDSycl.getBuffer(); + + ctx.copy(ntBatchIndicesBufferSycl, 0, ntBatchIndicesBuffer, 0, batchSize, &status, isSync); + + BlockDescriptor ntBatchIndices2BD; + DAAL_CHECK_STATUS(status, ntBatchIndices2->getBlockOfRows(0, 1, ReadWriteMode::readOnly, ntBatchIndices2BD)); + const services::Buffer ntBatchIndices2Buffer = ntBatchIndices2BD.getBuffer(); + + BlockDescriptor ntBatchIndices2SyclBD; + DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, 1, ReadWriteMode::writeOnly, ntBatchIndices2SyclBD)); + const services::Buffer ntBatchIndices2SyclBuffer = ntBatchIndices2SyclBD.getBuffer(); + + ctx.copy(ntBatchIndices2SyclBuffer, 0, ntBatchIndices2Buffer, 0, batchSize, &status, isSync); + + isFirst = false; + isFirstInitialized = false; + isSecondInitialized = false; } - DAAL_CHECK_STATUS(status, function->computeNoThrow()); + if (epoch % L == 0) + { + isFirst = true; + } + + if (isFirst) + { + if (!isFirstInitialized) + { + sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; + isFirstInitialized = true; + } + DAAL_CHECK_STATUS(status, function->computeNoThrow()); + } + else + { + if (!isSecondInitialized) + { + sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; + isSecondInitialized = true; + } + DAAL_CHECK_STATUS(status, function->computeNoThrow()); + } if (host.isCancelled(status, 1)) { diff --git a/include/oneapi/internal/execution_context_sycl.h b/include/oneapi/internal/execution_context_sycl.h index 7b11af482e8..2b08434a516 100644 --- a/include/oneapi/internal/execution_context_sycl.h +++ b/include/oneapi/internal/execution_context_sycl.h @@ -16,20 +16,20 @@ *******************************************************************************/ #ifdef DAAL_SYCL_INTERFACE -#ifndef __DAAL_ONEAPI_INTERNAL_EXECUTION_CONTEXT_SYCL_H__ -#define __DAAL_ONEAPI_INTERNAL_EXECUTION_CONTEXT_SYCL_H__ + #ifndef __DAAL_ONEAPI_INTERNAL_EXECUTION_CONTEXT_SYCL_H__ + #define __DAAL_ONEAPI_INTERNAL_EXECUTION_CONTEXT_SYCL_H__ -#include -#include -#include -#include + #include + #include + #include + #include -#include "services/daal_string.h" -#include "oneapi/internal/execution_context.h" -#include "oneapi/internal/kernel_scheduler_sycl.h" -#include "oneapi/internal/math/blas_executor.h" -#include "oneapi/internal/math/lapack_executor.h" -#include "oneapi/internal/error_handling.h" + #include "services/daal_string.h" + #include "oneapi/internal/execution_context.h" + #include "oneapi/internal/kernel_scheduler_sycl.h" + #include "oneapi/internal/math/blas_executor.h" + #include "oneapi/internal/math/lapack_executor.h" + #include "oneapi/internal/error_handling.h" namespace daal { @@ -50,22 +50,13 @@ class OpenClKernelFactory : public Base, public ClKernelFactoryIface public: ProgramCacheEntry() : _program(nullptr) {} - ~ProgramCacheEntry() - { - delete _program; - } + ~ProgramCacheEntry() { delete _program; } - void setProgram(OpenClProgramRef *program) - { - _program = program; - } + void setProgram(OpenClProgramRef * program) { _program = program; } - OpenClProgramRef * getProgram() - { - return _program; - } + OpenClProgramRef * getProgram() { return _program; } - const char* getName(services::Status * status = nullptr) + const char * getName(services::Status * status = nullptr) { if (!_program) { @@ -90,25 +81,20 @@ class OpenClKernelFactory : public Base, public ClKernelFactoryIface ~KernelCacheEntry() {} - void setKernel(KernelPtr kernel, const char *name) + void setKernel(KernelPtr kernel, const char * name) { - _name = name; + _name = name; _kernel = kernel; } - KernelPtr getKernel() - { - return _kernel; - } + KernelPtr getKernel() { return _kernel; } - const char* getName() - { - return _name.c_str(); - } + const char * getName() { return _name.c_str(); } }; public: - explicit OpenClKernelFactory(cl::sycl::queue & deviceQueue) : _clProgramRef(nullptr), _executionTarget(ExecutionTargetIds::unspecified), _deviceQueue(deviceQueue) + explicit OpenClKernelFactory(cl::sycl::queue & deviceQueue) + : _clProgramRef(nullptr), _executionTarget(ExecutionTargetIds::unspecified), _deviceQueue(deviceQueue) {} void build(ExecutionTargetId target, const char * key, const char * program, const char * options = "", @@ -131,8 +117,8 @@ class OpenClKernelFactory : public Base, public ClKernelFactoryIface } else { - _clProgramCache[id].setProgram(new OpenClProgramRef(_deviceQueue.get_context().get(), - _deviceQueue.get_device().get(), key, program, options, status)); + _clProgramCache[id].setProgram( + new OpenClProgramRef(_deviceQueue.get_context().get(), _deviceQueue.get_device().get(), key, program, options, status)); if (status != nullptr && !status->ok()) { return; @@ -169,14 +155,13 @@ class OpenClKernelFactory : public Base, public ClKernelFactoryIface { return KernelPtr(); } - kernelPtr = KernelPtr(new OpenClKernel(_executionTarget, *_clProgramRef, kernelRef)); + kernelPtr = KernelPtr(new OpenClKernel(_executionTarget, *_clProgramRef, kernelRef)); _kernelCache[id].setKernel(kernelPtr, kernelName); } return kernelPtr; } - ~OpenClKernelFactory() DAAL_C11_OVERRIDE - {} + ~OpenClKernelFactory() DAAL_C11_OVERRIDE {} protected: uint64_t hash(const char * key) @@ -293,14 +278,14 @@ class SyclExecutionContextImpl : public Base, public ExecutionContextIface } } - void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, - services::Status * status = nullptr) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status = nullptr, + isSync = true) DAAL_C11_OVERRIDE { DAAL_ASSERT(dest.type() == src.type()); // TODO: Thread safe? try { - BufferCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count); + BufferCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count, isSync); } catch (cl::sycl::exception const & e) { @@ -325,20 +310,15 @@ class SyclExecutionContextImpl : public Base, public ExecutionContextIface InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE { return _infoDevice; } - void copy(UniversalBuffer dest, - size_t desOffset, - void *src, - size_t srcOffset, - size_t count, - services::Status *status = nullptr) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, + services::Status * status = nullptr) DAAL_C11_OVERRIDE { // TODO: Thread safe? try { - ArrayCopier::copy(_deviceQueue, dest, - desOffset, src, srcOffset, count); + ArrayCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count); } - catch (cl::sycl::exception const &e) + catch (cl::sycl::exception const & e) { convertSyclExceptionToStatus(e, status); } diff --git a/include/oneapi/internal/types_utils_cxx11.h b/include/oneapi/internal/types_utils_cxx11.h index 0a3f424bc6d..92b5937c898 100644 --- a/include/oneapi/internal/types_utils_cxx11.h +++ b/include/oneapi/internal/types_utils_cxx11.h @@ -77,9 +77,11 @@ class BufferCopier UniversalBuffer & srcUnivers; size_t srcOffset; size_t count; + bool isSync; - explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, UniversalBuffer & src, size_t srcOffset, size_t count) - : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcUnivers(src), srcOffset(srcOffset), count(count) + explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, UniversalBuffer & src, size_t srcOffset, size_t count, + bool isSync = true) + : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcUnivers(src), srcOffset(srcOffset), count(count), isSync(isSync) {} template @@ -92,14 +94,18 @@ class BufferCopier auto dst_acc = dst.template get_access(cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(dstOffset)); cgh.copy(src_acc, dst_acc); }); - event.wait(); + if (isSync) + { + event.wait(); + } } }; public: - static void copy(cl::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, UniversalBuffer & src, size_t srcOffset, size_t count) + static void copy(cl::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, UniversalBuffer & src, size_t srcOffset, size_t count, + bool isSync = true) { - Execute op(queue, dest, dstOffset, src, srcOffset, count); + Execute op(queue, dest, dstOffset, src, srcOffset, count, isSync); TypeDispatcher::dispatch(dest.type(), op); } }; @@ -113,28 +119,24 @@ class ArrayCopier private: struct Execute { - cl::sycl::queue &queue; - UniversalBuffer &dstUnivers; + cl::sycl::queue & queue; + UniversalBuffer & dstUnivers; size_t dstOffset; - void *srcArray; + void * srcArray; size_t srcOffset; size_t count; - explicit Execute(cl::sycl::queue &queue, - UniversalBuffer &dst, size_t desOffset, - void *src, size_t srcOffset, - size_t count) : queue(queue), dstUnivers(dst), - dstOffset(desOffset), srcArray(src), - srcOffset(srcOffset), count(count) { } + explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, void * src, size_t srcOffset, size_t count) + : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcArray(src), srcOffset(srcOffset), count(count) + {} template void operator()(Typelist) { - auto src = (T*)srcArray; - auto dst = dstUnivers.get().toSycl(); - cl::sycl::event event = queue.submit([&](cl::sycl::handler &cgh) { - auto dst_acc = dst.template get_access( - cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(dstOffset)); + auto src = (T *)srcArray; + auto dst = dstUnivers.get().toSycl(); + cl::sycl::event event = queue.submit([&](cl::sycl::handler & cgh) { + auto dst_acc = dst.template get_access(cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(dstOffset)); cgh.copy(src, dst_acc); }); event.wait(); @@ -142,10 +144,7 @@ class ArrayCopier }; public: - static void copy(cl::sycl::queue &queue, - UniversalBuffer &dest, size_t dstOffset, - void *src, size_t srcOffset, - size_t count) + static void copy(cl::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, void * src, size_t srcOffset, size_t count) { Execute op(queue, dest, dstOffset, src, srcOffset, count); TypeDispatcher::dispatch(dest.type(), op); From 96f43ee7e2b6a3a6b9def60f7bd365560203e302 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 14:47:20 +0300 Subject: [PATCH 02/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 8 +++---- include/oneapi/internal/execution_context.h | 24 ++++++++----------- .../oneapi/internal/execution_context_sycl.h | 2 +- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 899b987bab3..b341205243e 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -237,8 +237,8 @@ services::Status SGDKernelOneAPI::compute(HostA } NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, AllocationFlag::doAllocate); - auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, AllocationFlag::doAllocate); + auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); @@ -353,7 +353,7 @@ services::Status SGDKernelOneAPI::compute(HostA { if (!isFirstInitialized) { - sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; + function->sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; isFirstInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); @@ -362,7 +362,7 @@ services::Status SGDKernelOneAPI::compute(HostA { if (!isSecondInitialized) { - sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; + function->sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; isSecondInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); diff --git a/include/oneapi/internal/execution_context.h b/include/oneapi/internal/execution_context.h index 0293356df09..3da9c586b39 100644 --- a/include/oneapi/internal/execution_context.h +++ b/include/oneapi/internal/execution_context.h @@ -339,15 +339,16 @@ class ExecutionContextIface virtual void syrk(math::UpLo upper_lower, math::Transpose trans, size_t n, size_t k, double alpha, const UniversalBuffer & a_buffer, size_t lda, size_t offsetA, double beta, UniversalBuffer & c_buffer, size_t ldc, size_t offsetC, services::Status * status = NULL) = 0; - virtual void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, - const UniversalBuffer y_buffer, const int incy, services::Status * status = NULL) = 0; + virtual void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, const UniversalBuffer y_buffer, + const int incy, services::Status * status = NULL) = 0; virtual void potrf(math::UpLo uplo, size_t n, UniversalBuffer & a_buffer, size_t lda, services::Status * status = NULL) = 0; virtual void potrs(math::UpLo uplo, size_t n, size_t ny, UniversalBuffer & a_buffer, size_t lda, UniversalBuffer & b_buffer, size_t ldb, services::Status * status = NULL) = 0; - virtual void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status) = 0; + virtual void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status, + bool isSync = true) = 0; virtual void fill(UniversalBuffer dest, double value, services::Status * status) = 0; @@ -357,7 +358,7 @@ class ExecutionContextIface virtual InfoDevice & getInfoDevice() = 0; - virtual void copy(UniversalBuffer dest, size_t desOffset, void *src, size_t srcOffset, size_t count, services::Status *status) = 0; + virtual void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status) = 0; }; /** @@ -414,8 +415,8 @@ class CpuExecutionContextImpl : public Base, public ExecutionContextIface services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } - void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, - const UniversalBuffer y_buffer, const int incy, services::Status * status = NULL) DAAL_C11_OVERRIDE + void axpy(const uint32_t n, const double a, const UniversalBuffer x_buffer, const int incx, const UniversalBuffer y_buffer, const int incy, + services::Status * status = NULL) DAAL_C11_OVERRIDE { services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } @@ -431,8 +432,8 @@ class CpuExecutionContextImpl : public Base, public ExecutionContextIface services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } - void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, - services::Status * status = NULL) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status = NULL, + bool isSync = true) DAAL_C11_OVERRIDE { services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } @@ -452,12 +453,7 @@ class CpuExecutionContextImpl : public Base, public ExecutionContextIface InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE { return _infoDevice; } - void copy(UniversalBuffer dest, - size_t desOffset, - void *src, - size_t srcOffset, - size_t count, - services::Status *status = NULL) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status = NULL) DAAL_C11_OVERRIDE { services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } diff --git a/include/oneapi/internal/execution_context_sycl.h b/include/oneapi/internal/execution_context_sycl.h index 2b08434a516..1b9b7fe400f 100644 --- a/include/oneapi/internal/execution_context_sycl.h +++ b/include/oneapi/internal/execution_context_sycl.h @@ -279,7 +279,7 @@ class SyclExecutionContextImpl : public Base, public ExecutionContextIface } void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status = nullptr, - isSync = true) DAAL_C11_OVERRIDE + bool isSync = true) DAAL_C11_OVERRIDE { DAAL_ASSERT(dest.type() == src.type()); // TODO: Thread safe? From 6c48bb4244843970493a8421e34f3d7947cecd53 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 16:39:27 +0300 Subject: [PATCH 03/12] fix --- .../sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i | 1 + 1 file changed, 1 insertion(+) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index b341205243e..578431a65c5 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -234,6 +234,7 @@ services::Status SGDKernelOneAPI::compute(HostA { // Replace by SyclNumericTable when will be RNG on GPU ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); + ntBatchIndices2 = HomogenNumericTableCPU::create(batchSize, 1, &status); } NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; From 629f85411bc2436ac8edac26807ae9b1c222880a Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 17:07:31 +0300 Subject: [PATCH 04/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 578431a65c5..719e7e195a6 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -233,13 +233,13 @@ services::Status SGDKernelOneAPI::compute(HostA if (indicesStatus == user || indicesStatus == random) { // Replace by SyclNumericTable when will be RNG on GPU - ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); + ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); ntBatchIndices2 = HomogenNumericTableCPU::create(batchSize, 1, &status); } NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); @@ -291,10 +291,10 @@ services::Status SGDKernelOneAPI::compute(HostA *nProceededIterations = static_cast(nIter); - bool isSync = false; - bool isFirst = false; - bool isFirstInitialized = false; - bool isSecondInitialized = false; + bool isSync = false; + bool isFirstPart = false; + bool isFirstPartInitialized = false; + bool isSecondPartInitialized = false; services::internal::HostAppHelper host(pHost, 10); for (size_t epoch = startIteration; epoch < (startIteration + nIter); epoch++) @@ -320,51 +320,55 @@ services::Status SGDKernelOneAPI::compute(HostA ntBatchIndices2->setArray(const_cast(pValues2), ntBatchIndices2->getNumberOfRows()); } - BlockDescriptor ntBatchIndicesBD; - DAAL_CHECK_STATUS(status, ntBatchIndices->getBlockOfRows(0, 1, ReadWriteMode::readOnly, ntBatchIndicesBD)); - const services::Buffer ntBatchIndicesBuffer = ntBatchIndicesBD.getBuffer(); + BlockDescriptor ntBatchIndicesBD; + DAAL_CHECK_STATUS(status, + ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, ntBatchIndicesBD)); + const services::Buffer ntBatchIndicesBuffer = ntBatchIndicesBD.getBuffer(); - BlockDescriptor ntBatchIndicesBDSycl; - DAAL_CHECK_STATUS(status, ntBatchIndicesSycl->getBlockOfRows(0, 1, ReadWriteMode::writeOnly, ntBatchIndicesBDSycl)); - const services::Buffer ntBatchIndicesBufferSycl = ntBatchIndicesBDSycl.getBuffer(); + BlockDescriptor ntBatchIndicesBDSycl; + DAAL_CHECK_STATUS( + status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, ntBatchIndicesBDSycl)); + const services::Buffer ntBatchIndicesBufferSycl = ntBatchIndicesBDSycl.getBuffer(); ctx.copy(ntBatchIndicesBufferSycl, 0, ntBatchIndicesBuffer, 0, batchSize, &status, isSync); - BlockDescriptor ntBatchIndices2BD; - DAAL_CHECK_STATUS(status, ntBatchIndices2->getBlockOfRows(0, 1, ReadWriteMode::readOnly, ntBatchIndices2BD)); - const services::Buffer ntBatchIndices2Buffer = ntBatchIndices2BD.getBuffer(); + BlockDescriptor ntBatchIndices2BD; + DAAL_CHECK_STATUS(status, + ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, ntBatchIndices2BD)); + const services::Buffer ntBatchIndices2Buffer = ntBatchIndices2BD.getBuffer(); - BlockDescriptor ntBatchIndices2SyclBD; - DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, 1, ReadWriteMode::writeOnly, ntBatchIndices2SyclBD)); - const services::Buffer ntBatchIndices2SyclBuffer = ntBatchIndices2SyclBD.getBuffer(); + BlockDescriptor ntBatchIndices2SyclBD; + DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, ntBatchIndices2Sycl->getNumberOfRows(), ReadWriteMode::writeOnly, + ntBatchIndices2SyclBD)); + const services::Buffer ntBatchIndices2SyclBuffer = ntBatchIndices2SyclBD.getBuffer(); ctx.copy(ntBatchIndices2SyclBuffer, 0, ntBatchIndices2Buffer, 0, batchSize, &status, isSync); - isFirst = false; - isFirstInitialized = false; - isSecondInitialized = false; + isFirstPart = false; + isFirstPartInitialized = false; + isSecondPartInitialized = false; } if (epoch % L == 0) { - isFirst = true; + isFirstPart = true; } - if (isFirst) + if (isFirstPart) { - if (!isFirstInitialized) + if (!isFirstPartInitialized) { function->sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; - isFirstInitialized = true; + isFirstPartInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); } else { - if (!isSecondInitialized) + if (!isSecondPartInitialized) { function->sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; - isSecondInitialized = true; + isSecondPartInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); } From 56fdf1d449392056f14035d15a526d3b5dafbc6a Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 19:37:12 +0300 Subject: [PATCH 05/12] fix --- include/oneapi/internal/execution_context.h | 10 ++++--- .../oneapi/internal/execution_context_sycl.h | 10 +++---- include/oneapi/internal/types_utils_cxx11.h | 30 +++++++++++++------ 3 files changed, 32 insertions(+), 18 deletions(-) diff --git a/include/oneapi/internal/execution_context.h b/include/oneapi/internal/execution_context.h index 3da9c586b39..1943a7b149c 100644 --- a/include/oneapi/internal/execution_context.h +++ b/include/oneapi/internal/execution_context.h @@ -350,7 +350,7 @@ class ExecutionContextIface virtual void copy(UniversalBuffer dest, size_t desOffset, UniversalBuffer src, size_t srcOffset, size_t count, services::Status * status, bool isSync = true) = 0; - virtual void fill(UniversalBuffer dest, double value, services::Status * status) = 0; + virtual void fill(UniversalBuffer dest, double value, services::Status * status, bool isSync = true) = 0; virtual UniversalBuffer allocate(TypeId type, size_t bufferSize, services::Status * status) = 0; @@ -358,7 +358,8 @@ class ExecutionContextIface virtual InfoDevice & getInfoDevice() = 0; - virtual void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status) = 0; + virtual void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status, + bool isSync = true) = 0; }; /** @@ -438,7 +439,7 @@ class CpuExecutionContextImpl : public Base, public ExecutionContextIface services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } - void fill(UniversalBuffer dest, double value, services::Status * status = NULL) DAAL_C11_OVERRIDE + void fill(UniversalBuffer dest, double value, services::Status * status = NULL, bool isSync = true) DAAL_C11_OVERRIDE { services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } @@ -453,7 +454,8 @@ class CpuExecutionContextImpl : public Base, public ExecutionContextIface InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE { return _infoDevice; } - void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status = NULL) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status = NULL, + bool isSync = true) DAAL_C11_OVERRIDE { services::internal::tryAssignStatus(status, services::ErrorMethodNotImplemented); } diff --git a/include/oneapi/internal/execution_context_sycl.h b/include/oneapi/internal/execution_context_sycl.h index 1b9b7fe400f..ce3bcfab455 100644 --- a/include/oneapi/internal/execution_context_sycl.h +++ b/include/oneapi/internal/execution_context_sycl.h @@ -293,12 +293,12 @@ class SyclExecutionContextImpl : public Base, public ExecutionContextIface } } - void fill(UniversalBuffer dest, double value, services::Status * status = nullptr) DAAL_C11_OVERRIDE + void fill(UniversalBuffer dest, double value, services::Status * status = nullptr, bool isSync = true) DAAL_C11_OVERRIDE { // TODO: Thread safe? try { - BufferFiller::fill(_deviceQueue, dest, value); + BufferFiller::fill(_deviceQueue, dest, value, isSync); } catch (cl::sycl::exception const & e) { @@ -310,13 +310,13 @@ class SyclExecutionContextImpl : public Base, public ExecutionContextIface InfoDevice & getInfoDevice() DAAL_C11_OVERRIDE { return _infoDevice; } - void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, - services::Status * status = nullptr) DAAL_C11_OVERRIDE + void copy(UniversalBuffer dest, size_t desOffset, void * src, size_t srcOffset, size_t count, services::Status * status = nullptr, + bool isSync = true) DAAL_C11_OVERRIDE { // TODO: Thread safe? try { - ArrayCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count); + ArrayCopier::copy(_deviceQueue, dest, desOffset, src, srcOffset, count, isSync); } catch (cl::sycl::exception const & e) { diff --git a/include/oneapi/internal/types_utils_cxx11.h b/include/oneapi/internal/types_utils_cxx11.h index 92b5937c898..7264f9ab3c3 100644 --- a/include/oneapi/internal/types_utils_cxx11.h +++ b/include/oneapi/internal/types_utils_cxx11.h @@ -125,9 +125,11 @@ class ArrayCopier void * srcArray; size_t srcOffset; size_t count; + bool isSync; - explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, void * src, size_t srcOffset, size_t count) - : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcArray(src), srcOffset(srcOffset), count(count) + explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dst, size_t desOffset, void * src, size_t srcOffset, size_t count, + bool isSync = true) + : queue(queue), dstUnivers(dst), dstOffset(desOffset), srcArray(src), srcOffset(srcOffset), count(count), isSync(isSync) {} template @@ -139,14 +141,18 @@ class ArrayCopier auto dst_acc = dst.template get_access(cgh, cl::sycl::range<1>(count), cl::sycl::id<1>(dstOffset)); cgh.copy(src, dst_acc); }); - event.wait(); + if (isSync) + { + event.wait(); + } } }; public: - static void copy(cl::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, void * src, size_t srcOffset, size_t count) + static void copy(cl::sycl::queue & queue, UniversalBuffer & dest, size_t dstOffset, void * src, size_t srcOffset, size_t count, + bool isSync = true) { - Execute op(queue, dest, dstOffset, src, srcOffset, count); + Execute op(queue, dest, dstOffset, src, srcOffset, count, isSync); TypeDispatcher::dispatch(dest.type(), op); } }; @@ -163,8 +169,11 @@ class BufferFiller cl::sycl::queue & queue; UniversalBuffer & dstUnivers; double value; + bool isSync; - explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dest, double value) : queue(queue), dstUnivers(dest), value(value) {} + explicit Execute(cl::sycl::queue & queue, UniversalBuffer & dest, double value, bool isSync = true) + : queue(queue), dstUnivers(dest), value(value), isSync(isSync) + {} template void operator()(Typelist) @@ -174,14 +183,17 @@ class BufferFiller auto acc = dst.template get_access(cgh); cgh.fill(acc, static_cast(value)); }); - event.wait(); + if (isSync) + { + event.wait(); + } } }; public: - static void fill(cl::sycl::queue & queue, UniversalBuffer & dest, double value) + static void fill(cl::sycl::queue & queue, UniversalBuffer & dest, double value, bool isSync = true) { - Execute op(queue, dest, value); + Execute op(queue, dest, value, isSync); TypeDispatcher::dispatch(dest.type(), op); } }; From ec5656557f9bdbc2c70bb4c4b3f589ca7602dc9c Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 20:38:32 +0300 Subject: [PATCH 06/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 33 ++++++++++--------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 719e7e195a6..7e991a4d51e 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -291,10 +291,10 @@ services::Status SGDKernelOneAPI::compute(HostA *nProceededIterations = static_cast(nIter); - bool isSync = false; - bool isFirstPart = false; - bool isFirstPartInitialized = false; - bool isSecondPartInitialized = false; + bool isSync = false; + bool isSecondPartOfIndices = false; + bool isFirstPartOfIndicesInitialized = false; + bool isSecondPartOfIndicesInitialized = false; services::internal::HostAppHelper host(pHost, 10); for (size_t epoch = startIteration; epoch < (startIteration + nIter); epoch++) @@ -303,6 +303,7 @@ services::Status SGDKernelOneAPI::compute(HostA { learningRate = learningRateArray[(epoch / L) % learningRateLength]; consCoeff = consCoeffsArray[(epoch / L) % consCoeffsLength]; + if (indicesStatus == user || indicesStatus == random) { DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); @@ -344,31 +345,31 @@ services::Status SGDKernelOneAPI::compute(HostA ctx.copy(ntBatchIndices2SyclBuffer, 0, ntBatchIndices2Buffer, 0, batchSize, &status, isSync); - isFirstPart = false; - isFirstPartInitialized = false; - isSecondPartInitialized = false; + isSecondPartOfIndices = false; + isFirstPartOfIndicesInitialized = false; + isSecondPartOfIndicesInitialized = false; } - if (epoch % L == 0) + if ((epoch % L == 0) && !(epoch == startIteration)) { - isFirstPart = true; + isSecondPartOfIndices = true; } - if (isFirstPart) + if (isSecondPartOfIndices) { - if (!isFirstPartInitialized) + if (!isSecondPartOfIndicesInitialized) { - function->sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; - isFirstPartInitialized = true; + function->sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; + isSecondPartOfIndicesInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); } else { - if (!isSecondPartInitialized) + if (!isFirstPartOfIndicesInitialized) { - function->sumOfFunctionsParameter->batchIndices = ntBatchIndices2Sycl; - isSecondPartInitialized = true; + function->sumOfFunctionsParameter->batchIndices = ntBatchIndicesSycl; + isFirstPartOfIndicesInitialized = true; } DAAL_CHECK_STATUS(status, function->computeNoThrow()); } From f53c62d2d45a0a3c91c7e5d7dd8ad02e1be31f9b Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 20:41:41 +0300 Subject: [PATCH 07/12] fix --- .../sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 7e991a4d51e..82606302a88 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -299,12 +299,12 @@ services::Status SGDKernelOneAPI::compute(HostA services::internal::HostAppHelper host(pHost, 10); for (size_t epoch = startIteration; epoch < (startIteration + nIter); epoch++) { - if (epoch % (L << 1) == 0 || epoch == startIteration) + if ((epoch % (L << 1) == 0) || (epoch == startIteration)) { learningRate = learningRateArray[(epoch / L) % learningRateLength]; consCoeff = consCoeffsArray[(epoch / L) % consCoeffsLength]; - if (indicesStatus == user || indicesStatus == random) + if ((indicesStatus == user) || (indicesStatus == random)) { DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); @@ -312,7 +312,7 @@ services::Status SGDKernelOneAPI::compute(HostA DAAL_CHECK_STATUS(status, rngTask.get(pValues)); ntBatchIndices->setArray(const_cast(pValues), ntBatchIndices->getNumberOfRows()); } - if (indicesStatus == user || indicesStatus == random) + if ((indicesStatus == user) || (indicesStatus == random)) { DAAL_ITTNOTIFY_SCOPED_TASK(generateUniform); From 1eb9942d7248f32f7d0709d7582eb70f9e66d27d Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Sun, 16 Feb 2020 21:04:41 +0300 Subject: [PATCH 08/12] fix --- .../sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 82606302a88..317f0e6c40d 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -333,6 +333,9 @@ services::Status SGDKernelOneAPI::compute(HostA ctx.copy(ntBatchIndicesBufferSycl, 0, ntBatchIndicesBuffer, 0, batchSize, &status, isSync); + ntBatchIndices->releaseBlockOfRows(ntBatchIndicesBD); + ntBatchIndicesSycl->releaseBlockOfRows(ntBatchIndicesBDSycl); + BlockDescriptor ntBatchIndices2BD; DAAL_CHECK_STATUS(status, ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, ntBatchIndices2BD)); @@ -345,6 +348,9 @@ services::Status SGDKernelOneAPI::compute(HostA ctx.copy(ntBatchIndices2SyclBuffer, 0, ntBatchIndices2Buffer, 0, batchSize, &status, isSync); + ntBatchIndices2->releaseBlockOfRows(ntBatchIndices2BD); + ntBatchIndices2Sycl->releaseBlockOfRows(ntBatchIndices2SyclBD); + isSecondPartOfIndices = false; isFirstPartOfIndicesInitialized = false; isSecondPartOfIndicesInitialized = false; From 1cb029232af453e266a4f9a50193fba0ade0b992 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 17 Feb 2020 08:46:22 +0300 Subject: [PATCH 09/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 317f0e6c40d..af8a5e9ea2f 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -238,8 +238,9 @@ services::Status SGDKernelOneAPI::compute(HostA } NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + + auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); @@ -321,35 +322,34 @@ services::Status SGDKernelOneAPI::compute(HostA ntBatchIndices2->setArray(const_cast(pValues2), ntBatchIndices2->getNumberOfRows()); } - BlockDescriptor ntBatchIndicesBD; - DAAL_CHECK_STATUS(status, - ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, ntBatchIndicesBD)); - const services::Buffer ntBatchIndicesBuffer = ntBatchIndicesBD.getBuffer(); + BlockDescriptor batchIndicesBD; + DAAL_CHECK_STATUS(status, ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, batchIndicesBD)); + const services::Buffer batchIndicesBuffer = batchIndicesBD.getBuffer(); - BlockDescriptor ntBatchIndicesBDSycl; + BlockDescriptor batchIndicesSyclBD; DAAL_CHECK_STATUS( - status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, ntBatchIndicesBDSycl)); - const services::Buffer ntBatchIndicesBufferSycl = ntBatchIndicesBDSycl.getBuffer(); + status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, batchIndicesSyclBD)); + const services::Buffer batchIndicesSyclBuffer = batchIndicesSyclBD.getBuffer(); - ctx.copy(ntBatchIndicesBufferSycl, 0, ntBatchIndicesBuffer, 0, batchSize, &status, isSync); + ctx.copy(batchIndicesSyclBuffer, 0, batchIndicesBuffer, 0, batchSize, &status, isSync); - ntBatchIndices->releaseBlockOfRows(ntBatchIndicesBD); - ntBatchIndicesSycl->releaseBlockOfRows(ntBatchIndicesBDSycl); + ntBatchIndices->releaseBlockOfRows(batchIndicesBD); + ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); - BlockDescriptor ntBatchIndices2BD; + BlockDescriptor batchIndices2BD; DAAL_CHECK_STATUS(status, - ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, ntBatchIndices2BD)); - const services::Buffer ntBatchIndices2Buffer = ntBatchIndices2BD.getBuffer(); + ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, batchIndices2BD)); + const services::Buffer batchIndices2Buffer = batchIndices2BD.getBuffer(); - BlockDescriptor ntBatchIndices2SyclBD; + BlockDescriptor batchIndices2SyclBD; DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, ntBatchIndices2Sycl->getNumberOfRows(), ReadWriteMode::writeOnly, - ntBatchIndices2SyclBD)); - const services::Buffer ntBatchIndices2SyclBuffer = ntBatchIndices2SyclBD.getBuffer(); + batchIndices2SyclBD)); + const services::Buffer batchIndices2SyclBuffer = batchIndices2SyclBD.getBuffer(); - ctx.copy(ntBatchIndices2SyclBuffer, 0, ntBatchIndices2Buffer, 0, batchSize, &status, isSync); + ctx.copy(batchIndices2SyclBuffer, 0, batchIndices2Buffer, 0, batchSize, &status, isSync); - ntBatchIndices2->releaseBlockOfRows(ntBatchIndices2BD); - ntBatchIndices2Sycl->releaseBlockOfRows(ntBatchIndices2SyclBD); + ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); + ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); isSecondPartOfIndices = false; isFirstPartOfIndicesInitialized = false; From f385c40d91a1c0cc035927af5c53addf8c396bfc Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 17 Feb 2020 14:45:18 +0300 Subject: [PATCH 10/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 74 +++++++++++-------- 1 file changed, 43 insertions(+), 31 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index af8a5e9ea2f..7ea8221a27f 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -229,19 +229,21 @@ services::Status SGDKernelOneAPI::compute(HostA const IndicesStatus indicesStatus = (batchIndices ? user : (batchSize < nTerms ? random : all)); services::SharedPtr > ntBatchIndices; services::SharedPtr > ntBatchIndices2; + services::SharedPtr > ntBatchIndicesSycl; + services::SharedPtr > ntBatchIndices2Sycl; + BlockDescriptor batchIndicesBD; + BlockDescriptor batchIndicesSyclBD; + BlockDescriptor batchIndices2BD; + BlockDescriptor batchIndices2SyclBD; if (indicesStatus == user || indicesStatus == random) { - // Replace by SyclNumericTable when will be RNG on GPU ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); ntBatchIndices2 = HomogenNumericTableCPU::create(batchSize, 1, &status); } NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; - auto ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - auto ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); services::Buffer prevWorkValueBuff = prevWorkValueU.get(); @@ -312,6 +314,20 @@ services::Status SGDKernelOneAPI::compute(HostA const int * pValues = nullptr; DAAL_CHECK_STATUS(status, rngTask.get(pValues)); ntBatchIndices->setArray(const_cast(pValues), ntBatchIndices->getNumberOfRows()); + + DAAL_CHECK_STATUS(status, + ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, batchIndicesBD)); + const services::Buffer batchIndicesBuffer = batchIndicesBD.getBuffer(); + + if (!ntBatchIndicesSycl.get()) + { + ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + } + DAAL_CHECK_STATUS(status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, + batchIndicesSyclBD)); + const services::Buffer batchIndicesSyclBuffer = batchIndicesSyclBD.getBuffer(); + + ctx.copy(batchIndicesSyclBuffer, 0, batchIndicesBuffer, 0, batchSize, &status, isSync); } if ((indicesStatus == user) || (indicesStatus == random)) { @@ -320,36 +336,21 @@ services::Status SGDKernelOneAPI::compute(HostA const int * pValues2 = nullptr; DAAL_CHECK_STATUS(status, rngTask.get(pValues2)); ntBatchIndices2->setArray(const_cast(pValues2), ntBatchIndices2->getNumberOfRows()); - } - - BlockDescriptor batchIndicesBD; - DAAL_CHECK_STATUS(status, ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, batchIndicesBD)); - const services::Buffer batchIndicesBuffer = batchIndicesBD.getBuffer(); - - BlockDescriptor batchIndicesSyclBD; - DAAL_CHECK_STATUS( - status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, batchIndicesSyclBD)); - const services::Buffer batchIndicesSyclBuffer = batchIndicesSyclBD.getBuffer(); - - ctx.copy(batchIndicesSyclBuffer, 0, batchIndicesBuffer, 0, batchSize, &status, isSync); - - ntBatchIndices->releaseBlockOfRows(batchIndicesBD); - ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); - BlockDescriptor batchIndices2BD; - DAAL_CHECK_STATUS(status, - ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, batchIndices2BD)); - const services::Buffer batchIndices2Buffer = batchIndices2BD.getBuffer(); + DAAL_CHECK_STATUS(status, + ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, batchIndices2BD)); + const services::Buffer batchIndices2Buffer = batchIndices2BD.getBuffer(); - BlockDescriptor batchIndices2SyclBD; - DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, ntBatchIndices2Sycl->getNumberOfRows(), ReadWriteMode::writeOnly, - batchIndices2SyclBD)); - const services::Buffer batchIndices2SyclBuffer = batchIndices2SyclBD.getBuffer(); - - ctx.copy(batchIndices2SyclBuffer, 0, batchIndices2Buffer, 0, batchSize, &status, isSync); + if (!ntBatchIndices2Sycl.get()) + { + ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + } + DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, ntBatchIndices2Sycl->getNumberOfRows(), ReadWriteMode::writeOnly, + batchIndices2SyclBD)); + const services::Buffer batchIndices2SyclBuffer = batchIndices2SyclBD.getBuffer(); - ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); - ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); + ctx.copy(batchIndices2SyclBuffer, 0, batchIndices2Buffer, 0, batchSize, &status, isSync); + } isSecondPartOfIndices = false; isFirstPartOfIndicesInitialized = false; @@ -406,6 +407,17 @@ services::Status SGDKernelOneAPI::compute(HostA } DAAL_CHECK_STATUS(status, makeStep(argumentSize, prevWorkValueBuff, gradientBuff, workValueBuff, learningRate, consCoeff)); nProceededIters++; + + if ((epoch % (L << 1) == 0) || (epoch == startIteration)) + { + if ((indicesStatus == user) || (indicesStatus == random)) + { + ntBatchIndices->releaseBlockOfRows(batchIndicesBD); + ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); + ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); + ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); + } + } } if (lastIterationResult) From 5b65581396cb72342c37e7f6b3defd6260984960 Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Mon, 17 Feb 2020 23:49:28 +0300 Subject: [PATCH 11/12] fix --- .../oneapi/sgd_dense_minibatch_oneapi_impl.i | 33 +++++++------------ include/oneapi/internal/types_utils_cxx11.h | 12 +++++++ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 7ea8221a27f..07c7c86a653 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -238,11 +238,14 @@ services::Status SGDKernelOneAPI::compute(HostA if (indicesStatus == user || indicesStatus == random) { - ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); - ntBatchIndices2 = HomogenNumericTableCPU::create(batchSize, 1, &status); + ntBatchIndices = HomogenNumericTableCPU::create(batchSize, 1, &status); + ntBatchIndices2 = HomogenNumericTableCPU::create(batchSize, 1, &status); + ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); + ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); } - NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; + NumericTablePtr previousBatchIndices = function->sumOfFunctionsParameter->batchIndices; + function->sumOfFunctionsParameter->batchIndices = ntBatchIndices; const TypeIds::Id idType = TypeIds::id(); UniversalBuffer prevWorkValueU = ctx.allocate(idType, argumentSize, &status); @@ -319,10 +322,6 @@ services::Status SGDKernelOneAPI::compute(HostA ntBatchIndices->getBlockOfRows(0, ntBatchIndices->getNumberOfRows(), ReadWriteMode::readOnly, batchIndicesBD)); const services::Buffer batchIndicesBuffer = batchIndicesBD.getBuffer(); - if (!ntBatchIndicesSycl.get()) - { - ntBatchIndicesSycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - } DAAL_CHECK_STATUS(status, ntBatchIndicesSycl->getBlockOfRows(0, ntBatchIndicesSycl->getNumberOfRows(), ReadWriteMode::writeOnly, batchIndicesSyclBD)); const services::Buffer batchIndicesSyclBuffer = batchIndicesSyclBD.getBuffer(); @@ -341,10 +340,6 @@ services::Status SGDKernelOneAPI::compute(HostA ntBatchIndices2->getBlockOfRows(0, ntBatchIndices2->getNumberOfRows(), ReadWriteMode::readOnly, batchIndices2BD)); const services::Buffer batchIndices2Buffer = batchIndices2BD.getBuffer(); - if (!ntBatchIndices2Sycl.get()) - { - ntBatchIndices2Sycl = SyclHomogenNumericTable::create(batchSize, 1, NumericTableIface::doAllocate); - } DAAL_CHECK_STATUS(status, ntBatchIndices2Sycl->getBlockOfRows(0, ntBatchIndices2Sycl->getNumberOfRows(), ReadWriteMode::writeOnly, batchIndices2SyclBD)); const services::Buffer batchIndices2SyclBuffer = batchIndices2SyclBD.getBuffer(); @@ -359,6 +354,11 @@ services::Status SGDKernelOneAPI::compute(HostA if ((epoch % L == 0) && !(epoch == startIteration)) { + ntBatchIndices->releaseBlockOfRows(batchIndicesBD); + ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); + ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); + ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); + isSecondPartOfIndices = true; } @@ -407,17 +407,6 @@ services::Status SGDKernelOneAPI::compute(HostA } DAAL_CHECK_STATUS(status, makeStep(argumentSize, prevWorkValueBuff, gradientBuff, workValueBuff, learningRate, consCoeff)); nProceededIters++; - - if ((epoch % (L << 1) == 0) || (epoch == startIteration)) - { - if ((indicesStatus == user) || (indicesStatus == random)) - { - ntBatchIndices->releaseBlockOfRows(batchIndicesBD); - ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); - ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); - ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); - } - } } if (lastIterationResult) diff --git a/include/oneapi/internal/types_utils_cxx11.h b/include/oneapi/internal/types_utils_cxx11.h index 7264f9ab3c3..8209bb108cd 100644 --- a/include/oneapi/internal/types_utils_cxx11.h +++ b/include/oneapi/internal/types_utils_cxx11.h @@ -98,6 +98,10 @@ class BufferCopier { event.wait(); } + else + { + dst.set_write_back(false); + } } }; @@ -145,6 +149,10 @@ class ArrayCopier { event.wait(); } + else + { + dst.set_write_back(false); + } } }; @@ -187,6 +195,10 @@ class BufferFiller { event.wait(); } + else + { + dst.set_write_back(false); + } } }; From dc8876fcd51317b274c49cd741f2b34e7477695b Mon Sep 17 00:00:00 2001 From: "Igoshev, Yaroslav" Date: Tue, 18 Feb 2020 16:01:18 +0300 Subject: [PATCH 12/12] fix --- .../sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i index 07c7c86a653..d9545f9ea3d 100644 --- a/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i +++ b/algorithms/kernel/optimization_solver/sgd/oneapi/sgd_dense_minibatch_oneapi_impl.i @@ -354,11 +354,6 @@ services::Status SGDKernelOneAPI::compute(HostA if ((epoch % L == 0) && !(epoch == startIteration)) { - ntBatchIndices->releaseBlockOfRows(batchIndicesBD); - ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); - ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); - ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); - isSecondPartOfIndices = true; } @@ -407,6 +402,14 @@ services::Status SGDKernelOneAPI::compute(HostA } DAAL_CHECK_STATUS(status, makeStep(argumentSize, prevWorkValueBuff, gradientBuff, workValueBuff, learningRate, consCoeff)); nProceededIters++; + + if ((epoch % (L << 1) == (L << 1) - 1) && !(epoch == startIteration)) + { + ntBatchIndices->releaseBlockOfRows(batchIndicesBD); + ntBatchIndicesSycl->releaseBlockOfRows(batchIndicesSyclBD); + ntBatchIndices2->releaseBlockOfRows(batchIndices2BD); + ntBatchIndices2Sycl->releaseBlockOfRows(batchIndices2SyclBD); + } } if (lastIterationResult)