This repository was archived by the owner on Apr 8, 2024. It is now read-only.
1 change: 1 addition & 0 deletions README.md
@@ -190,6 +190,7 @@ Usage:
--pow-diff or -d <0-256> count of leading zero bits in target D value [default - 16]
--srand-seed or -ss <unsigned int> set srand seed value for POW test: 0 - use zero id/seed [default], -1 - use random value
--solution-idx or -si <unsigned int> set solution index for POW test: index will be compared to be the found solution for Pow [default - unset]
-N <scrypt N> set scrypt parameter N [default - 512]
```

## Mixing CUDA and Vulkan
1 change: 1 addition & 0 deletions src/api_internal.h
@@ -309,6 +309,7 @@ struct cgpu_info {
enum alive status;

uint32_t gpu_core_count;
uint64_t gpu_memory;
uint64_t gpu_max_alloc;

uint32_t hash_len_bits;
2 changes: 1 addition & 1 deletion src/cuda/salsa_kernel.cu
@@ -217,7 +217,7 @@ int find_optimal_concurency(struct cgpu_info *cgpu, _cudaState *cudaState, Kerne
optimal_blocks--;
}

optimal_blocks &= 0xfffffffc;
optimal_blocks &= (optimal_blocks > 512) ? 0xffffff00 : 0xfffffffc;

applog(LOG_INFO, "GPU #%d: using launch configuration %c%dx%d", cgpu->driver_id, kernel->get_identifier(), optimal_blocks, WARPS_PER_BLOCK);

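The new mask rounds the block count down more aggressively for large launches. A minimal standalone sketch of that rounding (the 512 threshold and the two masks come from the hunk above; the helper name and sample values are illustrative):

```cpp
#include <cstdint>
#include <cstdio>

// Round the block count down: to a multiple of 256 for large launches,
// to a multiple of 4 otherwise (mirrors the 0xffffff00 / 0xfffffffc masks).
static uint32_t round_blocks(uint32_t optimal_blocks) {
    return optimal_blocks & ((optimal_blocks > 512) ? 0xffffff00u : 0xfffffffcu);
}

int main() {
    printf("%u\n", round_blocks(1234)); // prints 1024 (multiple of 256)
    printf("%u\n", round_blocks(123));  // prints 120  (multiple of 4)
    return 0;
}
```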
159 changes: 90 additions & 69 deletions src/vulkan/driver-vulkan.cpp
@@ -111,145 +111,159 @@ static uint64_t alignBuffer(uint64_t size, uint64_t align)

static _vulkanState *initVulkan(struct cgpu_info *cgpu, char *name, size_t nameSize, uint32_t hash_len_bits, bool throttled, bool copy_only)
{
_vulkanState *state = (_vulkanState *)calloc(1, sizeof(_vulkanState));
_vulkanState state;
memset(&state, 0, sizeof(_vulkanState));

uint32_t scrypt_mem = 128 * cgpu->r;
uint32_t scrypt_mem = 128 * cgpu->r * cgpu->N;

uint32_t computeQueueFamilyIndex = getComputeQueueFamilyIndex(cgpu->driver_id);
if (computeQueueFamilyIndex < 0) {
applog(LOG_ERR, "GPU %d: Compute query family not found\n", cgpu->driver_id);
return NULL;
}

state->deviceId = cgpu->driver_id;
state->vkDevice = createDevice(cgpu->driver_id, computeQueueFamilyIndex);
if (NULL == state->vkDevice) {
state.deviceId = cgpu->driver_id;
state.vkDevice = createDevice(cgpu->driver_id, computeQueueFamilyIndex);
if (NULL == state.vkDevice) {
applog(LOG_NOTICE, "GPU %d: Create Vulkan device instance failed", cgpu->driver_id);
return NULL;
}

VkDeviceMemory tmpMem = allocateGPUMemory(state->deviceId, state->vkDevice, 1024, true, true);
VkBuffer tmpBuf = createBuffer(state->vkDevice, computeQueueFamilyIndex, tmpMem, 256, 0);
state->alignment = (uint32_t)getBufferMemoryRequirements(state->vkDevice, tmpBuf);
gVulkan.vkDestroyBuffer(state->vkDevice, tmpBuf, NULL);
gVulkan.vkFreeMemory(state->vkDevice, tmpMem, NULL);

state.alignment = 256;
cgpu->work_size = 64;

applog(LOG_NOTICE, "GPU %d: selecting lookup gap of 4", cgpu->driver_id);
cgpu->lookup_gap = 4;

unsigned int bsize = 1024;
size_t ipt = (bsize / cgpu->lookup_gap + (bsize % cgpu->lookup_gap > 0));
size_t ipt = scrypt_mem / cgpu->lookup_gap;
Contributor:
Could you please document and explain in comments the logic behind the calculations of buffer/memory sizes in this function and what the variables mean (for example, what "ipt" stands for)?

Contributor (author):
Scrypt memory size in bytes: scrypt_mem = 128 * r * N.
The lookup gap reduces memory usage but increases computational complexity. With the lookup gap, the memory kept per thread is ipt = scrypt_mem / lookup_gap, and the maximum number of concurrent threads is allocated_buffer_size / ipt.
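A minimal C++ sketch of that sizing logic, using the formulas from the reply above (the variable names follow the diff; the concrete r, N, lookup gap, and buffer size below are illustrative, not values from any particular GPU):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // Illustrative parameters only.
    const uint64_t r = 1, N = 512;               // scrypt parameters
    const uint64_t lookup_gap = 4;               // trades memory for recomputation
    const uint64_t buffer_bytes = 256ull << 20;  // allocated scratchpad, 256 MB

    const uint64_t scrypt_mem = 128 * r * N;          // bytes needed per scrypt instance
    const uint64_t ipt = scrypt_mem / lookup_gap;     // bytes actually stored per thread
    const uint64_t max_threads = buffer_bytes / ipt;  // concurrent threads that fit

    printf("scrypt_mem=%llu ipt=%llu max_threads=%llu\n",
           (unsigned long long)scrypt_mem,
           (unsigned long long)ipt,
           (unsigned long long)max_threads);
    return 0;
}
```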

size_t map = 88;
size_t gpu_max_alloc = cgpu->gpu_max_alloc;
unsigned max_threads = 32*1024;

applog(LOG_DEBUG, "GPU %d: %u MB mem, %u MB max alloc", cgpu->driver_id, (unsigned)(cgpu->gpu_memory / 1024 / 1024), (unsigned)(cgpu->gpu_max_alloc / 1024 / 1024));

if (0 != gpu_max_alloc) {
if (cgpu->gpu_memory > 4ull*1024ull*1024ull*1024ull) {
map = 100;
}
} else {
gpu_max_alloc = cgpu->gpu_memory;
}

if (!cgpu->buffer_size) {
unsigned int base_alloc = (int)(cgpu->gpu_max_alloc * 88 / 100 / 1024 / 1024 / 8) * 8 * 1024 * 1024;
cgpu->thread_concurrency = (uint32_t)(base_alloc / scrypt_mem / ipt);
size_t base_alloc = (gpu_max_alloc * map / 100 / 1024 / 1024 / 8) * 8 * 1024 * 1024;
cgpu->thread_concurrency = (uint32_t)(base_alloc / ipt);
cgpu->thread_concurrency = (cgpu->thread_concurrency / cgpu->work_size) * cgpu->work_size;
cgpu->buffer_size = base_alloc / 1024 / 1024;
applog(LOG_DEBUG, "88%% Max Allocation: %u", base_alloc);
applog(LOG_DEBUG, "%u%% Max Allocation: %u", (unsigned)map, (unsigned)base_alloc);
applog(LOG_NOTICE, "GPU %d: selecting buffer_size of %zu", cgpu->driver_id, cgpu->buffer_size);
}

if (cgpu->buffer_size) {
// use the buffer-size to overwrite the thread-concurrency
cgpu->thread_concurrency = (int)((cgpu->buffer_size * 1024 * 1024) / ipt / scrypt_mem);
}

cgpu->thread_concurrency = min(cgpu->thread_concurrency, /*cgpu->work_size*/ 32 * 1024);
cgpu->thread_concurrency = min(cgpu->thread_concurrency, max_threads);
uint32_t chunkSize = copy_only ? (cgpu->thread_concurrency * 32) : ((cgpu->thread_concurrency * hash_len_bits + 7) / 8);

applog(LOG_DEBUG, "GPU %d: setting thread_concurrency to %d based on buffer size %d and lookup gap %d", cgpu->driver_id, (int)(cgpu->thread_concurrency), (int)(cgpu->buffer_size), (int)(cgpu->lookup_gap));

state->bufSize = alignBuffer(scrypt_mem * ipt * cgpu->thread_concurrency, state->alignment);
state->memConstantSize = alignBuffer(sizeof(AlgorithmConstants), state->alignment);
state->memParamsSize = alignBuffer(sizeof(AlgorithmParams), state->alignment);
state->memInputSize = alignBuffer(PREIMAGE_SIZE, state->alignment);
state->memOutputSize = alignBuffer(chunkSize, state->alignment);
state->sharedMemorySize = state->memConstantSize + state->memParamsSize + state->memInputSize + 2 * state->memOutputSize;
state.bufSize = alignBuffer(ipt * cgpu->thread_concurrency, state.alignment);
state.memConstantSize = alignBuffer(sizeof(AlgorithmConstants), state.alignment);
state.memParamsSize = alignBuffer(sizeof(AlgorithmParams), state.alignment);
state.memInputSize = alignBuffer(PREIMAGE_SIZE, state.alignment);
state.memOutputSize = alignBuffer(chunkSize, state.alignment);
state.sharedMemorySize = state.memConstantSize + state.memParamsSize + state.memInputSize + 2 * state.memOutputSize;

state->gpuLocalMemory = allocateGPUMemory(state->deviceId, state->vkDevice, state->bufSize, true, true);
state->gpuSharedMemory = allocateGPUMemory(state->deviceId, state->vkDevice, state->sharedMemorySize, false, true);
state.gpuLocalMemory = allocateGPUMemory(state.deviceId, state.vkDevice, state.bufSize, true, true);
if (NULL == state.gpuLocalMemory) {
applog(LOG_ERR, "Cannot allocated gpuLocalMemory: %u kB GPU memory type for GPU index %u", (unsigned)(state.bufSize / 1024), state.deviceId);
return NULL;
}
state.gpuSharedMemory = allocateGPUMemory(state.deviceId, state.vkDevice, state.sharedMemorySize, false, true);
if (NULL == state.gpuSharedMemory) {
applog(LOG_ERR, "Cannot allocated gpuSharedMemory: %u kB GPU memory type for GPU index %u", (unsigned)(state.sharedMemorySize / 1024), state.deviceId);
return NULL;
}

state->padbuffer8 = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuLocalMemory, state->bufSize, 0);
state.padbuffer8 = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuLocalMemory, state.bufSize, 0);

uint64_t o = 0;
state->gpu_constants = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuSharedMemory, state->memConstantSize, o);
o += state->memConstantSize;
state->gpu_params = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuSharedMemory, state->memParamsSize, o);
o += state->memParamsSize;
state->CLbuffer0 = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuSharedMemory, state->memInputSize, o);
o += state->memInputSize;
state->outputBuffer[0] = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuSharedMemory, state->memOutputSize, o);
o += state->memOutputSize;
state->outputBuffer[1] = createBuffer(state->vkDevice, computeQueueFamilyIndex, state->gpuSharedMemory, state->memOutputSize, o);

gVulkan.vkGetDeviceQueue(state->vkDevice, computeQueueFamilyIndex, 0, &state->queue);

state->pipelineLayout = bindBuffers(state->vkDevice, &state->descriptorSet, &state->descriptorPool, &state->descriptorSetLayout,
state->padbuffer8, state->gpu_constants, state->gpu_params, state->CLbuffer0, state->outputBuffer[0], state->outputBuffer[1]
state.gpu_constants = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuSharedMemory, state.memConstantSize, o);
o += state.memConstantSize;
state.gpu_params = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuSharedMemory, state.memParamsSize, o);
o += state.memParamsSize;
state.CLbuffer0 = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuSharedMemory, state.memInputSize, o);
o += state.memInputSize;
state.outputBuffer[0] = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuSharedMemory, state.memOutputSize, o);
o += state.memOutputSize;
state.outputBuffer[1] = createBuffer(state.vkDevice, computeQueueFamilyIndex, state.gpuSharedMemory, state.memOutputSize, o);

gVulkan.vkGetDeviceQueue(state.vkDevice, computeQueueFamilyIndex, 0, &state.queue);

state.pipelineLayout = bindBuffers(state.vkDevice, &state.descriptorSet, &state.descriptorPool, &state.descriptorSetLayout,
state.padbuffer8, state.gpu_constants, state.gpu_params, state.CLbuffer0, state.outputBuffer[0], state.outputBuffer[1]
);

void *ptr = NULL;
CHECK_RESULT(gVulkan.vkMapMemory(state->vkDevice, state->gpuSharedMemory, 0, state->memConstantSize, 0, (void **)&ptr), "vkMapMemory", NULL);
CHECK_RESULT(gVulkan.vkMapMemory(state.vkDevice, state.gpuSharedMemory, 0, state.memConstantSize, 0, (void **)&ptr), "vkMapMemory", NULL);
memcpy(ptr, (const void*)&gpuConstants, sizeof(AlgorithmConstants));
gVulkan.vkUnmapMemory(state->vkDevice, state->gpuSharedMemory);
gVulkan.vkUnmapMemory(state.vkDevice, state.gpuSharedMemory);

VkCommandPoolCreateInfo commandPoolCreateInfo = {
VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
0,
VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
computeQueueFamilyIndex
};
CHECK_RESULT(gVulkan.vkCreateCommandPool(state->vkDevice, &commandPoolCreateInfo, 0, &state->commandPool), "vkCreateCommandPool", NULL);
CHECK_RESULT(gVulkan.vkCreateCommandPool(state.vkDevice, &commandPoolCreateInfo, 0, &state.commandPool), "vkCreateCommandPool", NULL);

VkCommandBufferAllocateInfo commandBufferAllocateInfo = {
VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO,
0,
state->commandPool,
state.commandPool,
VK_COMMAND_BUFFER_LEVEL_PRIMARY,
1
};
CHECK_RESULT(gVulkan.vkAllocateCommandBuffers(state->vkDevice, &commandBufferAllocateInfo, &state->commandBuffer), "vkAllocateCommandBuffers", NULL);
CHECK_RESULT(gVulkan.vkAllocateCommandBuffers(state.vkDevice, &commandBufferAllocateInfo, &state.commandBuffer), "vkAllocateCommandBuffers", NULL);

VkSemaphoreCreateInfo semaphoreCreateInfo = { VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO };
semaphoreCreateInfo.pNext = NULL;
semaphoreCreateInfo.flags = 0;
CHECK_RESULT(gVulkan.vkCreateSemaphore(state->vkDevice, &semaphoreCreateInfo, NULL, &state->semaphore), "vkCreateSemaphore", NULL);
CHECK_RESULT(gVulkan.vkCreateSemaphore(state.vkDevice, &semaphoreCreateInfo, NULL, &state.semaphore), "vkCreateSemaphore", NULL);

VkFenceCreateInfo fenceCreateInfo = { VK_STRUCTURE_TYPE_FENCE_CREATE_INFO };
fenceCreateInfo.pNext = NULL;
fenceCreateInfo.flags = 0;

CHECK_RESULT(gVulkan.vkCreateFence(state->vkDevice, &fenceCreateInfo, NULL, &state->fence), "vkCreateFence", NULL);
CHECK_RESULT(gVulkan.vkCreateFence(state.vkDevice, &fenceCreateInfo, NULL, &state.fence), "vkCreateFence", NULL);

#if 0
char options[256];
snprintf(options, sizeof(options), "#version 450\n#define LOOKUP_GAP %d\n#define WORKSIZE %d\n#define LABEL_SIZE %d\n",
cgpu->lookup_gap, (int)cgpu->work_size, hash_len_bits);

state->pipeline = compileShader(state->vkDevice, state->pipelineLayout, &state->shaderModule, scrypt_chacha_comp, options, (int)cgpu->work_size, hash_len_bits, copy_only);
state.pipeline = compileShader(state.vkDevice, state.pipelineLayout, &state.shaderModule, scrypt_chacha_comp, options, (int)cgpu->work_size, hash_len_bits, copy_only);
#else
// char filename[64];
// snprintf(filename, sizeof(filename), "kernel-%02d-%03d.spirv", (int)cgpu->work_size, hash_len_bits);
state->pipeline = loadShader(state->vkDevice, state->pipelineLayout, &state->shaderModule, cgpu->work_size, hash_len_bits);
state.pipeline = loadShader(state.vkDevice, state.pipelineLayout, &state.shaderModule, cgpu->work_size, hash_len_bits);
#endif
if (!state->pipeline) {
if (!state.pipeline) {
return NULL;
}

VkCommandBufferBeginInfo commandBufferBeginInfo = { VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO, 0, 0, 0 };
CHECK_RESULT(gVulkan.vkBeginCommandBuffer(state->commandBuffer, &commandBufferBeginInfo), "vkBeginCommandBuffer", NULL);
CHECK_RESULT(gVulkan.vkBeginCommandBuffer(state.commandBuffer, &commandBufferBeginInfo), "vkBeginCommandBuffer", NULL);

gVulkan.vkCmdBindPipeline(state->commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->pipeline);
gVulkan.vkCmdBindDescriptorSets(state->commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, state->pipelineLayout, 0, 1, &state->descriptorSet, 0, 0);
gVulkan.vkCmdBindPipeline(state.commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, state.pipeline);
gVulkan.vkCmdBindDescriptorSets(state.commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, state.pipelineLayout, 0, 1, &state.descriptorSet, 0, 0);
#if 1
gVulkan.vkCmdDispatch(state->commandBuffer, cgpu->thread_concurrency / cgpu->work_size, 1, 1);
gVulkan.vkCmdDispatch(state.commandBuffer, cgpu->thread_concurrency / cgpu->work_size, 1, 1);
#else
gVulkan.vkCmdDispatch(state->commandBuffer, 1, 1, 1);
gVulkan.vkCmdDispatch(state.commandBuffer, 1, 1, 1);
#endif
CHECK_RESULT(gVulkan.vkEndCommandBuffer(state->commandBuffer), "vkEndCommandBuffer", NULL);
CHECK_RESULT(gVulkan.vkEndCommandBuffer(state.commandBuffer), "vkEndCommandBuffer", NULL);

return state;
_vulkanState* pstate = (_vulkanState*)calloc(1, sizeof(_vulkanState));
if (nullptr != pstate) {
memcpy(pstate, &state, sizeof(_vulkanState));
}
return pstate;
}
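The shared-memory buffers above are carved out of a single allocation at consecutive, alignment-rounded offsets. A minimal sketch of that layout (the align_up helper plays the role of alignBuffer; the sizes below are placeholders, not the real sizeof values):

```cpp
#include <cstdint>
#include <cstdio>

// Round a size up to the next multiple of `align` (same role as alignBuffer).
static uint64_t align_up(uint64_t size, uint64_t align) {
    return (size + align - 1) / align * align;
}

int main() {
    const uint64_t alignment = 256;  // fixed alignment used in the patch
    // Placeholder sizes standing in for sizeof(AlgorithmConstants), etc.
    const uint64_t constants = align_up(1000, alignment);
    const uint64_t params    = align_up(40,   alignment);
    const uint64_t input     = align_up(72,   alignment);
    const uint64_t output    = align_up(4096, alignment);

    // Sub-buffers are created back to back inside one VkDeviceMemory allocation.
    uint64_t offset = 0;
    printf("constants @ %llu\n", (unsigned long long)offset); offset += constants;
    printf("params    @ %llu\n", (unsigned long long)offset); offset += params;
    printf("input     @ %llu\n", (unsigned long long)offset); offset += input;
    printf("output[0] @ %llu\n", (unsigned long long)offset); offset += output;
    printf("output[1] @ %llu\n", (unsigned long long)offset); offset += output;
    printf("total shared memory = %llu bytes\n", (unsigned long long)offset);
    return 0;
}
```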

static int vulkan_detect(struct cgpu_info *gpus, int *active)
@@ -308,14 +322,19 @@ static int vulkan_detect(struct cgpu_info *gpus, int *active)
for (unsigned i = 0; i < gPhysicalDeviceCount; i++) {
struct cgpu_info *cgpu = &gpus[*active];

VkPhysicalDeviceProperties physicalDeviceProperties;
gVulkan.vkGetPhysicalDeviceProperties(gPhysicalDevices[i], &physicalDeviceProperties);
VkPhysicalDeviceMaintenance3Properties physicalDeviceProperties3;
physicalDeviceProperties3.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES;
physicalDeviceProperties3.pNext = NULL;
VkPhysicalDeviceProperties2 physicalDeviceProperties2;
physicalDeviceProperties2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
physicalDeviceProperties2.pNext = &physicalDeviceProperties3;
gVulkan.vkGetPhysicalDeviceProperties2(gPhysicalDevices[i], &physicalDeviceProperties2);

if (0x10DE == physicalDeviceProperties.vendorID) {
if (0x10DE == physicalDeviceProperties2.properties.vendorID) {
continue;
}

memcpy(cgpu->name, physicalDeviceProperties.deviceName, min(sizeof(cgpu->name),sizeof(physicalDeviceProperties.deviceName)));
memcpy(cgpu->name, physicalDeviceProperties2.properties.deviceName, min(sizeof(cgpu->name),sizeof(physicalDeviceProperties2.properties.deviceName)));
cgpu->name[sizeof(cgpu->name) - 1] = 0;

VkPhysicalDeviceMemoryProperties memoryProperties;
@@ -332,12 +351,14 @@ static int vulkan_detect(struct cgpu_info *gpus, int *active)
VkMemoryType t = memoryProperties.memoryTypes[j];
if ((t.propertyFlags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) != 0) {
if (t.heapIndex < memoryProperties.memoryHeapCount) {
cgpu->gpu_max_alloc = memoryProperties.memoryHeaps[t.heapIndex].size;
cgpu->gpu_memory = memoryProperties.memoryHeaps[t.heapIndex].size;
break;
}
}
}

cgpu->gpu_max_alloc = physicalDeviceProperties3.maxMemoryAllocationSize;

*active += 1;
most_devices++;
}
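For reference, a minimal standalone sketch of the two queries the detection code now performs: total device-local VRAM from the memory heaps (stored in gpu_memory) and the largest single allocation from VkPhysicalDeviceMaintenance3Properties (stored in gpu_max_alloc). This sketch uses the stock Vulkan loader directly rather than the project's gVulkan dispatch table, assumes a valid VkPhysicalDevice, and omits error handling:

```cpp
#include <vulkan/vulkan.h>
#include <cstdio>

// Chain VkPhysicalDeviceMaintenance3Properties into the properties2 query and
// scan the memory heaps for the device-local (VRAM) heap size.
void query_limits(VkPhysicalDevice dev) {
    VkPhysicalDeviceMaintenance3Properties maint3 = {};
    maint3.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_MAINTENANCE_3_PROPERTIES;

    VkPhysicalDeviceProperties2 props2 = {};
    props2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2;
    props2.pNext = &maint3;
    vkGetPhysicalDeviceProperties2(dev, &props2);

    VkPhysicalDeviceMemoryProperties mem = {};
    vkGetPhysicalDeviceMemoryProperties(dev, &mem);

    VkDeviceSize deviceLocal = 0;
    for (uint32_t i = 0; i < mem.memoryHeapCount; i++) {
        if (mem.memoryHeaps[i].flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT)
            deviceLocal = mem.memoryHeaps[i].size;  // total VRAM
    }

    printf("%s: %llu MB VRAM, max single allocation %llu MB\n",
           props2.properties.deviceName,
           (unsigned long long)(deviceLocal / (1024 * 1024)),
           (unsigned long long)(maint3.maxMemoryAllocationSize / (1024 * 1024)));
}
```

Note that vkGetPhysicalDeviceProperties2 is core in Vulkan 1.1; on a 1.0 instance the equivalent entry point comes from the VK_KHR_get_physical_device_properties2 extension.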
4 changes: 4 additions & 0 deletions src/vulkan/gen/CMakeLists.txt
@@ -1,5 +1,9 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})

if (MSVC)
set (CMAKE_CXX_FLAGS_DEBUG "/MD /Zi /O2 /Ob1 /DNDEBUG")
endif()

# A custom command and target to turn the Vulkan kernel into a byte array header

add_custom_command(
13 changes: 2 additions & 11 deletions src/vulkan/gen/scrypt-chacha.comp
@@ -413,8 +413,7 @@ main()
uvec4 X[8];
const uint lid = gl_LocalInvocationID.x;
const uint gid = gl_GlobalInvocationID.x;
uint Nfactor = 0;
uint tmp = N >> 1;
uint tmp;
uvec2 nonce;

scrypt_hmac_state hmac_pw, work;
@@ -432,14 +431,6 @@
nonce.x = global_work_offset.x + gid;
}

/* Determine the Nfactor */
while ((tmp & 1) == 0) {
tmp >>= 1;
Nfactor++;
}

const uint effective_concurrency = (concurrent_threads << 9) >> Nfactor;

password[0] = buffer0[0];
password[1] = buffer0[1];
password[2] = buffer0[2];
@@ -555,7 +546,7 @@
}
//////////////////////////////////////////////////
/* 2: X = ROMix(X) */
scrypt_ROMix(X, N, gid, effective_concurrency);
scrypt_ROMix(X, N, gid, concurrent_threads);

/* 3: Out = PBKDF2(password, X) */
//////////////////////////////////////////////////
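For context on the deleted lines: the removed loop derived Nfactor from N (for power-of-two N it yields log2(N) - 1), and effective_concurrency = (concurrent_threads << 9) >> Nfactor then rescaled the thread count. With N now supplied by the host, which already sizes the scratchpad for it, scrypt_ROMix receives concurrent_threads unscaled. A small illustrative C++ check of what the removed code computed (not part of the patch):

```cpp
#include <cstdint>
#include <cassert>

// Equivalent of the removed GLSL loop: start from tmp = N >> 1 and count how
// many times it can be halved while even; for power-of-two N this is log2(N) - 1.
static uint32_t nfactor(uint32_t N) {
    uint32_t f = 0, tmp = N >> 1;
    while ((tmp & 1) == 0) { tmp >>= 1; f++; }
    return f;
}

int main() {
    assert(nfactor(512) == 8);
    assert(nfactor(1024) == 9);
    // The removed scaling (threads << 9) >> Nfactor equals threads * 1024 / N.
    const uint32_t threads = 4096;
    assert(((threads << 9) >> nfactor(512))  == threads * 1024 / 512);
    assert(((threads << 9) >> nfactor(1024)) == threads * 1024 / 1024);
    return 0;
}
```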