Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
121 changes: 81 additions & 40 deletions src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
#include <cstdlib>
#include <cstdint>
#include <iostream>
#include <fstream>
#include <sstream>
#include <cuda_runtime.h>
#include <fcntl.h>
#include <unistd.h>
Expand Down Expand Up @@ -237,46 +239,85 @@ inline unsigned queryGrInfo(uint32_t info_index)

unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
{
cudaSetDevice(deviceID);
cudaGetDeviceProperties(&deviceProp, deviceID);

int clockRateKHz;
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);

// core stats

config.SM_NUMBER = deviceProp.multiProcessorCount;
config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
config.WARP_SIZE = deviceProp.warpSize;
config.MAX_WARPS_PER_SM =
deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;

// threadblock stats
config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;

// launched thread blocks to ensure GPU is fully occupied as much as possible
config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
config.BLOCKS_PER_SM =
deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;

// L2 cache
config.L2_SIZE = deviceProp.l2CacheSize;

// memory
config.MEM_SIZE = deviceProp.totalGlobalMem;
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;

config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
// Check if running in GPGPU-Sim by looking for gpgpusim.config
std::ifstream configFile("gpgpusim.config");
bool isGpgpuSim = configFile.is_open();

if (isGpgpuSim) {
// Parse gpgpusim.config for available parameters
unsigned n_mem = 32, n_sub_partition = 2; // defaults
unsigned l2_nsets = 32, l2_linesize = 128, l2_assoc = 24; // defaults for L2 per bank
std::string line;
while (std::getline(configFile, line)) {
std::istringstream iss(line);
std::string key;
if (iss >> key) {
if (key == "-gpgpu_n_mem") {
iss >> n_mem; // number of memory controllers
} else if (key == "-gpgpu_n_sub_partition_per_mchannel") {
iss >> n_sub_partition; // number of L2 banks per memory controller
} else if (key == "-gpgpu_cache:dl2") {
// Format: X:nsets:linesize:assoc,... where X is any letter
std::string cacheConfig;
iss >> cacheConfig;
// Parse X:nsets:linesize:assoc using sscanf, skip first char
char dummy;
sscanf(cacheConfig.c_str(), "%c:%u:%u:%u", &dummy, &l2_nsets, &l2_linesize, &l2_assoc);
}
}
}
configFile.close();

// Use struct default values (already initialized in GpuConfig)
// Override FBP_COUNT and L2_BANKS from gpgpusim.config
config.FBP_COUNT = n_mem;
config.L2_BANKS = n_mem * n_sub_partition;
// L2_SIZE = (nsets * linesize * assoc) per bank * banks_per_controller * num_controllers
size_t l2_size_per_bank = (size_t)l2_nsets * l2_linesize * l2_assoc;
config.L2_SIZE = l2_size_per_bank * n_sub_partition * n_mem;
} else {
// Running on real hardware - query device properties
cudaSetDevice(deviceID);
cudaGetDeviceProperties(&deviceProp, deviceID);

int clockRateKHz;
cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);

// core stats
config.SM_NUMBER = deviceProp.multiProcessorCount;
config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
config.WARP_SIZE = deviceProp.warpSize;
config.MAX_WARPS_PER_SM =
deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;

// threadblock stats
config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;

// launched thread blocks to ensure GPU is fully occupied as much as possible
config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
config.BLOCKS_PER_SM =
deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;

// L2 cache
config.L2_SIZE = deviceProp.l2CacheSize;

// memory
config.MEM_SIZE = deviceProp.totalGlobalMem;
config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
config.CLK_FREQUENCY = clockRateKHz * 1e-3f;

// Get FBP_COUNT and L2_BANKS from NVIDIA RM API
config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
}

parseGpuConfigArgs(argc, argv);
printGpuConfig();
Expand Down
15 changes: 7 additions & 8 deletions src/cuda/parboil/driver/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,13 @@

import sys
import os
from itertools import imap

import globals
import actions
import options
import parboilfile
import process
import benchmark

from . import globals
from . import actions
from . import options
from . import parboilfile
from . import process
from . import benchmark

def run():
# Print a banner message
Expand Down
53 changes: 26 additions & 27 deletions src/cuda/parboil/driver/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,24 +4,23 @@
# call lower-level routines from the process or benchmark modules.

import os
from itertools import imap

import process
import benchmark
import globals
from text import format_columns
from . import process
from . import benchmark
from . import globals
from .text import format_columns

from error import ErrorType
from .error import ErrorType

def benchmark_iter():
    """Iterate over the benchmarks in 'bmks'.

    Returns a lazy iterator that calls ``get()`` on every value stored in
    ``globals.benchmarks`` and yields the result.
    """
    # bmks is a dictionary from str to Future(Benchmark)
    # Python 3: the builtin map() is already lazy and dict.values() replaces
    # itervalues(), so itertools.imap is no longer needed.
    return map(lambda x: x.get(), globals.benchmarks.values())

def list_benchmarks():
    """List all benchmarks on standard output.

    Prints a "Benchmarks:" header followed by one line per benchmark
    produced by benchmark_iter(), showing its ``name`` attribute.
    """
    print("Benchmarks:")
    for bmk in benchmark_iter():
        print(" " + bmk.name)

def describe_benchmarks():
"""Print descriptions of all benchmarks to standard output."""
Expand All @@ -30,8 +29,8 @@ def describe_benchmarks():
def describe_benchmark(bmk):
    """Print a description of one benchmark to standard output.

    bmk -- object providing a ``name`` attribute and a ``describe()`` method.

    The name is printed first, then the result of ``bmk.describe()`` passed
    through format_columns() with a second argument of 4.
    """

    print(" " + bmk.name)
    print(format_columns(bmk.describe(), 4))

def lookup_benchmark(name):
"""Find a benchmark, given its name. Returns None if no benchmark
Expand All @@ -43,11 +42,11 @@ def lookup_benchmark(name):
if bmk.invalid is None:
return bmk
else:
print "Error in benchmark:"
print str(bmk.invalid)
print("Error in benchmark:")
print(str(bmk.invalid))
return None
else:
print "Cannot find benchmark"
print("Cannot find benchmark")
return None

def with_benchmark_named(name, action):
Expand All @@ -62,9 +61,9 @@ def compile_benchmark(bmk, version_name, platform=None):
"""Compile the benchmark 'bmk'."""
try: impl = bmk.impls[version_name]
except KeyError:
print "Cannot find benchmark version"
print("Cannot find benchmark version")
return

return impl.build(bmk, platform)

def clean_benchmark(bmk, version_name=None, platform=None):
Expand All @@ -74,56 +73,56 @@ def clean_benchmark(bmk, version_name=None, platform=None):
if version_name:
try: impl = bmk.impls[version_name]
except KeyError:
print "Cannot find benchmark version"
print("Cannot find benchmark version")
return

impl.clean(bmk, platform)
else:
# Clean all versions
for impl in bmk.impls.itervalues():
for impl in bmk.impls.values():
impl.clean(bmk, platform)

def run_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
    """Run the benchmark 'bmk'.

    bmk          -- benchmark object exposing ``impls`` and ``datas`` mappings.
    version_name -- key into ``bmk.impls`` selecting the implementation to run.
    input_name   -- key into ``bmk.datas`` selecting the input data set.
    check        -- when true, the output is verified with ``impl.check``.
    extra_opts   -- forwarded unchanged to ``impl.run`` (never mutated here).
    platform     -- forwarded unchanged to ``impl.run``.

    Returns an ErrorType value:
      * ErrorType.CannotFindVersion when 'version_name' is not in bmk.impls,
      * ErrorType.CannotFindDataSet when 'input_name' is not in bmk.datas,
      * whatever ``impl.run`` returned when that is not ErrorType.Success,
      * ErrorType.OutputMismatch when checking is enabled and fails,
      * ErrorType.Success otherwise.
    A short diagnostic is printed to standard output on each failure path.
    """
    try:
        impl = bmk.impls[version_name]
    except KeyError:
        print("Cannot find benchmark version")
        return ErrorType.CannotFindVersion

    try:
        data = bmk.datas[input_name]
    except KeyError:
        print("Cannot find data set")
        return ErrorType.CannotFindDataSet

    # Run the benchmark
    ret = impl.run(bmk, data, check, extra_opts=extra_opts, platform=platform)

    if ret is not ErrorType.Success:
        print("Run failed!")
        return ret

    # Verify the output
    if check:
        success = impl.check(bmk, data)

        if not success:
            print("Output checking tool detected a mismatch")
            return ErrorType.OutputMismatch
    else:
        print("Output was not checked for correctness")

    return ErrorType.Success

def debug_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
"""Debug the benchmark."""
try: impl = bmk.impls[version_name]
except KeyError:
print "Cannot find benchmark version"
print("Cannot find benchmark version")
return ErrorType.CannotFindVersion

try: data = bmk.datas[input_name]
except KeyError:
print "Cannot find data set"
print("Cannot find data set")
return ErrorType.CannotFindDataSet

# Run the benchmark
Expand Down
Loading