accel-sim · JRPan · Jan 21, 2026 · Jan 20, 2026
diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -5,6 +5,8 @@
 #include <cstdlib>
 #include <cstdint>
 #include <iostream>
+#include <fstream>
+#include <sstream>
 #include <cuda_runtime.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -237,46 +239,85 @@ inline unsigned queryGrInfo(uint32_t info_index)
 
 unsigned intilizeDeviceProp(unsigned deviceID, int argc, char *argv[])
 {
-    cudaSetDevice(deviceID);
-    cudaGetDeviceProperties(&deviceProp, deviceID);
-
-    int clockRateKHz;
-    cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
-
-    // core stats
-
-    config.SM_NUMBER = deviceProp.multiProcessorCount;
-    config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
-    config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
-    config.WARP_SIZE = deviceProp.warpSize;
-    config.MAX_WARPS_PER_SM =
-        deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
-    config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;
-
-    // threadblock stats
-    config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
-    config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
-    config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;
-
-    // launched thread blocks to ensure GPU is fully occupied as much as possible
-    config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
-    config.BLOCKS_PER_SM =
-        deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
-    config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
-    config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
-    config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
-
-    // L2 cache
-    config.L2_SIZE = deviceProp.l2CacheSize;
-
-    // memory
-    config.MEM_SIZE = deviceProp.totalGlobalMem;
-    config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
-    config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
-    config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
-
-    config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
-    config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
+    // Detect GPGPU-Sim by trying to open gpgpusim.config (relative path, i.e. in the current working directory)
+    std::ifstream configFile("gpgpusim.config");
+    bool isGpgpuSim = configFile.is_open();
+
+    if (isGpgpuSim) {
+        // Parse gpgpusim.config for available parameters
+        unsigned n_mem = 32, n_sub_partition = 2;  // defaults
+        unsigned l2_nsets = 32, l2_linesize = 128, l2_assoc = 24;  // defaults for L2 per bank
+        std::string line;
+        while (std::getline(configFile, line)) {
+            std::istringstream iss(line);
+            std::string key;
+            if (iss >> key) {
+                if (key == "-gpgpu_n_mem") {
+                    iss >> n_mem;   // number of memory controllers
+                } else if (key == "-gpgpu_n_sub_partition_per_mchannel") {
+                    iss >> n_sub_partition; // number of L2 banks per memory controller
+                } else if (key == "-gpgpu_cache:dl2") {
+                    // Format: X:nsets:linesize:assoc,... where X is any letter
+                    std::string cacheConfig;
+                    iss >> cacheConfig;
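+                    // Parse X:nsets:linesize:assoc with sscanf; 'dummy' absorbs X. NOTE(review): the return value is unchecked, so a malformed entry may overwrite only some of the defaults.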
+                    char dummy;
+                    sscanf(cacheConfig.c_str(), "%c:%u:%u:%u", &dummy, &l2_nsets, &l2_linesize, &l2_assoc);
+                }
+            }
+        }
+        configFile.close();
+
+        // Fields not assigned below keep their struct default values (already initialized in GpuConfig)
+        // Override FBP_COUNT, L2_BANKS and L2_SIZE with the values parsed from gpgpusim.config (or the fallback defaults above)
+        config.FBP_COUNT = n_mem;
+        config.L2_BANKS = n_mem * n_sub_partition;
+        // L2_SIZE = (nsets * linesize * assoc) per bank * banks_per_controller * num_controllers
+        size_t l2_size_per_bank = (size_t)l2_nsets * l2_linesize * l2_assoc;
+        config.L2_SIZE = l2_size_per_bank * n_sub_partition * n_mem;
+    } else {
+        // Running on real hardware - query device properties
+        cudaSetDevice(deviceID);
+        cudaGetDeviceProperties(&deviceProp, deviceID);
+
+        int clockRateKHz;
+        cudaDeviceGetAttribute(&clockRateKHz, cudaDevAttrClockRate, deviceID);
+
+        // core stats
+        config.SM_NUMBER = deviceProp.multiProcessorCount;
+        config.MAX_THREADS_PER_SM = deviceProp.maxThreadsPerMultiProcessor;
+        config.MAX_SHARED_MEM_SIZE = deviceProp.sharedMemPerMultiprocessor;
+        config.WARP_SIZE = deviceProp.warpSize;
+        config.MAX_WARPS_PER_SM =
+            deviceProp.maxThreadsPerMultiProcessor / deviceProp.warpSize;
+        config.MAX_REG_PER_SM = deviceProp.regsPerMultiprocessor;
+
+        // threadblock stats
+        config.MAX_THREAD_BLOCK_SIZE = deviceProp.maxThreadsPerBlock;
+        config.MAX_SHARED_MEM_SIZE_PER_BLOCK = deviceProp.sharedMemPerBlock;
+        config.MAX_REG_PER_BLOCK = deviceProp.regsPerBlock;
+
+        // launch configuration chosen to keep the GPU as fully occupied as possible
+        config.THREADS_PER_BLOCK = deviceProp.maxThreadsPerBlock;
+        config.BLOCKS_PER_SM =
+            deviceProp.maxThreadsPerMultiProcessor / deviceProp.maxThreadsPerBlock;
+        config.THREADS_PER_SM = config.BLOCKS_PER_SM * config.THREADS_PER_BLOCK;
+        config.BLOCKS_NUM = config.BLOCKS_PER_SM * config.SM_NUMBER;
+        config.TOTAL_THREADS = config.THREADS_PER_BLOCK * config.BLOCKS_NUM;
+
+        // L2 cache
+        config.L2_SIZE = deviceProp.l2CacheSize;
+
+        // memory
+        config.MEM_SIZE = deviceProp.totalGlobalMem;
+        config.MEM_CLK_FREQUENCY = deviceProp.memoryClockRate * 1e-3f;
+        config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
+        config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
+
+        // Get FBP_COUNT and L2_BANKS from NVIDIA RM API
+        config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
+        config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
+    }
 
     parseGpuConfigArgs(argc, argv);
     printGpuConfig();

diff --git a/src/cuda/parboil/driver/__init__.py b/src/cuda/parboil/driver/__init__.py
@@ -2,14 +2,13 @@
 
 import sys
 import os
-from itertools import imap
-
-import globals
-import actions
-import options
-import parboilfile
-import process
-import benchmark
+
+from . import globals
+from . import actions
+from . import options
+from . import parboilfile
+from . import process
+from . import benchmark
 
 def run():
     # Print a banner message

diff --git a/src/cuda/parboil/driver/actions.py b/src/cuda/parboil/driver/actions.py
@@ -4,24 +4,23 @@
 # call lower-level routines from the process or benchmark modules.
 
 import os
-from itertools import imap
 
-import process
-import benchmark
-import globals
-from text import format_columns
+from . import process
+from . import benchmark
+from . import globals
+from .text import format_columns
 
-from error import ErrorType
+from .error import ErrorType
 
 def benchmark_iter():
     """Iterate over the benchmarks in 'bmks'."""
     # bmks is a dictionary from str to Future(Benchmark)
-    return imap(lambda x: x.get(), globals.benchmarks.itervalues())
+    return map(lambda x: x.get(), globals.benchmarks.values())
 
 def list_benchmarks():
     """List all benchmarks on standard output."""
-    print "Benchmarks:"
-    for bmk in benchmark_iter(): print "  " + bmk.name
+    print("Benchmarks:")
+    for bmk in benchmark_iter(): print("  " + bmk.name)
 
 def describe_benchmarks():
     """Print descriptions of all benchmarks to standard output."""
@@ -30,8 +29,8 @@ def describe_benchmarks():
 def describe_benchmark(bmk):
     """Print a description of one benchmark to standard output."""
 
-    print "  " + bmk.name
-    print format_columns(bmk.describe(), 4)
+    print("  " + bmk.name)
+    print(format_columns(bmk.describe(), 4))
 
 def lookup_benchmark(name):
     """Find a benchmark, given its name.  Returns None if no benchmark
@@ -43,11 +42,11 @@ def lookup_benchmark(name):
         if bmk.invalid is None:
             return bmk
         else:
-            print "Error in benchmark:"
-            print str(bmk.invalid)
+            print("Error in benchmark:")
+            print(str(bmk.invalid))
             return None
     else:
-        print "Cannot find benchmark"
+        print("Cannot find benchmark")
         return None
 
 def with_benchmark_named(name, action):
@@ -62,9 +61,9 @@ def compile_benchmark(bmk, version_name, platform=None):
     """Compile the benchmark 'bmk'."""
     try: impl = bmk.impls[version_name]
     except KeyError:
-        print "Cannot find benchmark version"
+        print("Cannot find benchmark version")
         return
-    
+
     return impl.build(bmk, platform)
 
 def clean_benchmark(bmk, version_name=None, platform=None):
@@ -74,56 +73,56 @@ def clean_benchmark(bmk, version_name=None, platform=None):
     if version_name:
         try: impl = bmk.impls[version_name]
         except KeyError:
-            print "Cannot find benchmark version"
+            print("Cannot find benchmark version")
             return
 
         impl.clean(bmk, platform)
     else:
         # Clean all versions
-        for impl in bmk.impls.itervalues():
+        for impl in bmk.impls.values():
             impl.clean(bmk, platform)
 
 def run_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
     """Run the benchmark 'bmk'."""
     try: impl = bmk.impls[version_name]
     except KeyError:
-        print "Cannot find benchmark version"
+        print("Cannot find benchmark version")
         return ErrorType.CannotFindVersion
-    
+
     try: data = bmk.datas[input_name]
     except KeyError:
-        print "Cannot find data set"
+        print("Cannot find data set")
         return ErrorType.CannotFindDataSet
 
     # Run the benchmark
     ret = impl.run(bmk, data, check, extra_opts=extra_opts, platform=platform)
 
     if ret is not ErrorType.Success:
-        print "Run failed!"
+        print("Run failed!")
         return ret
 
     # Verify the output
     if check:
         success = impl.check(bmk, data)
 
         if not success:
-            print "Output checking tool detected a mismatch"
+            print("Output checking tool detected a mismatch")
             return ErrorType.OutputMismatch
     else:
-        print "Output was not checked for correctness"
+        print("Output was not checked for correctness")
 
     return ErrorType.Success
 
 def debug_benchmark(bmk, version_name, input_name, check=True, extra_opts=[], platform=None):
     """Debug the benchmark."""
     try: impl = bmk.impls[version_name]
     except KeyError:
-        print "Cannot find benchmark version"
+        print("Cannot find benchmark version")
         return ErrorType.CannotFindVersion
-    
+
     try: data = bmk.datas[input_name]
     except KeyError:
-        print "Cannot find data set"
+        print("Cannot find data set")
         return ErrorType.CannotFindDataSet
 
     # Run the benchmark