diff --git a/advisor.cpp b/advisor.cpp index 721e26d..1dbd6d6 100644 --- a/advisor.cpp +++ b/advisor.cpp @@ -27,16 +27,20 @@ std::vector *Advisor::recommend( Tristate::Tristate isBatched, Tristate::Tristate isFloat, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, int maxSignalInc, int maxMemory, - bool allowTransposition, bool squareOnly, bool crop) { + bool disallowRotation, bool allowTransposition, bool disallowSizeOptimization, + int countOfOptimizedDimensions, bool squareOnly, bool crop) { Validator::validate(device); maxMemory = getMaxMemory(device, maxMemory); - Validator::validate(x, y, z, n, device, maxSignalInc, maxMemory, allowTransposition, squareOnly); + Validator::validate(x, y, z, n, device, maxSignalInc, maxMemory, + countOfOptimizedDimensions, allowTransposition, squareOnly); GeneralTransform tr = GeneralTransform(device, x, y, z, n, isBatched, isFloat, isForward, isInPlace, isReal); - SizeOptimizer optimizer(CudaVersion::V_8, tr, allowTransposition); + SizeOptimizer optimizer(CudaVersion::V_12, tr, allowTransposition); std::vector *result = - optimizer.optimize(howMany, maxSignalInc, maxMemory, squareOnly, crop); + optimizer.optimize(howMany, maxSignalInc, maxMemory, disallowRotation, + disallowSizeOptimization, countOfOptimizedDimensions, + squareOnly, crop); return result; } @@ -45,11 +49,13 @@ std::vector *Advisor::find( Tristate::Tristate isBatched, Tristate::Tristate isFloat, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, int maxSignalInc, int maxMemory, - bool allowTransposition, bool squareOnly, bool crop) { + bool disallowRotation, bool allowTransposition, bool disallowSizeOptimization, + int countOfOptimizedDimensions, bool squareOnly, bool crop) { std::vector *candidates = recommend(howMany, device, x, y, z, n, isBatched, isFloat, isForward, - isInPlace, isReal, maxSignalInc, maxMemory, allowTransposition, - squareOnly, crop); + isInPlace, isReal, maxSignalInc, maxMemory, disallowRotation, + allowTransposition, disallowSizeOptimization, + countOfOptimizedDimensions, squareOnly, crop); std::vector *result = benchmark(*candidates); std::sort(result->begin(), result->end(), BenchmarkResult::execSort); delete candidates; @@ -61,6 +67,11 @@ std::vector *Advisor::benchmark( std::vector *results = new std::vector(); int size = transforms.size(); + + if (transforms.size() > 0) { + Benchmarker::benchmark(transforms.at(0)); + } + for (int i = 0; i < size; i++) { results->push_back(Benchmarker::benchmark(transforms.at(i))); } diff --git a/advisor.h b/advisor.h index eb34ce9..49f151d 100644 --- a/advisor.h +++ b/advisor.h @@ -25,7 +25,10 @@ class Advisor { Tristate::Tristate isInPlace = Tristate::TRUE, Tristate::Tristate isReal = Tristate::TRUE, int maxSignalInc = INT_MAX, int maxMemory = INT_MAX, + bool disallowRotation = false, bool allowTransposition = false, + bool disallowSizeOptimization = false, + int countOfOptimizedDimensions = MaxNumberOfOptimizedDimensions, bool squareOnly = false, bool crop = false); @@ -37,7 +40,10 @@ class Advisor { Tristate::Tristate isInPlace = Tristate::TRUE, Tristate::Tristate isReal = Tristate::TRUE, int maxSignalInc = INT_MAX, int maxMemory = INT_MAX, + bool disallowRotation = false, bool allowTransposition = false, + bool disallowSizeOptimization = false, + int countOfOptimizedDimensions = MaxNumberOfOptimizedDimensions, bool squareOnly = false, bool crop = false); diff --git a/generalTransform.cpp b/generalTransform.cpp index ee47051..bc9c29d 100644 --- a/generalTransform.cpp +++ b/generalTransform.cpp @@ -12,7 +12,13 @@ GeneralTransform::GeneralTransform(int device, int X, int Y, int Z, int N, X(X), Y(Y), Z(Z), + originalX(X), + originalY(Y), + originalZ(Z), N(N), + kernelInvocationX(0), + kernelInvocationY(0), + kernelInvocationZ(0), isBatched(isBatched), isFloat(isFloat), isForward(isForward), @@ -25,14 +31,24 @@ GeneralTransform::GeneralTransform(int X, int Y, int Z, X(X), Y(Y), Z(Z), + originalX(X), + originalY(Y), + originalZ(Z), N(tr.N), + kernelInvocationX(0), + kernelInvocationY(0), + kernelInvocationZ(0), isBatched(tr.isBatched), isFloat(tr.isFloat), isForward(tr.isForward), isInPlace(tr.isInPlace), isReal(tr.isReal) {} -GeneralTransform::GeneralTransform(const GeneralTransform &tr) { *this = tr; } +GeneralTransform::GeneralTransform(const GeneralTransform &tr) + : originalX(tr.originalX), + originalY(tr.originalY), + originalZ(tr.originalZ) + { *this = tr; } GeneralTransform &GeneralTransform::operator=(const GeneralTransform &tr) { if (this != &tr) { @@ -46,6 +62,9 @@ GeneralTransform &GeneralTransform::operator=(const GeneralTransform &tr) { this->isForward = tr.isForward; this->isInPlace = tr.isInPlace; this->isReal = tr.isReal; + this->kernelInvocationX = tr.kernelInvocationX; + this->kernelInvocationY = tr.kernelInvocationY; + this->kernelInvocationZ = tr.kernelInvocationZ; } return *this; } diff --git a/generalTransform.h b/generalTransform.h index 465e61e..b589a27 100644 --- a/generalTransform.h +++ b/generalTransform.h @@ -31,6 +31,15 @@ class GeneralTransform { Tristate::Tristate isInPlace; // otherwise out-of-place Tristate::Tristate isReal; // otherwise C2C + const int originalX; + const int originalY; + const int originalZ; + + // number of kernel invocations for selected dimension + // the number of kernel invocations is decided separately for each dimension + int kernelInvocationX; + int kernelInvocationY; + int kernelInvocationZ; }; } // namespace cuFFTAdvisor diff --git a/inputParser.cpp b/inputParser.cpp index 531abce..d5590a5 100644 --- a/inputParser.cpp +++ b/inputParser.cpp @@ -21,6 +21,9 @@ InputParser::InputParser(int argc, char **argv) { maxSignalInc = parseMaxSignalInc(); maxMemMB = parseMaxMemMB(); allowTransposition = parseAllowTransposition(); + disallowRotation = parseDisallowRotation(); + disallowSizeOptimization = parseDisallowSizeOptimization(); + countOfOptimizedDimensions = parseCountOfOptimizedDimensions(); squareOnly = parseSquareOnly(); crop = parseCrop(); } @@ -58,6 +61,20 @@ int InputParser::parseMaxMemMB() { return INT_MAX; } +int InputParser::parseCountOfOptimizedDimensions() { + for (int i = 0; i < (argc - 1); i++) { + if (safeEquals(argv[i], "--countOfOptimizedDimensions")) { + if (NULL == argv[i + 1]) { + return -1; + } + int count = atoi(argv[i + 1]); + argv[i] = argv[i + 1] = NULL; + return count; + } + } + return MaxNumberOfOptimizedDimensions; +} + bool InputParser::reportUnparsed(FILE *stream) { bool error = false; if (NULL != stream) { @@ -195,6 +212,26 @@ bool InputParser::parseAllowTransposition() { return false; } +bool InputParser::parseDisallowRotation() { + for (int i = 0; i < argc; i++) { + if (safeEquals(argv[i], "--disallowRotation")) { + argv[i] = NULL; + return true; + } + } + return false; +} + +bool InputParser::parseDisallowSizeOptimization() { + for (int i = 0; i < argc; i++) { + if (safeEquals(argv[i], "--disallowSizeOptimization")) { + argv[i] = NULL; + return true; + } + } + return false; +} + bool InputParser::parseSquareOnly() { for (int i = 0; i < argc; i++) { if (safeEquals(argv[i], "--squareOnly")) { diff --git a/inputParser.h b/inputParser.h index 833f79d..ebc10fa 100644 --- a/inputParser.h +++ b/inputParser.h @@ -19,6 +19,9 @@ class InputParser { int maxSignalInc; int maxMemMB; bool allowTransposition; + bool disallowRotation; + bool disallowSizeOptimization; + int countOfOptimizedDimensions; bool squareOnly; bool crop; Tristate::Tristate isBatched; @@ -35,6 +38,9 @@ class InputParser { int parseMaxSignalInc(); int parseMaxMemMB(); bool parseAllowTransposition(); + bool parseDisallowRotation(); + bool parseDisallowSizeOptimization(); + int parseCountOfOptimizedDimensions(); bool parseSquareOnly(); bool parseCrop(); Tristate::Tristate parseIsReal(); diff --git a/main.cpp b/main.cpp index f2fcc9d..524ee62 100644 --- a/main.cpp +++ b/main.cpp @@ -46,6 +46,12 @@ int printHelp() { "(swapping dimensions). Prohibited by default. Valid for " "'-find' only." << std::endl; + std::cout << "\t--disallowRotation : consider also rotation of X and Y axes " + "(swapping dimensions). Allowed by default." + << std::endl; + std::cout << "\t--disallowSizeOptimization : disable size optimization by padding/cropping. " + "Allowed by default." + << std::endl; std::cout << "\t--squareOnly : consider only square shapes " "(X dimension size will be used as a starting point). " "Incompatible with --allowTransposition." @@ -61,6 +67,9 @@ int printHelp() { std::cout << "\t--maxMem MB : max memory (in MB) that transformation can " "use, default = device limit" << std::endl; + std::cout << "\t--countOfOptimizedDimensions COUNT : number of size dimensionsthat will be optimized, " + "Default: '3' for 3D signal, '2' for 2D signal, '1' for 1D signal" + << std::endl; return -1; } @@ -109,8 +118,11 @@ int parseRecommend(int argc, char **argv, int howMany) { howMany, parser.device, parser.x, parser.y, parser.z, parser.n, parser.isBatched, parser.isFloat, parser.isForward, parser.isInPlace, parser.isReal, parser.maxSignalInc, - parser.maxMemMB, parser.allowTransposition, parser.squareOnly, - parser.crop); + parser.maxMemMB, parser.disallowRotation, + parser.allowTransposition, + parser.disallowSizeOptimization, + parser.countOfOptimizedDimensions, + parser.squareOnly, parser.crop); cuFFTAdvisor::Transform::printHeader(stdout); std::cout << std::endl; @@ -142,8 +154,10 @@ int parseFind(int argc, char **argv, int howMany) { parser.isFloat, parser.isForward, parser.isInPlace, parser.isReal, parser.maxSignalInc, parser.maxMemMB, - parser.allowTransposition, parser.squareOnly, - parser.crop); + parser.disallowRotation, parser.allowTransposition, + parser.disallowSizeOptimization, + parser.countOfOptimizedDimensions, + parser.squareOnly, parser.crop); cuFFTAdvisor::BenchmarkResult::printHeader(stdout); std::cout << std::endl; diff --git a/rotation_decision_tree_marked_leaves.png b/rotation_decision_tree_marked_leaves.png new file mode 100644 index 0000000..06bc871 Binary files /dev/null and b/rotation_decision_tree_marked_leaves.png differ diff --git a/sizeOptimizer.cpp b/sizeOptimizer.cpp index 3266bc6..a4e266b 100644 --- a/sizeOptimizer.cpp +++ b/sizeOptimizer.cpp @@ -3,7 +3,7 @@ namespace cuFFTAdvisor { const struct SizeOptimizer::Polynom SizeOptimizer::UNIT = { - .value = 1, 0, 0, 0, 0, 0, 0}; + .value = 1, 0, 0, 0, 0, 0, 0, 0}; SizeOptimizer::SizeOptimizer(CudaVersion::CudaVersion version, GeneralTransform &tr, bool allowTrans) @@ -11,7 +11,8 @@ SizeOptimizer::SizeOptimizer(CudaVersion::CudaVersion version, log_2(1.0 / std::log(2)), log_3(1.0 / std::log(3)), log_5(1.0 / std::log(5)), - log_7(1.0 / std::log(7)) { + log_7(1.0 / std::log(7)), + log_11(1.0 / std::log(11)) { if (Tristate::BOTH == tr.isFloat) { // if user is not sure if they needs double, then they doesn't need it tr.isFloat = Tristate::TRUE; @@ -36,26 +37,115 @@ SizeOptimizer::SizeOptimizer(CudaVersion::CudaVersion version, std::vector *SizeOptimizer::optimize(size_t nBest, int maxPercIncrease, int maxMemMB, + bool disallowRotation, + bool disallowOptimization, + int dimensionCount, bool squareOnly, bool crop) { std::vector preoptimized; for (auto in : input) { + + size_t testedConfigs = std::max(MinimalCountOfUsedConfigurations, nBest); + std::vector *tmp = - optimizeXYZ(in, nBest, maxPercIncrease, squareOnly, crop); + optimizeXYZ(in, testedConfigs, maxPercIncrease, disallowRotation, + disallowOptimization, dimensionCount, squareOnly, crop); preoptimized.insert(preoptimized.end(), tmp->begin(), tmp->end()); delete tmp; } return optimizeN(&preoptimized, maxMemMB, nBest); } +void SizeOptimizer::swapSizes(GeneralTransform &in) { + in.X = in.originalY; + in.Y = in.originalX; +} + +bool SizeOptimizer::swapSizes2D(GeneralTransform &in, const Polynom &x, const Polynom &y) { + int primesCountX = getNoOfPrimes(in.X); + int primesCountY = getNoOfPrimes(in.Y); + + bool divisibleBy2X = in.X % 2 == 0; + bool divisibleBy2Y = in.Y % 2 == 0; + float differenceBetweenXY = (float)std::max(in.X, in.Y) / std::min(in.X, in.Y); + int kernelCountX = x.invocations; + int kernelCountY = y.invocations; + + if (!divisibleBy2X) { + if (kernelCountX <= 1) { + if (in.X <= in.Y) { + if (divisibleBy2Y) { + if (differenceBetweenXY <= 35) { + swapSizes(in); + } + } + } else { + swapSizes(in); + } + } else { + swapSizes(in); + } + } else { + if (kernelCountY <= 1) { + if (kernelCountX <= 1) { + if (!divisibleBy2Y) { + if (primesCountY <= 1) { + swapSizes(in); + } + } + } else { + if (primesCountY <= 1) { + if (!(differenceBetweenXY <= 35000)) { + swapSizes(in); + } + } else { + if (!divisibleBy2Y) { + if (!(differenceBetweenXY <= 10)) { + swapSizes(in); + } + } else { + swapSizes(in); + } + } + } + } else { + if (primesCountX <= 1) { + if (!(primesCountY <= 2)) { + if (kernelCountY <= 3) { + if (!(differenceBetweenXY <= 100000)) { + swapSizes(in); + } + } else { + swapSizes(in); + } + } + } + } + } + + return true; +} + + + bool SizeOptimizer::sizeSort(const Transform *l, const Transform *r) { if (l->N != r->N) return l->N > r->N; // prefer bigger batches + + int lKernelCount = l->kernelInvocationX + l->kernelInvocationY + l->kernelInvocationZ; + int rKernelCount = r->kernelInvocationX + r->kernelInvocationY + r->kernelInvocationZ; + size_t lDims = l->X * l->Y * l->Z; size_t rDims = r->X * r->Y * r->Z; + + if (lKernelCount != rKernelCount) return lKernelCount < rKernelCount; + if (l->kernelInvocationX != r->kernelInvocationX) return l->kernelInvocationX < r->kernelInvocationX; + if (l->kernelInvocationY != r->kernelInvocationY) return l->kernelInvocationY < r->kernelInvocationY; + if (l->kernelInvocationZ != r->kernelInvocationZ) return l->kernelInvocationZ < r->kernelInvocationZ; + if (lDims != rDims) return lDims < rDims; - if (l->Z != r->Z) return l->Z < r->Z; + if (l->X != r->X) return l->X < r->X; if (l->Y != r->Y) return l->Y < r->Y; - return l->X < r->X; + return l->Z < r->Z; } bool SizeOptimizer::perfSort(const Transform *l, const Transform *r) { @@ -77,6 +167,7 @@ bool SizeOptimizer::perfSort(const Transform *l, const Transform *r) { std::vector *SizeOptimizer::optimizeN( std::vector *transforms, size_t maxMem, size_t nBest) { std::vector *result = new std::vector(); + for (auto& gt : *transforms) { if (Tristate::isNot(gt.isBatched)) { collapse(gt, false, gt.N, maxMem, result); @@ -85,6 +176,7 @@ std::vector *SizeOptimizer::optimizeN( collapseBatched(gt, maxMem, result); } } + std::sort(result->begin(), result->end(), perfSort); while (result->size() > nBest) { delete result->back(); @@ -123,7 +215,8 @@ bool SizeOptimizer::collapse(GeneralTransform >, bool isBatched, size_t N, std::vector transforms; TransformGenerator::generate(gt.device, gt.X, gt.Y, gt.Z, N, isBatched, gt.isFloat, gt.isForward, gt.isInPlace, - gt.isReal, transforms); + gt.isReal, gt.kernelInvocationX, gt.kernelInvocationY, + gt.kernelInvocationZ, transforms); size_t noOfTransforms = transforms.size(); for (size_t i = 0; i < noOfTransforms; i++) { @@ -133,6 +226,7 @@ bool SizeOptimizer::collapse(GeneralTransform >, bool isBatched, size_t N, size_t planSize = std::max(r.planSizeEstimateB, r.planSizeEstimate2B); size_t totalSizeBytes = r.transform->dataSizeB + planSize; size_t totalMB = std::ceil(toMB(totalSizeBytes)); + if (totalMB <= maxMemMB) { result->push_back(t); updated = true; @@ -164,10 +258,13 @@ size_t SizeOptimizer::getMinSize(GeneralTransform &tr, int maxPercDecrease, bool std::vector *SizeOptimizer::optimizeXYZ(GeneralTransform &tr, size_t nBest, int maxPercIncrease, + bool disallowRotation, + bool disallowOptimization, + int dimensionCount, bool squareOnly, bool crop) { - std::vector *polysX = generatePolys(tr.X, tr.isFloat, crop); + std::vector *polysX = generatePolys(tr.X, tr.isFloat, crop, disallowOptimization); std::vector *polysY; std::vector *polysZ; std::set *recPolysX = filterOptimal(polysX, crop); @@ -179,7 +276,7 @@ std::vector *SizeOptimizer::optimizeXYZ(GeneralTransform &tr, polysY = polysX; recPolysY = recPolysX; } else { - polysY = generatePolys(tr.Y, tr.isFloat, crop); + polysY = generatePolys(tr.Y, tr.isFloat, crop, disallowOptimization || dimensionCount < 2); recPolysY = filterOptimal(polysY, crop); } @@ -191,7 +288,7 @@ std::vector *SizeOptimizer::optimizeXYZ(GeneralTransform &tr, polysZ = polysY; recPolysZ = recPolysY; } else { - polysZ = generatePolys(tr.Z, tr.isFloat, crop); + polysZ = generatePolys(tr.Z, tr.isFloat, crop, disallowOptimization || dimensionCount < 3); recPolysZ = filterOptimal(polysZ, crop); } @@ -213,6 +310,15 @@ std::vector *SizeOptimizer::optimizeXYZ(GeneralTransform &tr, // we can take nbest only, as others very probably won't be faster found++; GeneralTransform t((int)x.value, (int)y.value, (int)z.value, tr); + + if (!disallowRotation && t.Y != 1) { + swapSizes2D(t, x, y); + } + + t.kernelInvocationX = x.invocations; + t.kernelInvocationY = y.invocations; + t.kernelInvocationZ = z.invocations; + result->push_back(t); } } @@ -238,6 +344,27 @@ int SizeOptimizer::getNoOfPrimes(Polynom &poly) { if (poly.exponent3 != 0) counter++; if (poly.exponent5 != 0) counter++; if (poly.exponent7 != 0) counter++; + if (poly.exponent11 != 0) counter++; + return counter; +} + +int SizeOptimizer::getNoOfPrimes(long size) { + int counter = 0; + if (size % 2 == 0) { + counter++; + } + if (size % 3 == 0) { + counter++; + } + if (size % 5 == 0) { + counter++; + } + if (size % 7 == 0) { + counter++; + } + if (size % 11 == 0) { + counter++; + } return counter; } @@ -271,44 +398,131 @@ int SizeOptimizer::getInvocationsV8(Polynom &poly, bool isFloat) { return result; } +int SizeOptimizer::getInvocationsV12(Polynom &poly, bool isFloat) { + int result = 0; + if (isFloat) { + if (poly.value <= V12_REGULAR_MAX_SP) + { + return 1; + } + result += getInvocations(V12_RADIX_2_MAX_SP, poly.exponent2); + result += getInvocations(V12_RADIX_3_MAX_SP, poly.exponent3); + result += getInvocations(V12_RADIX_5_MAX_SP, poly.exponent5); + result += getInvocations(V12_RADIX_7_MAX_SP, poly.exponent7); + result += getInvocations(V12_RADIX_11_MAX_SP, poly.exponent11); + } else { + if (poly.value <= V12_REGULAR_MAX_DP) + { + return 1; + } + result += getInvocations(V12_RADIX_2_MAX_DP, poly.exponent2); + result += getInvocations(V12_RADIX_3_MAX_DP, poly.exponent3); + result += getInvocations(V12_RADIX_5_MAX_DP, poly.exponent5); + result += getInvocations(V12_RADIX_7_MAX_DP, poly.exponent7); + result += getInvocations(V12_RADIX_11_MAX_DP, poly.exponent11); + } + return result; +} + + int SizeOptimizer::getInvocations(Polynom &poly, bool isFloat) { switch (version) { case (CudaVersion::V_8): return getInvocationsV8(poly, isFloat); - // case (CudaVersion::V_9): - // return getInvocationsV9(poly); // FIXME implement + case (CudaVersion::V_12): + return getInvocationsV12(poly, isFloat); default: throw std::domain_error("Unsupported version of CUDA"); } } +SizeOptimizer::Polynom SizeOptimizer::SetCorrectValuesToOriginalPolynom(int num, bool isFloat) { + Polynom poly; + poly.value = num; + + while (num % 2 == 0) { + poly.exponent2++; + num /= 2; + } + while (num % 3 == 0) { + poly.exponent3++; + num /= 3; + } + while (num % 5 == 0) { + poly.exponent5++; + num /= 5; + } + while (num % 7 == 0) { + poly.exponent7++; + num /= 7; + } + while (num % 11 == 0) { + poly.exponent11++; + num /= 11; + } + + if (num == 1) { + poly.invocations = getInvocations(poly, isFloat); + poly.noOfPrimes = getNoOfPrimes(poly); + } + else { + poly.invocations = INT_MAX; + poly.noOfPrimes = INT_MAX; + } + + return poly; +} + std::vector *SizeOptimizer::generatePolys( - size_t num, bool isFloat, bool crop) { + size_t num, bool isFloat, bool crop, bool useOriginalSize) { + std::vector *result = new std::vector(); + + if (useOriginalSize) { + Polynom p = SetCorrectValuesToOriginalPolynom(num, isFloat); + + result->push_back(p); + + return result; + } + size_t maxPow2 = std::ceil(log(num) * log_2); size_t max = std::pow(2, maxPow2); size_t maxPow3 = std::ceil(std::log(max) * log_3); size_t maxPow5 = std::ceil(std::log(max) * log_5); size_t maxPow7 = std::ceil(std::log(max) * log_7); + size_t maxPow11 = std::ceil(std::log(max) * log_11); - for (size_t a = 1; a <= maxPow2; a++) { // we want at least one multiple of two + for (size_t a = 0; a <= maxPow2; a++) { for (size_t b = 0; b <= maxPow3; b++) { for (size_t c = 0; c <= maxPow5; c++) { for (size_t d = 0; d <= maxPow7; d++) { - size_t value = std::pow(2, a) * std::pow(3, b) - * std::pow(5, c) * std::pow(7, d); - bool incCond = !crop && ((value >= num) && (value <= max)); - bool decrCond = crop && (value <= num); - if (incCond || decrCond) { - Polynom p; - p.value = value; - p.exponent2 = a; - p.exponent3 = b; - p.exponent5 = c; - p.exponent7 = d; - p.invocations = getInvocations(p, isFloat); - p.noOfPrimes = getNoOfPrimes(p); - result->push_back(p); + for (size_t e = 0; e <= maxPow11; e++) { + size_t value = std::pow(2, a) * std::pow(3, b) + * std::pow(5, c) * std::pow(7, d) + * std::pow(11, e); + + if (a == 0) { + // we want at least one multiple of two if regular_fft kernel is not induced + if ((isFloat && value > V12_REGULAR_MAX_SP) || (!isFloat && value > V12_REGULAR_MAX_DP)) { + continue; + } + } + + bool incCond = !crop && ((value >= num) && (value <= max)); + bool decrCond = crop && (value <= num); + if (incCond || decrCond) { + Polynom p; + p.value = value; + p.exponent2 = a; + p.exponent3 = b; + p.exponent5 = c; + p.exponent7 = d; + p.exponent11 = e; + p.invocations = getInvocations(p, isFloat); + p.noOfPrimes = getNoOfPrimes(p); + result->push_back(p); + } } } } @@ -348,8 +562,11 @@ std::set // add all polynoms which have a minimal number of kernel invocations for (size_t i = 0; i < size; i++) { Polynom &tmp = input->at(i); + // tmp.invocations -> number of kernel invocations + // tmp.noOfPrimes -> number of primes in size factorization + // -> cannot use more than 5 (2, 3, 5, 7, 11) if ((tmp.invocations <= (minInv.invocations + 2)) && - (tmp.noOfPrimes <= 4)) { + (tmp.noOfPrimes <= 5)) { result->insert(tmp); } } diff --git a/sizeOptimizer.h b/sizeOptimizer.h index 2cdc536..ee7b0ae 100644 --- a/sizeOptimizer.h +++ b/sizeOptimizer.h @@ -15,13 +15,28 @@ namespace cuFFTAdvisor { class SizeOptimizer { private: struct Polynom { - size_t value; - int invocations; - int noOfPrimes; - size_t exponent2; - size_t exponent3; - size_t exponent5; - size_t exponent7; + size_t value = 1; + int invocations = 0; + int noOfPrimes = 0; + size_t exponent2 = 0; + size_t exponent3 = 0; + size_t exponent5 = 0; + size_t exponent7 = 0; + size_t exponent11 = 0; + + Polynom() = default; + + Polynom(size_t value, int invocations, int noOfPrimes, size_t exponent2, size_t exponent3, size_t exponent5, size_t exponent7, size_t exponent11) + { + value = value; + invocations = invocations; + noOfPrimes = noOfPrimes; + exponent2 = exponent2; + exponent3 = exponent3; + exponent5 = exponent5; + exponent7 = exponent7; + exponent11 = exponent11; + } }; struct valueComparator { @@ -45,23 +60,44 @@ class SizeOptimizer { SizeOptimizer(CudaVersion::CudaVersion version, GeneralTransform &tr, bool allowTrans); std::vector *optimize(size_t nBest, int maxPercIncrease, - int maxMemMB, bool squareOnly, - bool crop); + int maxMemMB, bool disallowRotation, + bool disallowOptimization, int dimensionCount, + bool squareOnly, bool crop); private: + + void swapSizes(GeneralTransform &in); + /** + * This method (if not disallowed) rotate sizes in 2D configurations based on + * trained decision tree (rotation_decision_tree_marked_leaves.png). + * This function is basically desicion tree rewritten to source code leaving + * empty branches (i.e. branches, that end with "not swapped") + * + * Used parameters: + * divisibility by 2; called kernel count; distinct prime count; + * size comparison between X and Y; size difference between X and Y; + */ + bool swapSizes2D(GeneralTransform &in, const Polynom &x, const Polynom &y); + int getNoOfPrimes(Polynom &poly); + int getNoOfPrimes(long size); int getInvocations(int maxPower, size_t num); + + SizeOptimizer::Polynom SetCorrectValuesToOriginalPolynom(int num, bool isFloat); + std::vector *> optimize(GeneralTransform &tr, size_t nBest, int maxPercIncrease); int getInvocations(Polynom &poly, bool isFloat); int getInvocationsV8(Polynom &poly, bool isFloat); + int getInvocationsV12(Polynom &poly, bool isFloat); std::set *filterOptimal( std::vector *input, bool crop); - std::vector *generatePolys(size_t num, bool isFloat, bool crop); + std::vector *generatePolys(size_t num, bool isFloat, bool crop, bool disallowRotation); std::vector *optimizeXYZ(GeneralTransform &tr, size_t nBest, - int maxPercIncrease, bool squareOnly, - bool crop); + int maxPercIncrease, bool disallowRotation, + bool disallowOptimization, int dimensionCount, + bool squareOnly, bool crop); std::vector *optimizeN( std::vector *transforms, size_t maxMem, size_t nBest); void collapseBatched(GeneralTransform >, size_t maxMem, @@ -82,15 +118,34 @@ class SizeOptimizer { const double log_3; const double log_5; const double log_7; + const double log_11; + static const int V8_2D_REGULAR_MAX_SP = 0; static const int V8_RADIX_2_MAX_SP = 10; static const int V8_RADIX_3_MAX_SP = 6; static const int V8_RADIX_5_MAX_SP = 3; static const int V8_RADIX_7_MAX_SP = 3; + + + static const int V8_2D_REGULAR_MAX_DP = 0; static const int V8_RADIX_2_MAX_DP = 9; static const int V8_RADIX_3_MAX_DP = 5; static const int V8_RADIX_5_MAX_DP = 3; static const int V8_RADIX_7_MAX_DP = 3; + + static const int V12_REGULAR_MAX_SP = 5103; + static const int V12_RADIX_2_MAX_SP = 11; + static const int V12_RADIX_3_MAX_SP = 7; + static const int V12_RADIX_5_MAX_SP = 4; + static const int V12_RADIX_7_MAX_SP = 3; + static const int V12_RADIX_11_MAX_SP = 3; + + static const int V12_REGULAR_MAX_DP = 2187; + static const int V12_RADIX_2_MAX_DP = 11; + static const int V12_RADIX_3_MAX_DP = 6; + static const int V12_RADIX_5_MAX_DP = 5; + static const int V12_RADIX_7_MAX_DP = 4; + static const int V12_RADIX_11_MAX_DP = 3; static const Polynom UNIT; }; diff --git a/transform.h b/transform.h index 4497f1d..66c0ab8 100644 --- a/transform.h +++ b/transform.h @@ -16,7 +16,8 @@ class Transform { enum Rank { RANK_1D = 1, RANK_2D = 2, RANK_3D = 3 }; Transform(int device, int X, int Y, int Z, int N, bool isBatched, - bool isFloat, bool isForward, bool isInPlace, bool isReal) + bool isFloat, bool isForward, bool isInPlace, bool isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ) : device(device), X(X), Y(Y), @@ -26,7 +27,10 @@ class Transform { isInPlace(isInPlace), isReal(isReal), isFloat(isFloat), - isForward(isForward) { + isForward(isForward), + kernelInvocationX(kernelInvocationX), + kernelInvocationY(kernelInvocationY), + kernelInvocationZ(kernelInvocationZ) { // preserve order of these methods! setRankInfo(); setTypeInfo(); @@ -47,6 +51,10 @@ class Transform { bool isFloat; // otherwise double bool isForward; // otherwise inverse + int kernelInvocationX; + int kernelInvocationY; + int kernelInvocationZ; + // derived Rank rank; size_t elems; // of the transform // FIXME remove diff --git a/transformGenerator.cpp b/transformGenerator.cpp index 73d64fb..71c4c36 100644 --- a/transformGenerator.cpp +++ b/transformGenerator.cpp @@ -5,14 +5,19 @@ namespace cuFFTAdvisor { void TransformGenerator::generate(int device, int x, int y, int z, int n, bool isBatched, bool isFloat, bool isForward, bool isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, + int kernelInvocationY, + int kernelInvocationZ, std::vector &result) { if (Tristate::FALSE != isReal) { result.push_back(new Transform(device, x, y, z, n, isBatched, isFloat, - isForward, isInPlace, true)); + isForward, isInPlace, true, + kernelInvocationX, kernelInvocationY, kernelInvocationZ)); } if (Tristate::TRUE != isReal) { result.push_back(new Transform(device, x, y, z, n, isBatched, isFloat, - isForward, isInPlace, false)); + isForward, isInPlace, false, + kernelInvocationX, kernelInvocationY, kernelInvocationZ)); } } @@ -20,14 +25,17 @@ void TransformGenerator::generate(int device, int x, int y, int z, int n, bool isBatched, bool isFloat, bool isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, + int kernelInvocationY, + int kernelInvocationZ, std::vector &result) { if (Tristate::FALSE != isInPlace) { generate(device, x, y, z, n, isBatched, isFloat, isForward, true, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } if (Tristate::TRUE != isInPlace) { generate(device, x, y, z, n, isBatched, isFloat, isForward, false, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } } @@ -36,14 +44,17 @@ void TransformGenerator::generate(int device, int x, int y, int z, int n, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, + int kernelInvocationY, + int kernelInvocationZ, std::vector &result) { if (Tristate::FALSE != isForward) { generate(device, x, y, z, n, isBatched, isFloat, true, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } if (Tristate::TRUE != isForward) { generate(device, x, y, z, n, isBatched, isFloat, false, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } } @@ -52,14 +63,17 @@ void TransformGenerator::generate(int device, int x, int y, int z, int n, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, + int kernelInvocationY, + int kernelInvocationZ, std::vector &result) { if (Tristate::FALSE != isFloat) { generate(device, x, y, z, n, isBatched, true, isForward, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } if (Tristate::TRUE != isFloat) { generate(device, x, y, z, n, isBatched, false, isForward, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } } @@ -69,17 +83,31 @@ void TransformGenerator::generate(int device, int x, int y, int z, int n, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, + int kernelInvocationY, + int kernelInvocationZ, std::vector &result) { if (Tristate::FALSE != isBatched) { generate(device, x, y, z, n, true, isFloat, isForward, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } if (Tristate::TRUE != isBatched) { generate(device, x, y, z, n, false, isFloat, isForward, isInPlace, isReal, - result); + kernelInvocationX, kernelInvocationY, kernelInvocationZ, result); } } +void TransformGenerator::generate(int device, int x, int y, int z, int n, + Tristate::Tristate isBatched, + Tristate::Tristate isFloat, + Tristate::Tristate isForward, + Tristate::Tristate isInPlace, + Tristate::Tristate isReal, + std::vector &result) { + generate(device, x, y, z, n, isBatched, isFloat, isForward, isInPlace, isReal, + 0, 0, 0, result); +} + void TransformGenerator::transpose(GeneralTransform &tr, std::vector &result) { std::set, TransposeComp> candidates; diff --git a/transformGenerator.h b/transformGenerator.h index 14ff42e..7cf9f40 100644 --- a/transformGenerator.h +++ b/transformGenerator.h @@ -16,24 +16,35 @@ class TransformGenerator { Tristate::Tristate isInPlace, Tristate::Tristate isReal, std::vector &result); + static void generate(int device, int x, int y, int z, int n, + Tristate::Tristate isBatched, Tristate::Tristate isFloat, + Tristate::Tristate isForward, + Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ, + std::vector &result); + static void generate(int device, int x, int y, int z, int n, bool isBatched, bool isFloat, bool isForward, bool isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ, std::vector &result); static void generate(int device, int x, int y, int z, int n, bool isBatched, bool isFloat, bool isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ, std::vector &result); static void generate(int device, int x, int y, int z, int n, bool isBatched, bool isFloat, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ, std::vector &result); static void generate(int device, int x, int y, int z, int n, bool isBatched, Tristate::Tristate isFloat, Tristate::Tristate isForward, Tristate::Tristate isInPlace, Tristate::Tristate isReal, + int kernelInvocationX, int kernelInvocationY, int kernelInvocationZ, std::vector &result); static void transpose(GeneralTransform &tr, diff --git a/utils.h b/utils.h index db25da5..585040d 100644 --- a/utils.h +++ b/utils.h @@ -50,7 +50,7 @@ static inline const char *toString(Tristate t) { } // namespace Tristate namespace CudaVersion { // FIXME implement auto detection -enum CudaVersion { V_8, V_9 }; +enum CudaVersion { V_8, V_9, V_12 }; } // namespace CudaVersion static inline const char *toString(cuFFTAdvisor::Tristate::Tristate t) { @@ -75,4 +75,8 @@ static inline bool safeEquals(const char *l, const char *r) { } // namespace cuFFTAdvisor +const int MaxNumberOfOptimizedDimensions = 3; + +const size_t MinimalCountOfUsedConfigurations = 10; + #endif // CUFFTADVISOR_UTILS_H_ diff --git a/validator.cpp b/validator.cpp index 75bd53a..2900b2f 100644 --- a/validator.cpp +++ b/validator.cpp @@ -4,6 +4,7 @@ namespace cuFFTAdvisor { void Validator::validate(int x, int y, int z, int n, int device, int maxSignalInc, int maxMemMB, + int countOfOptimizedDimensions, bool allowTrans, bool squareOnly) { validate(x, y, z, n, device); @@ -25,6 +26,12 @@ void Validator::validate(int x, int y, int z, int n, int device, if (allowTrans && squareOnly) { throw std::logic_error("Incompatible parameters. See help for detailed info.\n"); } + + if (countOfOptimizedDimensions <= 0 || countOfOptimizedDimensions > 3) { + throw std::logic_error( + "Count of optimized input size dimensions must be between 1 and 3. Wrong input or int " + "overflow\n"); + } } void Validator::validate(int device) { diff --git a/validator.h b/validator.h index 7d5ef76..ba0438c 100644 --- a/validator.h +++ b/validator.h @@ -11,7 +11,7 @@ class Validator { public: static void validate(int x, int y, int z, int n, int device); static void validate(int x, int y, int z, int n, int device, int maxSignalInc, - int maxMemMB, bool allowTrans, bool squareOnly); + int maxMemMB, int countOfOptimizedDimensions, bool allowTrans, bool squareOnly); static void validate(int device); };