From 9965c7cf6d5454a969aaf62e1ed1eb3736374a9b Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Sun, 26 Oct 2025 13:48:34 -0700
Subject: [PATCH 01/28] Introduce clyso-nines

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 634 +++++++++++++++++++++
 1 file changed, 634 insertions(+)
 create mode 100755 otto/src/clyso/ceph/otto/tools/clyso-nines

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
new file mode 100755
index 0000000..4d2374b
--- /dev/null
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -0,0 +1,634 @@
+#!/usr/bin/env python3
+
+# clyso-nines - A tool to estimate data loss risk in Ceph clusters
+#
+# Copyright (C) 2025 Dan van der Ster
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <https://www.gnu.org/licenses/>.
+
+import argparse
+from collections import defaultdict
+import math
+import random
+import time
+import shlex
+
+# ---------------------------- Utilities ----------------------------
+
+def reconstruct_command_line(parser, args):
+    command_parts = [parser.prog, args.command]
+
+    for arg in vars(args):
+        if arg == 'command' or arg == 'func':
+            continue
+        value = getattr(args, arg)
+        if isinstance(value, bool):
+            if value:
+                command_parts.append(f"--{arg.replace('_', '-')}")
+        elif value is not None:
+            command_parts.append(f"--{arg.replace('_', '-')}")
+            command_parts.append(shlex.quote(str(value)))
+    return ' '.join(command_parts)
+
+def round_to_nearest_power_of_two(n):
+    if n <= 0:
+        raise ValueError("Input must be a positive number.")
+    lower = 2 ** math.floor(math.log2(n))
+    upper = 2 ** math.ceil(math.log2(n))
+    return lower if (n - lower) < (upper - n) else upper
+
+def count_nines(value, max_digits=20):
+    # Round to given precision
+    rounded = round(value, max_digits)
+    s = f"{rounded:.{max_digits}f}"
+
+    if not s.startswith("0."):
+        return f"{max_digits} nines (1.0)" # Treat 1.0 as "max nines"
+
+    decimal_part = s.split('.')[1]
+    count = 0
+    for c in decimal_part:
+        if c == '9':
+            count += 1
+        else:
+            break
+    return f"{count}-nines" if count > 0 else "0-nines (0.0)"
+
+def comb(n, k):
+    """Return C(n, k) with math.comb-like semantics."""
+    if not (isinstance(n, int) and isinstance(k, int)):
+        raise TypeError("n and k must be integers")
+    if n < 0 or k < 0 or k > n:
+        raise ValueError("n must be >= 0 and 0 <= k <= n")
+    k = min(k, n - k)  # symmetry
+    c = 1
+    for i in range(1, k + 1):
+        c = c * (n - i + 1) // i
+    return c
+
+def mttdl(N, k, afr, fudge, recovery_time_sec):
+    T_year = 31_536_000  # seconds in a year
+    p = afr * recovery_time_sec / T_year
+    prob_k_failures = fudge*comb(N, k) * (p ** k)
+    mttdl_seconds = recovery_time_sec / prob_k_failures
+    return mttdl_seconds
+
+def prob_exact_k_failures(n, p, k):
+    return comb(n, k) * (p ** k) * ((1 - p) ** (n - k))
+
+def prob_k_or_more_failures(n, p, k):
+    return sum(comb(n, i) * (p ** i) * ((1 - p) ** (n - i)) for i in range(k, n + 1))
+
+def expected_max_osd_load(n_osds, n_pgs):
+    if n_pgs == 0 or n_osds == 0:
+        return float('inf')
+    if n_pgs < n_osds:
+        # Sparse case: use log-ratio approximation
+        return math.ceil(math.log(n_osds) / math.log(n_osds / n_pgs))
+    else:
+        # Dense case: average load + deviation
+        avg = n_pgs / n_osds
+        deviation = math.sqrt(2 * avg * math.log(n_osds))
+        return math.ceil(avg + deviation)
+
+def format_bytes(val):
+    units = ["B", "KB", "MB", "GB", "TB", "PB"]
+    thresholds = [1, 1024, 1024**2, 1024**3, 1024**4, 1024**5]
+
+    for i in reversed(range(1, len(units))):
+        if val >= thresholds[i]:
+            return f"~{val / thresholds[i]:.1f}{units[i]}"
+    return f"~{round(val)}B"
+
+def format_int(val):
+    units = ["", "K", "M", "B"]
+    thresholds = [1, 1000, 1000**2, 1000**3]
+
+    for i in reversed(range(1, len(units))):
+        if val >= thresholds[i]:
+            return f"~{val / thresholds[i]:.1f}{units[i]}"
+    return f"~{round(val)}"
+
+def format_time(seconds):
+    units = ["s", "min", "hr", "days", "years", "thousand years", "million years", "billion years"]
+    thresholds = [1, 60, 3600, 86400, 86400*365, 86400*365*1000, 86400*365*10**6, 86400*365*10**9]
+
+    for i in reversed(range(1, len(units))):
+        if seconds >= thresholds[i]:
+            return f"{round(seconds / thresholds[i], 1)} {units[i]}"
+    return f"{round(seconds)} s"
+
+def simple_crush(osd_ids, number_of_domains, size, num_pgs):
+    if size > number_of_domains:
+        raise ValueError("Replication size cannot be greater than the number of failure domains.")
+
+    # Group OSDs into failure domains evenly
+    domains = defaultdict(list)
+    for i, osd in enumerate(osd_ids):
+        domain = i % number_of_domains
+        domains[domain].append(osd)
+
+    domain_ids = list(domains.keys())
+
+    pgs = []
+    for _ in range(num_pgs):
+        # Randomly choose failure domains for this PG
+        chosen_domains = random.sample(domain_ids, size)
+        pg_osds = []
+        for domain in chosen_domains:
+            # Pick a random OSD from the selected domain
+            pg_osds.append(random.choice(domains[domain]))
+        pgs.append(tuple(pg_osds))
+
+    return pgs
+
+def confidence_interval(successes, trials, confidence=0.95):
+    if trials == 0:
+        return (0, 0)
+    p_hat = successes / trials
+    z = 1.96  # For 95% confidence
+    margin_of_error = z * ((p_hat * (1 - p_hat)) / trials) ** 0.5
+    return (max(0, p_hat - margin_of_error), min(1, p_hat + margin_of_error))
+
+
+def effective_recovery_rate(object_size_bytes,
+                            recovery_bw=50*1024*1024, # 50 MB/s
+                            osd_recovery_sleep=0.1,
+                            objects_per_chunk=30):
+    """
+    Calculate Ceph effective recovery throughput considering chunking and sleep.
+
+    object_size_bytes: size of one object in bytes
+    recovery_bw: raw recovery bandwidth (bytes/s)
+    osd_recovery_sleep: sleep time after each chunk (seconds)
+    objects_per_chunk: number of objects recovered per batch (default 30)
+    """
+    chunk_size_bytes = (object_size_bytes * objects_per_chunk)
+
+    # Time to recover chunk at given bandwidth (seconds)
+    transfer_time = chunk_size_bytes / recovery_bw
+
+    # Add recovery sleep per chunk
+    total_time = transfer_time + osd_recovery_sleep
+
+    # Effective throughput (Bps)
+    return chunk_size_bytes / total_time
+
+
+# ---------------------------- Simulate Subcommand ----------------------------
+def simulate(args):
+
+    # Set random seed for reproducibility
+    random.seed(args.seed)
+
+    raw_to_stored_factor = 0
+    profile = ""
+    # Derive PG size and min_size
+    if args.k is not None and args.m is not None:
+        pg_size = args.k + args.m
+        min_size = args.k
+        raw_to_stored_factor = args.k / (args.k + args.m)
+        profile = f"{args.k}+{args.m} erasure"
+        num_failures = args.m + 1
+    elif args.size is not None:
+        pg_size = args.size
+        min_size = 1
+        raw_to_stored_factor = 1 / (args.size)
+        profile = f"{args.size}x replicated"
+        num_failures = args.size
+    else:
+        raise ValueError("Specify either --size or both --k and --m.")
+
+    num_simulations = args.simulations
+    if num_simulations < 100:
+        raise ValueError("Number of simulations must be at least 100 for meaningful results.")
+    if num_simulations % 50 != 0:
+        raise ValueError("Number of simulations must be a multiple of 50 for progress display.")
+
+    print(f"---------------------------")
+    print()
+
+    if not hasattr(args, 'pgs'):
+        # simulating a randomized pool
+
+        num_domains = args.num_domains
+        pgs_per_osd = args.pgs_per_osd
+        num_osds = args.num_osds
+
+        failure_domain = args.failure_domain
+
+        if num_osds % args.num_domains != 0:
+            raise ValueError("Number of OSDs must be divisible by number of domains.")
+        osds_per_domain = int(num_osds / args.num_domains)
+
+        num_pgs = round_to_nearest_power_of_two((num_osds * pgs_per_osd) / pg_size)
+        osds = list(range(1, num_osds + 1))
+        pgs = simple_crush(osds,num_domains,pg_size,num_pgs) # [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)]
+
+        # Print simulation summary
+        print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})")
+        print(f"  PGs: {num_pgs} total, averaging {num_pgs * pg_size / num_osds:.3g} per OSD")
+#        print(f"  Total data: {format_bytes(total_bytes)} raw, {format_bytes(raw_to_stored_factor*total_bytes)} stored")
+
+#        print(f"  Data per {failure_domain}: {format_bytes(osds_per_domain*bytes_per_osd)} raw, {format_bytes(raw_to_stored_factor*osds_per_domain*bytes_per_osd)} stored")
+    else:
+        # simulating failures on a given pool
+        pgs = args.pgs
+        osds = args.osds
+        num_pgs = len(pgs)
+        num_osds = len(osds)
+        pgs_per_osd = num_pgs * pg_size / num_osds
+
+        # Print analyze summary
+        print(f"Analyzing a {profile} pool with {num_osds} OSDs and {num_pgs} PGs")
+        print(f"  PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD")
+
+    # Re-derive the stored bytes/objects per PG from tb_per_osd
+    bytes_per_osd = args.tb_per_osd * 1024**4
+    total_bytes = num_osds * bytes_per_osd
+    bytes_per_pg = total_bytes / num_pgs * raw_to_stored_factor
+    try:
+        objects_per_pg = bytes_per_pg / args.object_size
+    except ZeroDivisionError:
+        objects_per_pg = 0
+    if args.k is not None:
+        bytes_per_pg /= args.k
+
+    print(f"  Data per PG: {format_bytes(bytes_per_pg)}, {format_int(objects_per_pg)} objects")
+    print(f"  Example PG: {pgs[0]}\n")
+
+    print(f"---------------------------")
+    print()
+
+    # simulate the major incidents
+    print(f"Monte Carlo Sim: n={num_simulations} {num_failures}x failures")
+    threshold = pg_size - min_size
+    num_batches = 50
+    batch_size = num_simulations // num_batches
+    simulations_with_loss = 0
+    total_pg_losses = 0
+    print()
+    print(f"  |{'-' * num_batches}|")
+    print(f"  |", end="", flush=True)
+    simulations_done = 0
+    sim_runtime = 10 # seconds
+    start_time = time.time()
+    for batch in range(num_batches):
+        batch_good = True
+        for _ in range(int(num_simulations/num_batches)):
+            simulations_done += 1
+            failed_osds = set(random.sample(osds, num_failures))
+            dead_pgs = sum(
+                1 for pg in pgs if sum(1 for osd in pg if osd in failed_osds) > threshold
+            )
+            total_pg_losses += dead_pgs
+            if dead_pgs > 0:
+                simulations_with_loss += 1
+                batch_good = False
+        if batch_good:
+            print(".",end="",flush=True)
+        else:
+            print("x",end="",flush=True)
+        # Exit early if taking too long
+        elapsed = time.time() - start_time
+        if elapsed > sim_runtime:
+            print(" taking too long, exiting early. ",end="",flush=True)
+            num_simulations = simulations_done
+            break
+
+    print(f"|")
+    print(f"  |{'-' * num_batches}|")
+    print()
+
+    print(f"  Theoretical loss rate calculation:")
+    if args.k is not None:
+        print(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)} ")
+        print(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
+        print(f"    Loss rate: {num_pgs} PGs * {comb(pg_size, num_failures)} / {comb(num_osds, num_failures)} combinations")
+        loss_rate = num_pgs / comb(num_osds, num_failures) * comb(pg_size, num_failures)
+    else:
+        print(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)}")
+        print(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
+        print(f"    Loss rate: {num_pgs} PGs / {comb(num_osds, num_failures)} combinations")
+        loss_rate = num_pgs / comb(num_osds, num_failures)
+    loss_rate = min(1.0, loss_rate)
+    print(f"  Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)")
+    if simulations_with_loss > 0:
+        loss_rate = simulations_with_loss / num_simulations
+        print(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
+    else:
+        print(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
+        print("  Warning: No data loss observed in simulations, using theoretical estimate.")
+        total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
+
+    confidence = confidence_interval(simulations_with_loss, num_simulations)
+    print(f"  95% Confidence Interval: {confidence[0] * 100:.3g}% - {confidence[1] * 100:.3g}%")
+
+    pgs_lost_per_event = total_pg_losses / num_simulations
+    pct_data_loss = total_pg_losses / (num_simulations * num_pgs)
+    print(f"  Expected Data Loss Per Incident: {pgs_lost_per_event:.3g} PGs, {format_bytes(pgs_lost_per_event*bytes_per_pg)} ({pct_data_loss * 100:.3g}%)")
+
+    # Try to compute a meaningful MTTDL and Nines Durability Number
+    print("")
+    print("---------------------------")
+    print("")
+
+    print(f"Estimating the probability of {num_failures}x failures")
+    afr = args.afr
+    if pg_size > 1:
+        print(f"  Estimated per-drive AFR: {afr}")
+
+        # recovery rate is a function of object size, recovery bandwidth, and OSD recovery sleep
+        recovery_rate = effective_recovery_rate(args.object_size,
+                                                args.recovery_rate*1024*1024,
+                                                args.osd_recovery_sleep,
+                                                objects_per_chunk=30)
+
+        print(f"  Estimated per-OSD recovery rate: {format_bytes(recovery_rate)}ps ({format_int(recovery_rate/args.object_size)} objects/s)")
+
+        recovery_time_per_pg = bytes_per_pg / recovery_rate
+        print(f"  Expected PG recovery time ({format_bytes(bytes_per_pg)}/{format_bytes(recovery_rate)}ps): {format_time(recovery_time_per_pg)}")
+
+        # Estimate max OSD load during recovery when things are critical
+        # e.g. for 3x repl: how long to recover 2 OSDs failed.
+        # e.g. for 6+3 EC: how long to recover 3 OSDs failed.
+        if args.k is not None:
+            critical_osds = args.m
+        else:
+            critical_osds = args.size - 1
+        num_osds_left = num_osds - num_failures + 1
+        # How many PGs need to be recovered
+        max_osd_load = expected_max_osd_load(num_osds_left, critical_osds*pgs_per_osd)
+
+        print(f"  Expected most loaded OSD when recovering {format_int((num_failures-1)*pgs_per_osd)} PGs to {num_osds-num_failures+1} OSDs = {max_osd_load} PGs")
+
+        recovery_time = max(max_osd_load * recovery_time_per_pg, 60) # minimum 60 seconds
+        print(f"  Expected critical period ({max_osd_load} PGs * {format_time(recovery_time_per_pg)}) = {format_time(recovery_time)}")
+
+        critical_windows_per_year = 31536000 / recovery_time
+        print(f"  Expected critical periods annually (1yr / {format_time(recovery_time)}) = {critical_windows_per_year:.1f}")
+
+        failure_prob_per_disk = recovery_time / 31536000 * afr
+        print(f"  Expected per-OSD critical period failure rate = {failure_prob_per_disk:.2g} (1 in {format_int(1/failure_prob_per_disk)})")
+
+        print(f"  Failure correlation factor = {args.fudge}")
+
+        failure_prob = args.fudge * prob_k_or_more_failures(num_osds, failure_prob_per_disk, num_failures)
+        print(f"  Probability of {num_failures}+ failures in any critical period ({format_time(recovery_time)}) = 1 in {format_int(1/failure_prob)}")
+
+        expected_annual_events = critical_windows_per_year * failure_prob
+        print(f"  Expected annual {num_failures}+ failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
+        mtt_multi_failure = mttdl(num_osds, num_failures, afr, args.fudge, recovery_time)
+        print(f"  Alternate calculation (mean time to {num_failures}+ concurrent failures) = {format_time(mtt_multi_failure)}")
+    else: # size == 1 (no redundancy)
+        print("  No redundancy configured, data loss is certain on any failure.")
+        expected_annual_events = num_osds * afr
+        mtt_multi_failure = 31536000 / expected_annual_events
+        print(f"  Expected annual failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
+
+
+
+    print("---------------------------")
+    print("")
+
+    min_loss_rate = confidence[0]
+    max_loss_rate = confidence[1]
+
+    print(f"Final MTTDL/Durability Estimates:")
+    print("")
+    print(f"  Mean time to {num_failures}x failure: {format_time(mtt_multi_failure)}")
+    try:
+        print(f"  Probability that {num_failures}x failure causes data loss (95% CI): {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to 1 in {format_int(1/min_loss_rate)})")
+    except ZeroDivisionError:
+        print(f"  Probability that {num_failures}x failure causes data loss (95% CI): {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)")
+    print("")
+    print(f"Combining the above to estimate annual data loss:")
+
+    # Compute MTTDL
+    expected_losses_per_year = expected_annual_events * loss_rate
+    min_expected_losses_per_year = expected_annual_events * min_loss_rate
+    max_expected_losses_per_year = expected_annual_events * max_loss_rate
+    mttdl_sec = 31536000/expected_losses_per_year
+    min_mttdl_sec = 31536000/max_expected_losses_per_year
+    try:
+        max_mttdl_sec = 31536000/min_expected_losses_per_year
+        print(f"  Mean Time To Data Loss (MTTDL): {format_time(mttdl_sec)}")
+        print(f"    (95% CI: {format_time(min_mttdl_sec)} - {format_time(max_mttdl_sec)})")
+    except ZeroDivisionError:
+        print(f"  Mean Time To Data Loss (MTTDL): {format_time(mttdl_sec)}")
+        print(f"    (95% CI: {format_time(min_mttdl_sec)} - ∞)")
+
+    # Compute Nines Durability Number
+    expected_pgs_lost_per_year = expected_losses_per_year * pgs_lost_per_event
+    min_expected_pgs_lost_per_year = min_expected_losses_per_year * pgs_lost_per_event
+    max_expected_pgs_lost_per_year = max_expected_losses_per_year * pgs_lost_per_event
+    expected_durability = 1 - (expected_pgs_lost_per_year / num_pgs)
+    min_expected_durability = 1 - (max_expected_pgs_lost_per_year / num_pgs)
+    max_expected_durability = 1 - (min_expected_pgs_lost_per_year / num_pgs)
+    print(f"  Durability: {count_nines(expected_durability)}")
+    print(f"    (95% CI: {count_nines(min_expected_durability)} to {count_nines(max_expected_durability)})")
+
+    expected_bytes_lost_per_year = expected_pgs_lost_per_year * bytes_per_pg
+    min_expected_bytes_lost_per_year = min_expected_pgs_lost_per_year * bytes_per_pg
+    max_expected_bytes_lost_per_year = max_expected_pgs_lost_per_year * bytes_per_pg
+
+    print(f"  Expected annual data loss: {format_bytes(expected_bytes_lost_per_year)}")
+    print(f"    (95% CI: {format_bytes(min_expected_bytes_lost_per_year)} - {format_bytes(max_expected_bytes_lost_per_year)})")
+
+    return (format_time(mttdl_sec), count_nines(expected_durability))
+
+
+# ---------------------------- Analyze Subcommand ----------------------------
+
+def analyze_pool(args, pool, pg_dump):
+    print(f"Analyzing pool '{pool['pool_name']}' (id {pool['pool_id']}, size {pool['size']}) ")
+    if 'erasure_code_profile' in pool and pool['erasure_code_profile']:
+        print(f"  Type: erasure-coded ({pool['erasure_code_profile']})")
+        # For simplicity, assume standard k+m profile naming
+        try:
+            k, m = map(int, pool['erasure_code_profile'].split('+'))
+            args.k = k
+            args.m = m
+            args.size = None
+        except:
+            print("  Warning: Unable to parse erasure code profile. Please specify --k and --m manually.")
+            return
+    else:
+        print(f"  Type: replicated")
+        args.size = pool['size']
+        args.k = None
+        args.m = None
+
+    args.pg_num = pool['pg_num']
+
+    pg_stats = pg_dump['pg_map']['pg_stats']
+    pgs = [pg['acting'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}.")]
+    assert len(pgs) == args.pg_num, "PG count mismatch!"
+    args.pgs = pgs
+
+    osds = set()
+    for pg in pg_stats:
+        osds.update(pg['acting'])
+        osds.update(pg['up'])
+
+    args.osds = list(osds)
+    args.num_osds = pg_dump['pg_map']['osd_stats_sum']['num_osds']
+    assert args.num_osds == len(osds), "OSD count mismatch!"
+
+    print(f"  Number of PGs: {args.pg_num}")
+    print(f"  Number of OSDs: {len(osds)}")
+
+    # Derive tb_per_osd from pool size and pg stat_sum num_bytes
+    bytes_stored = sum(pg['stat_sum']['num_bytes'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}."))
+    objects_stored = sum(pg['stat_sum']['num_objects'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}."))
+    raw_to_stored_factor = 0
+    if args.k is not None and args.m is not None:
+        raw_to_stored_factor = (args.k) / (args.k + args.m)
+    else:
+        raw_to_stored_factor = 1 / (args.size)
+    bytes_raw = bytes_stored / raw_to_stored_factor
+    objects_raw = objects_stored / raw_to_stored_factor
+    args.tb_per_osd = bytes_raw / len(osds) / (1024**4)
+    args.object_size = bytes_raw / objects_raw if objects_raw > 0 else 4*1024*1024 # default to 4MB
+
+    print(f"  Total stored data: {format_bytes(bytes_stored)} ({format_int(objects_stored)} objects)")
+    print(f"  Total raw data: {format_bytes(bytes_raw)} ({format_int(objects_raw)} objects)")
+    print(f"  Data per OSD: {format_bytes(args.tb_per_osd * 1024**4)} ({format_int(objects_raw / len(osds))} objects)")
+    print(f"  Average object size: {format_bytes(args.object_size)}")
+    print("")
+    [mttdl, nines] = simulate(args)
+
+    print(f"AAA {pool['pool_name']}: {mttdl} MTTDL, {nines} Durability")
+
+def analyze(args):
+    import subprocess
+    import json
+
+    # Set random seed for reproducibility
+    random.seed(args.seed)
+
+    cluster = args.cluster
+    pool_name = args.pool
+
+    # fetch pool list
+    try:
+        ceph_cmd = ['ceph', f'--cluster={cluster}', 'osd', 'pool', 'ls', 'detail', '--format=json']
+        pool_json = subprocess.check_output(ceph_cmd)
+        pool_data = json.loads(pool_json)
+    except Exception as e:
+        pool_data = None
+
+    # Try using local copy of ceph-osd-pool-ls-detail.json
+    if pool_data is None:
+        try:
+            with open('ceph-osd-pool-ls-detail.json', 'r') as f:
+                pool_data = json.load(f)
+            print("Using local ceph-osd-pool-ls-detail.json file.")
+        except Exception as e2:
+            print(f"Error loading local pool data: {e2}")
+            return
+
+    # fetch ceph pg dump
+    try:
+        ceph_cmd = ['ceph', f'--cluster={cluster}', 'pg', 'dump', '--format=json']
+        pg_dump_json = subprocess.check_output(ceph_cmd)
+        pg_dump_data = json.loads(pg_dump_json)
+    except Exception as e:
+        pg_dump_data = None
+
+    # Try using local copy of ceph-pg-dump.json
+    if pg_dump_data is None:
+        try:
+            with open('ceph-pg-dump.json', 'r') as f:
+                pg_dump_data = json.load(f)
+            print("Using local ceph-pg-dump.json file.")
+            print("")
+            print("========================================")
+            print("")
+        except Exception as e2:
+            print(f"Error loading local PG dump data: {e2}")
+            return
+
+    found = False
+    for p in pool_data:
+        if p['pool_name'] == pool_name or not pool_name:
+            analyze_pool(args, p, pg_dump_data)
+            found = True
+            print("")
+            print("========================================")
+            print("")
+
+    if not found:
+        print(f"Pool '{pool_name}' not found in cluster '{cluster}'.")
+
+    return
+
+
+# ---------------------------- Main CLI ----------------------------
+def main():
+    parser = argparse.ArgumentParser(
+        prog='clyso-nines',
+        description='Clyso Nines - Data Loss Risk Estimator for Ceph Clusters'
+    )
+    parser.add_argument('--version', action='version', version='clyso-nines 0.1')
+
+    subparsers = parser.add_subparsers(dest='command')
+
+    sim = subparsers.add_parser('simulate', help='Calculate durability of a randomized pool')
+    sim.add_argument('--verbose', action='store_true', help='enable verbose output')
+    sim.add_argument('--num-osds', type=int, default=100, help="number of OSDs in the cluster (default: 100)")
+    sim.add_argument('--failure-domain', type=str, default="host", help="Type of failure domain (host/rack/...) (default: host)")
+    sim.add_argument('--num-domains', type=int, default=10, help="number of failure domains (default: 10)")
+    sim.add_argument('--tb-per-osd', type=int, default=10, help="raw data per OSD in TB (default: 10)")
+    sim.add_argument('--object-size', type=int, default=4*1024*1024, help="average object size in bytes (default: 4MB)")
+    sim.add_argument('--pgs-per-osd', type=int, default=100, help="target PGs per OSD (default: 100)")
+    sim.add_argument('--size', type=int, default=3, help="simulate a replicated pool with SIZE replicas (default: 3)")
+    sim.add_argument('--k', type=int, help="simulate an erasure coded pool with K data shards")
+    sim.add_argument('--m', type=int, help="simulate an erasure coded pool with M parity shards")
+    sim.add_argument('--simulations', type=int, default=1000, help="number of incidents to simulate (default: 1000)")
+    sim.add_argument('--seed', type=int, default=int(time.time()), help='random seed (default: current time)')
+    sim.add_argument('--fudge', type=int, default=100, help='correlation "fudge" factor (default: 100)')
+    sim.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)')
+    sim.add_argument('--recovery-rate', type=float, default=50, help='Recovery rate per OSD (default: 50MBps)')
+    sim.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
+    sim.set_defaults(func=simulate)
+
+    an = subparsers.add_parser('analyze', help='Calculate durability of the local pools')
+    an.add_argument('--verbose', action='store_true', help='enable verbose output')
+    an.add_argument('--cluster', type=str, default='ceph', help='Ceph cluster name (default: ceph)')
+    an.add_argument('--pool', type=str, help='analyze only the given pool (default: all pools)')
+    an.add_argument('--simulations', type=int, default=1000, help="number of incidents to simulate (default: 1000)")
+    an.add_argument('--seed', type=int, default=int(time.time()), help='random seed (default: current time)')
+    an.add_argument('--fudge', type=int, default=100, help='correlation "fudge" factor (default: 100)')
+    an.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)')
+    an.add_argument('--recovery-rate', type=int, default=50, help='Per-OSD data recovery rate (default: 50MBps)')
+    an.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
+    an.set_defaults(func=analyze)
+
+    args = parser.parse_args()
+    if not args.command:
+        parser.print_help()
+    else:
+        # Reconstruct command
+        cmdline = reconstruct_command_line(parser, args)
+        # Print header
+        print(f"Clyso Nines - Data Loss Risk Estimator for Ceph Clusters")
+        print("---------------------------------------------------------")
+        print(f"Command line: {cmdline}")
+        print("---------------------------------------------------------")
+        print()
+        args.func(args)
+
+if __name__ == "__main__":
+    main()
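A quick numeric check of the chunk-plus-sleep throughput model that patch 01's effective_recovery_rate() implements, using the tool's own defaults (4 MiB objects, 30 objects per chunk, 50 MiB/s raw bandwidth, 0.1 s osd_recovery_sleep). This is an illustrative sketch, not part of the patch:

    # Mirror of the formula in effective_recovery_rate() above.
    def eff_rate(object_size, bw=50 * 1024 * 1024, sleep=0.1, per_chunk=30):
        chunk = object_size * per_chunk      # bytes moved per batch
        return chunk / (chunk / bw + sleep)  # transfer time + sleep penalty

    # 4 MiB objects: 120 MiB chunks, 2.4 s transfer + 0.1 s sleep -> ~48 MiB/s
    print(eff_rate(4 * 1024 * 1024) / 1024**2)  # ~48.0
    # 4 KiB objects: the 0.1 s sleep dominates -> ~1.1 MiB/s effective
    print(eff_rate(4 * 1024) / 1024**2)         # ~1.1

The per-chunk sleep is why the tool models recovery rate as a function of object size rather than taking --recovery-rate at face value: small-object pools recover far more slowly than the raw bandwidth suggests.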
From 09539b820a4e35f1ec885fefdcb9566b9e158fbe Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Tue, 28 Oct 2025 14:44:31 -0700
Subject: [PATCH 02/28] clyso-nines: fixes

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 221 ++++++++++++---------
 1 file changed, 131 insertions(+), 90 deletions(-)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index 4d2374b..7b14e3d 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -24,8 +24,14 @@ import random
 import time
 import shlex
 
+VERBOSE = False
+
 # ---------------------------- Utilities ----------------------------
 
+def vprint(*args, **kwargs):
+    if VERBOSE:
+        print(*args, **kwargs)
+
 def reconstruct_command_line(parser, args):
     command_parts = [parser.prog, args.command]
 
@@ -54,7 +60,7 @@ def count_nines(value, max_digits=20):
     s = f"{rounded:.{max_digits}f}"
 
     if not s.startswith("0."):
-        return f"{max_digits} nines (1.0)" # Treat 1.0 as "max nines"
+        return f"{max_digits} nines" # Treat 1.0 as "max nines"
 
     decimal_part = s.split('.')[1]
     count = 0
@@ -216,7 +222,7 @@ def simulate(args):
     if num_simulations % 50 != 0:
         raise ValueError("Number of simulations must be a multiple of 50 for progress display.")
 
-    print(f"---------------------------")
+    vprint(f"---------------------------")
     print()
 
     if not hasattr(args, 'pgs'):
@@ -238,10 +244,8 @@ def simulate(args):
 
         # Print simulation summary
         print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})")
-        print(f"  PGs: {num_pgs} total, averaging {num_pgs * pg_size / num_osds:.3g} per OSD")
-#        print(f"  Total data: {format_bytes(total_bytes)} raw, {format_bytes(raw_to_stored_factor*total_bytes)} stored")
+        vprint(f"  PGs: {num_pgs} total, averaging {num_pgs * pg_size / num_osds:.3g} per OSD")
 
-#        print(f"  Data per {failure_domain}: {format_bytes(osds_per_domain*bytes_per_osd)} raw, {format_bytes(raw_to_stored_factor*osds_per_domain*bytes_per_osd)} stored")
     else:
         # simulating failures on a given pool
         pgs = args.pgs
@@ -251,8 +255,9 @@ def simulate(args):
         pgs_per_osd = num_pgs * pg_size / num_osds
 
         # Print analyze summary
-        print(f"Analyzing a {profile} pool with {num_osds} OSDs and {num_pgs} PGs")
-        print(f"  PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD")
+        print(f"Analyzing '{args.pool_name}':")
+        print(f"  Info: {profile} pool with {num_osds} OSDs and {num_pgs} PGs")
+        vprint(f"  PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD")
 
     # Re-derive the stored bytes/objects per PG from tb_per_osd
     bytes_per_osd = args.tb_per_osd * 1024**4
@@ -266,21 +271,21 @@ def simulate(args):
         bytes_per_pg /= args.k
 
     print(f"  Data per PG: {format_bytes(bytes_per_pg)}, {format_int(objects_per_pg)} objects")
-    print(f"  Example PG: {pgs[0]}\n")
+    vprint(f"  Example PG: {pgs[0]}\n")
 
-    print(f"---------------------------")
-    print()
+    vprint(f"---------------------------")
+    vprint()
 
     # simulate the major incidents
-    print(f"Monte Carlo Sim: n={num_simulations} {num_failures}x failures")
+    vprint(f"Monte Carlo Sim: n={num_simulations} {num_failures}x failures")
     threshold = pg_size - min_size
     num_batches = 50
     batch_size = num_simulations // num_batches
     simulations_with_loss = 0
     total_pg_losses = 0
-    print()
-    print(f"  |{'-' * num_batches}|")
-    print(f"  |", end="", flush=True)
+    vprint()
+    vprint(f"  |{'-' * num_batches}|")
+    vprint(f"  |", end="", flush=True)
    simulations_done = 0
     sim_runtime = 10 # seconds
     start_time = time.time()
@@ -297,57 +302,57 @@ def simulate(args):
                 simulations_with_loss += 1
                 batch_good = False
         if batch_good:
-            print(".",end="",flush=True)
+            vprint(".",end="",flush=True)
         else:
-            print("x",end="",flush=True)
+            vprint("x",end="",flush=True)
         # Exit early if taking too long
         elapsed = time.time() - start_time
         if elapsed > sim_runtime:
-            print(" taking too long, exiting early. ",end="",flush=True)
+            vprint(" taking too long, exiting early. ",end="",flush=True)
             num_simulations = simulations_done
             break
 
-    print(f"|")
-    print(f"  |{'-' * num_batches}|")
-    print()
+    vprint(f"|")
+    vprint(f"  |{'-' * num_batches}|")
+    vprint()
 
-    print(f"  Theoretical loss rate calculation:")
+    vprint(f"  Theoretical loss rate calculation:")
     if args.k is not None:
-        print(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)} ")
-        print(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
-        print(f"    Loss rate: {num_pgs} PGs * {comb(pg_size, num_failures)} / {comb(num_osds, num_failures)} combinations")
+        vprint(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)} ")
+        vprint(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
+        vprint(f"    Loss rate: {num_pgs} PGs * {comb(pg_size, num_failures)} / {comb(num_osds, num_failures)} combinations")
         loss_rate = num_pgs / comb(num_osds, num_failures) * comb(pg_size, num_failures)
     else:
-        print(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)}")
-        print(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
-        print(f"    Loss rate: {num_pgs} PGs / {comb(num_osds, num_failures)} combinations")
+        vprint(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)}")
+        vprint(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
+        vprint(f"    Loss rate: {num_pgs} PGs / {comb(num_osds, num_failures)} combinations")
         loss_rate = num_pgs / comb(num_osds, num_failures)
     loss_rate = min(1.0, loss_rate)
-    print(f"  Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)")
+    vprint(f"  Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)")
     if simulations_with_loss > 0:
         loss_rate = simulations_with_loss / num_simulations
-        print(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
+        vprint(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
     else:
-        print(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
-        print("  Warning: No data loss observed in simulations, using theoretical estimate.")
+        vprint(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
+        vprint("  Warning: No data loss observed in simulations, using theoretical estimate.")
         total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
 
     confidence = confidence_interval(simulations_with_loss, num_simulations)
-    print(f"  95% Confidence Interval: {confidence[0] * 100:.3g}% - {confidence[1] * 100:.3g}%")
+    vprint(f"  95% Confidence Interval: {confidence[0] * 100:.3g}% - {confidence[1] * 100:.3g}%")
 
     pgs_lost_per_event = total_pg_losses / num_simulations
     pct_data_loss = total_pg_losses / (num_simulations * num_pgs)
-    print(f"  Expected Data Loss Per Incident: {pgs_lost_per_event:.3g} PGs, {format_bytes(pgs_lost_per_event*bytes_per_pg)} ({pct_data_loss * 100:.3g}%)")
+    vprint(f"  Expected Data Loss Per Incident: {pgs_lost_per_event:.3g} PGs, {format_bytes(pgs_lost_per_event*bytes_per_pg)} ({pct_data_loss * 100:.3g}%)")
 
     # Try to compute a meaningful MTTDL and Nines Durability Number
-    print("")
-    print("---------------------------")
-    print("")
+    vprint("")
+    vprint("---------------------------")
+    vprint("")
 
-    print(f"Estimating the probability of {num_failures}x failures")
+    vprint(f"Estimating the probability of {num_failures}x failures")
     afr = args.afr
     if pg_size > 1:
-        print(f"  Estimated per-drive AFR: {afr}")
+        vprint(f"  Estimated per-drive AFR: {afr}")
 
         # recovery rate is a function of object size, recovery bandwidth, and OSD recovery sleep
         recovery_rate = effective_recovery_rate(args.object_size,
@@ -355,10 +360,17 @@ def simulate(args):
                                                 args.osd_recovery_sleep,
                                                 objects_per_chunk=30)
 
-        print(f"  Estimated per-OSD recovery rate: {format_bytes(recovery_rate)}ps ({format_int(recovery_rate/args.object_size)} objects/s)")
+        try:
+            vprint(f"  Estimated per-OSD recovery rate: {format_bytes(recovery_rate)}ps ({format_int(recovery_rate/args.object_size)} objects/s)")
+        except ZeroDivisionError:
+            vprint(f"  Estimated per-OSD recovery rate: {format_bytes(recovery_rate)}ps ({format_int(recovery_rate)} Bps)")
 
-        recovery_time_per_pg = bytes_per_pg / recovery_rate
-        print(f"  Expected PG recovery time ({format_bytes(bytes_per_pg)}/{format_bytes(recovery_rate)}ps): {format_time(recovery_time_per_pg)}")
+        try:
+            recovery_time_per_pg = bytes_per_pg / recovery_rate
+        except ZeroDivisionError:
+            recovery_time_per_pg = 0
+
+        vprint(f"  Expected PG recovery time ({format_bytes(bytes_per_pg)}/{format_bytes(recovery_rate)}ps): {format_time(recovery_time_per_pg)}")
 
         # Estimate max OSD load during recovery when things are critical
         # e.g. for 3x repl: how long to recover 2 OSDs failed.
@@ -371,49 +383,50 @@ def simulate(args):
         # How many PGs need to be recovered
         max_osd_load = expected_max_osd_load(num_osds_left, critical_osds*pgs_per_osd)
 
-        print(f"  Expected most loaded OSD when recovering {format_int((num_failures-1)*pgs_per_osd)} PGs to {num_osds-num_failures+1} OSDs = {max_osd_load} PGs")
+        vprint(f"  Expected most loaded OSD when recovering {format_int((num_failures-1)*pgs_per_osd)} PGs to {num_osds-num_failures+1} OSDs = {max_osd_load} PGs")
 
         recovery_time = max(max_osd_load * recovery_time_per_pg, 60) # minimum 60 seconds
-        print(f"  Expected critical period ({max_osd_load} PGs * {format_time(recovery_time_per_pg)}) = {format_time(recovery_time)}")
+        vprint(f"  Expected critical period ({max_osd_load} PGs * {format_time(recovery_time_per_pg)}) = {format_time(recovery_time)}")
 
         critical_windows_per_year = 31536000 / recovery_time
-        print(f"  Expected critical periods annually (1yr / {format_time(recovery_time)}) = {critical_windows_per_year:.1f}")
+        vprint(f"  Expected critical periods annually (1yr / {format_time(recovery_time)}) = {critical_windows_per_year:.1f}")
 
         failure_prob_per_disk = recovery_time / 31536000 * afr
-        print(f"  Expected per-OSD critical period failure rate = {failure_prob_per_disk:.2g} (1 in {format_int(1/failure_prob_per_disk)})")
+        vprint(f"  Expected per-OSD critical period failure rate = {failure_prob_per_disk:.2g} (1 in {format_int(1/failure_prob_per_disk)})")
 
-        print(f"  Failure correlation factor = {args.fudge}")
+        vprint(f"  Failure correlation factor = {args.fudge}")
 
         failure_prob = args.fudge * prob_k_or_more_failures(num_osds, failure_prob_per_disk, num_failures)
-        print(f"  Probability of {num_failures}+ failures in any critical period ({format_time(recovery_time)}) = 1 in {format_int(1/failure_prob)}")
+        vprint(f"  Probability of {num_failures}+ failures in any critical period ({format_time(recovery_time)}) = 1 in {format_int(1/failure_prob)}")
 
         expected_annual_events = critical_windows_per_year * failure_prob
-        print(f"  Expected annual {num_failures}+ failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
+        vprint(f"  Expected annual {num_failures}+ failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
         mtt_multi_failure = mttdl(num_osds, num_failures, afr, args.fudge, recovery_time)
-        print(f"  Alternate calculation (mean time to {num_failures}+ concurrent failures) = {format_time(mtt_multi_failure)}")
+        vprint(f"  Alternate calculation (mean time to {num_failures}+ concurrent failures) = {format_time(mtt_multi_failure)}")
     else: # size == 1 (no redundancy)
-        print("  No redundancy configured, data loss is certain on any failure.")
+        vprint("  No redundancy configured, data loss is certain on any failure.")
         expected_annual_events = num_osds * afr
         mtt_multi_failure = 31536000 / expected_annual_events
-        print(f"  Expected annual failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
+        vprint(f"  Expected annual failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
 
 
-    print("---------------------------")
-    print("")
+    vprint("---------------------------")
+    vprint("")
 
     min_loss_rate = confidence[0]
     max_loss_rate = confidence[1]
 
-    print(f"Final MTTDL/Durability Estimates:")
-    print("")
-    print(f"  Mean time to {num_failures}x failure: {format_time(mtt_multi_failure)}")
+    vprint(f"MTTDL/Durability Estimates:")
+    vprint(f"  Mean time to {num_failures}x failure: {format_time(mtt_multi_failure)}")
     try:
-        print(f"  Probability that {num_failures}x failure causes data loss (95% CI): {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to 1 in {format_int(1/min_loss_rate)})")
+        vprint(f"  Probability that {num_failures}x failures cause data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})")
+        vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to 1 in {format_int(1/min_loss_rate)})")
     except ZeroDivisionError:
-        print(f"  Probability that {num_failures}x failure causes data loss (95% CI): {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)")
-    print("")
-    print(f"Combining the above to estimate annual data loss:")
+        vprint(f"  Probability that {num_failures}x failures cause data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})")
+        vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)")
+    vprint("")
+    vprint(f"Combining the above to estimate annual data loss:")
 
     # Compute MTTDL
     expected_losses_per_year = expected_annual_events * loss_rate
@@ -423,11 +436,11 @@ def simulate(args):
     min_mttdl_sec = 31536000/max_expected_losses_per_year
     try:
         max_mttdl_sec = 31536000/min_expected_losses_per_year
-        print(f"  Mean Time To Data Loss (MTTDL): {format_time(mttdl_sec)}")
-        print(f"    (95% CI: {format_time(min_mttdl_sec)} - {format_time(max_mttdl_sec)})")
+        print(f"  MTTDL: {format_time(mttdl_sec)}")
+        vprint(f"    (95% CI: {format_time(min_mttdl_sec)} - {format_time(max_mttdl_sec)})")
     except ZeroDivisionError:
-        print(f"  Mean Time To Data Loss (MTTDL): {format_time(mttdl_sec)}")
-        print(f"    (95% CI: {format_time(min_mttdl_sec)} - ∞)")
+        print(f"  MTTDL: {format_time(mttdl_sec)}")
+        vprint(f"    (95% CI: {format_time(min_mttdl_sec)} - ∞)")
 
     # Compute Nines Durability Number
     expected_pgs_lost_per_year = expected_losses_per_year * pgs_lost_per_event
@@ -437,14 +450,14 @@ def simulate(args):
     min_expected_durability = 1 - (max_expected_pgs_lost_per_year / num_pgs)
     max_expected_durability = 1 - (min_expected_pgs_lost_per_year / num_pgs)
     print(f"  Durability: {count_nines(expected_durability)}")
-    print(f"    (95% CI: {count_nines(min_expected_durability)} to {count_nines(max_expected_durability)})")
+    vprint(f"    (95% CI: {count_nines(min_expected_durability)} to {count_nines(max_expected_durability)})")
 
     expected_bytes_lost_per_year = expected_pgs_lost_per_year * bytes_per_pg
     min_expected_bytes_lost_per_year = min_expected_pgs_lost_per_year * bytes_per_pg
     max_expected_bytes_lost_per_year = max_expected_pgs_lost_per_year * bytes_per_pg
 
-    print(f"  Expected annual data loss: {format_bytes(expected_bytes_lost_per_year)}")
-    print(f"    (95% CI: {format_bytes(min_expected_bytes_lost_per_year)} - {format_bytes(max_expected_bytes_lost_per_year)})")
+    vprint(f"Expected annual data loss: {format_bytes(expected_bytes_lost_per_year)}")
+    vprint(f"  (95% CI: {format_bytes(min_expected_bytes_lost_per_year)} - {format_bytes(max_expected_bytes_lost_per_year)})")
 
     return (format_time(mttdl_sec), count_nines(expected_durability))
 
@@ -452,20 +465,41 @@ def simulate(args):
 # ---------------------------- Analyze Subcommand ----------------------------
 
 def analyze_pool(args, pool, pg_dump):
-    print(f"Analyzing pool '{pool['pool_name']}' (id {pool['pool_id']}, size {pool['size']}) ")
+    import subprocess
+    import json
+
+    vprint(f"Analyzing pool '{pool['pool_name']}' (id {pool['pool_id']}, size {pool['size']}) ")
     if 'erasure_code_profile' in pool and pool['erasure_code_profile']:
-        print(f"  Type: erasure-coded ({pool['erasure_code_profile']})")
-        # For simplicity, assume standard k+m profile naming
+        vprint(f"  Type: erasure-coded ({pool['erasure_code_profile']})")
         try:
-            k, m = map(int, pool['erasure_code_profile'].split('+'))
-            args.k = k
-            args.m = m
-            args.size = None
+            # ceph osd erasure-code-profile get
+            try:
+                ceph_cmd = ['ceph', f'--cluster={args.cluster}', 'osd', 'erasure-code-profile', 'get', pool['erasure_code_profile'], '--format=json']
+                ec_profile_json = subprocess.check_output(ceph_cmd)
+                ec_profile_data = json.loads(ec_profile_json)
+                args.k = int(ec_profile_data['k'])
+                args.m = int(ec_profile_data['m'])
+            except Exception as e:
+                print(f"Error fetching EC profile data: {e}")
+                ec_profile_data = None
+
+            # Try using local copy of ceph-osd-erasure-code-profile-get.json
+            if ec_profile_data is None:
+                try:
+                    with open('ceph-osd-erasure-code-profile-get.json', 'r') as f:
+                        ec_profile_data = json.load(f)
+                    print("Using local ceph-osd-erasure-code-profile-get.json file.")
+                    args.k = int(ec_profile_data['k'])
+                    args.m = int(ec_profile_data['m'])
+                except Exception as e2:
+                    print(f"Error loading local EC profile data: {e2}")
+                    return
         except:
             print("  Warning: Unable to parse erasure code profile. Please specify --k and --m manually.")
+            print(pool['erasure_code_profile'])
             return
     else:
-        print(f"  Type: replicated")
+        vprint(f"  Type: replicated")
         args.size = pool['size']
         args.k = None
         args.m = None
@@ -476,6 +510,7 @@ def analyze_pool(args, pool, pg_dump):
     pgs = [pg['acting'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}.")]
     assert len(pgs) == args.pg_num, "PG count mismatch!"
     args.pgs = pgs
+    args.pool_name = pool['pool_name']
 
     osds = set()
     for pg in pg_stats:
@@ -486,8 +521,8 @@ def analyze_pool(args, pool, pg_dump):
     args.num_osds = pg_dump['pg_map']['osd_stats_sum']['num_osds']
     assert args.num_osds == len(osds), "OSD count mismatch!"
 
-    print(f"  Number of PGs: {args.pg_num}")
-    print(f"  Number of OSDs: {len(osds)}")
+    vprint(f"  Number of PGs: {args.pg_num}")
+    vprint(f"  Number of OSDs: {len(osds)}")
 
     # Derive tb_per_osd from pool size and pg stat_sum num_bytes
     bytes_stored = sum(pg['stat_sum']['num_bytes'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}."))
@@ -502,14 +537,13 @@ def analyze_pool(args, pool, pg_dump):
     args.tb_per_osd = bytes_raw / len(osds) / (1024**4)
     args.object_size = bytes_raw / objects_raw if objects_raw > 0 else 4*1024*1024 # default to 4MB
 
-    print(f"  Total stored data: {format_bytes(bytes_stored)} ({format_int(objects_stored)} objects)")
-    print(f"  Total raw data: {format_bytes(bytes_raw)} ({format_int(objects_raw)} objects)")
-    print(f"  Data per OSD: {format_bytes(args.tb_per_osd * 1024**4)} ({format_int(objects_raw / len(osds))} objects)")
-    print(f"  Average object size: {format_bytes(args.object_size)}")
-    print("")
+    vprint(f"  Total stored data: {format_bytes(bytes_stored)} ({format_int(objects_stored)} objects)")
+    vprint(f"  Total raw data: {format_bytes(bytes_raw)} ({format_int(objects_raw)} objects)")
+    vprint(f"  Data per OSD: {format_bytes(args.tb_per_osd * 1024**4)} ({format_int(objects_raw / len(osds))} objects)")
+    vprint(f"  Average object size: {format_bytes(args.object_size)}")
+    vprint("")
     [mttdl, nines] = simulate(args)
 
-    print(f"AAA {pool['pool_name']}: {mttdl} MTTDL, {nines} Durability")
 
 def analyze(args):
     import subprocess
@@ -553,9 +587,9 @@ def analyze(args):
             with open('ceph-pg-dump.json', 'r') as f:
                 pg_dump_data = json.load(f)
             print("Using local ceph-pg-dump.json file.")
-            print("")
-            print("========================================")
-            print("")
+            vprint("")
+            vprint("========================================")
+            vprint("")
         except Exception as e2:
             print(f"Error loading local PG dump data: {e2}")
             return
@@ -565,9 +599,9 @@ def analyze(args):
         if p['pool_name'] == pool_name or not pool_name:
             analyze_pool(args, p, pg_dump_data)
             found = True
-            print("")
-            print("========================================")
-            print("")
+            vprint("")
+            vprint("========================================")
+            vprint("")
 
     if not found:
         print(f"Pool '{pool_name}' not found in cluster '{cluster}'.")
@@ -620,6 +654,8 @@ def main():
     if not args.command:
         parser.print_help()
     else:
+        global VERBOSE
+        VERBOSE = args.verbose
         # Reconstruct command
         cmdline = reconstruct_command_line(parser, args)
         # Print header
@@ -627,8 +663,13 @@ def main():
         print("---------------------------------------------------------")
         print(f"Command line: {cmdline}")
         print("---------------------------------------------------------")
-        print()
+        vprint()
         args.func(args)
+        vprint()
+        vprint("---------------------------------------------------------")
+        vprint(f"Command line: {cmdline}")
+        vprint("---------------------------------------------------------")
+        vprint()
 
 if __name__ == "__main__":
     main()
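Patch 02 switches EC profile parsing from the profile *name* (which only works for names like "6+3") to `ceph osd erasure-code-profile get <profile> --format=json`, so profiles with arbitrary names resolve correctly. Only the 'k' and 'm' keys are consumed; they come back as strings, hence the int() casts. A hypothetical sample of the JSON shape, for illustration — the exact field set varies by plugin and is an assumption here:

    import json

    # Hypothetical output of: ceph osd erasure-code-profile get myprofile --format=json
    sample = '{"crush-failure-domain": "host", "k": "6", "m": "3", "plugin": "jerasure"}'
    profile = json.loads(sample)
    k, m = int(profile['k']), int(profile['m'])  # what analyze_pool() extracts
    print(k, m)  # 6 3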
From 8c1003e1a99824329068113d163bb41736bb0ed6 Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Tue, 4 Nov 2025 09:43:54 -0800
Subject: [PATCH 03/28] clyso-nines: add --skip-monte-carlo and optimize cumulative binomial

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 37 ++++++++++++++++++----
 1 file changed, 31 insertions(+), 6 deletions(-)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index 7b14e3d..8aa9feb 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -55,6 +55,9 @@ def round_to_nearest_power_of_two(n):
     return lower if (n - lower) < (upper - n) else upper
 
 def count_nines(value, max_digits=20):
+    if value < 0 or value > 1:
+        raise ValueError(f"Value {value} must be between 0 and 1")
+
     # Round to given precision
     rounded = round(value, max_digits)
     s = f"{rounded:.{max_digits}f}"
@@ -94,7 +97,10 @@ def prob_exact_k_failures(n, p, k):
     return comb(n, k) * (p ** k) * ((1 - p) ** (n - k))
 
 def prob_k_or_more_failures(n, p, k):
-    return sum(comb(n, i) * (p ** i) * ((1 - p) ** (n - i)) for i in range(k, n + 1))
+    cumulative = 0
+    for i in range(k): # 0, 1, 2, ..., k-1
+        cumulative += prob_exact_k_failures(n, p, i)
+    return 1 - cumulative
 
 def expected_max_osd_load(n_osds, n_pgs):
     if n_pgs == 0 or n_osds == 0:
@@ -216,6 +222,11 @@ def simulate(args):
     else:
         raise ValueError("Specify either --size or both --k and --m.")
 
+    if args.domains_down > 0:
+        num_failures -= args.domains_down
+
+    num_failures = max(0, num_failures)
+
     num_simulations = args.simulations
     if num_simulations < 100:
         raise ValueError("Number of simulations must be at least 100 for meaningful results.")
@@ -281,6 +292,9 @@ def simulate(args):
     threshold = pg_size - min_size
     num_batches = 50
     batch_size = num_simulations // num_batches
+    if args.skip_monte_carlo:
+        vprint("  Skipping Monte Carlo simulation as per user request.")
+        num_batches = 0
     simulations_with_loss = 0
     total_pg_losses = 0
     vprint()
@@ -293,7 +307,15 @@ def simulate(args):
         batch_good = True
         for _ in range(int(num_simulations/num_batches)):
             simulations_done += 1
-            failed_osds = set(random.sample(osds, num_failures))
+            failed_osds = set()
+            if args.domains_down > 0:
+                # pre-fail args.domains_down * osds_per_domain OSDs
+                failed_osds = set([osds[i] for i in range(args.domains_down * osds_per_domain)])
+
+            failed_osds.update(random.sample(list(set(osds) - failed_osds), num_failures))
+
+            assert len(failed_osds) == num_failures + args.domains_down * (osds_per_domain), f"Failed OSD count mismatch! {len(failed_osds)} != {num_failures + args.domains_down * osds_per_domain}"
+
             dead_pgs = sum(
                 1 for pg in pgs if sum(1 for osd in pg if osd in failed_osds) > threshold
             )
@@ -329,7 +351,7 @@ def simulate(args):
     loss_rate = num_pgs / comb(num_osds, num_failures)
     loss_rate = min(1.0, loss_rate)
     vprint(f"  Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)")
-    if simulations_with_loss > 0:
+    if simulations_with_loss > 0 or not args.skip_monte_carlo:
         loss_rate = simulations_with_loss / num_simulations
         vprint(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
     else:
@@ -446,9 +468,9 @@ def simulate(args):
     expected_pgs_lost_per_year = expected_losses_per_year * pgs_lost_per_event
     min_expected_pgs_lost_per_year = min_expected_losses_per_year * pgs_lost_per_event
     max_expected_pgs_lost_per_year = max_expected_losses_per_year * pgs_lost_per_event
-    expected_durability = 1 - (expected_pgs_lost_per_year / num_pgs)
-    min_expected_durability = 1 - (max_expected_pgs_lost_per_year / num_pgs)
-    max_expected_durability = 1 - (min_expected_pgs_lost_per_year / num_pgs)
+    expected_durability = max(0, 1 - (expected_pgs_lost_per_year / num_pgs))
+    min_expected_durability = max(0, 1 - (max_expected_pgs_lost_per_year / num_pgs))
+    max_expected_durability = max(0, 1 - (min_expected_pgs_lost_per_year / num_pgs))
     print(f"  Durability: {count_nines(expected_durability)}")
     vprint(f"    (95% CI: {count_nines(min_expected_durability)} to {count_nines(max_expected_durability)})")
 
@@ -636,6 +658,8 @@ def main():
     sim.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)')
     sim.add_argument('--recovery-rate', type=float, default=50, help='Recovery rate per OSD (default: 50MBps)')
     sim.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
+    sim.add_argument('--domains-down', type=int, default=0, help='number of failure domains to take down in simulation (default: 0)')
+    sim.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead')
     sim.set_defaults(func=simulate)
 
     an = subparsers.add_parser('analyze', help='Calculate durability of the local pools')
@@ -648,6 +672,7 @@ def main():
     an.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)')
     an.add_argument('--recovery-rate', type=int, default=50, help='Per-OSD data recovery rate (default: 50MBps)')
     an.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
+    an.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead')
    an.set_defaults(func=analyze)
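The cumulative-binomial rewrite in patch 03 computes the survival function P(X >= k) = 1 - sum over i < k of P(X = i), which sums only k terms (k is small here: m+1 shards or the replica count) instead of n - k + 1 terms spanning the whole OSD count. A self-contained check that the two forms agree — a sketch, not part of the patch:

    from math import comb

    def pmf(n, p, i):
        return comb(n, i) * p**i * (1 - p)**(n - i)

    def tail_direct(n, p, k):      # pre-patch form: sum i = k..n, O(n) terms
        return sum(pmf(n, p, i) for i in range(k, n + 1))

    def tail_complement(n, p, k):  # patch 03 form: 1 - sum i = 0..k-1, O(k) terms
        return 1 - sum(pmf(n, p, i) for i in range(k))

    n, p, k = 1000, 1e-6, 3
    print(tail_direct(n, p, k))      # ~1.66e-10
    print(tail_complement(n, p, k))  # same value from just 3 terms

For tails far below machine epsilon the complement form loses digits to cancellation, but at the failure probabilities this tool works with, the two agree to several significant figures.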
From 84794fefded04c359a30c6197df9920c822a96b3 Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Tue, 4 Nov 2025 10:59:29 -0800
Subject: [PATCH 04/28] clyso-nines: optimize for large values, prettier printing

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 69 ++++++++++++++++------
 1 file changed, 52 insertions(+), 17 deletions(-)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index 8aa9feb..c8d8e08 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -23,6 +23,10 @@ import math
 import random
 import time
 import shlex
+import itertools
+
+# hide cursor
+print('\033[?25l', end="")
 
 VERBOSE = False
 
@@ -251,7 +255,11 @@ def simulate(args):
 
         num_pgs = round_to_nearest_power_of_two((num_osds * pgs_per_osd) / pg_size)
         osds = list(range(1, num_osds + 1))
-        pgs = simple_crush(osds,num_domains,pg_size,num_pgs) # [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)]
+        vprint(f"Generating {num_pgs} PGs with size {pg_size} over {num_osds} OSDs...")
+        if not args.skip_monte_carlo:
+            pgs = simple_crush(osds,num_domains,pg_size,num_pgs) # [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)]
+        else:
+            pgs = [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)]
 
         # Print simulation summary
         print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})")
@@ -298,14 +306,22 @@ def simulate(args):
     simulations_with_loss = 0
     total_pg_losses = 0
     vprint()
-    vprint(f"  |{'-' * num_batches}|")
-    vprint(f"  |", end="", flush=True)
+    if not args.skip_monte_carlo:
+        vprint(f"  ╭{'─' * num_batches}╮")
+        vprint(f"  │", end="", flush=True)
+    num_simulations = 1000000000
     simulations_done = 0
-    sim_runtime = 10 # seconds
+    sim_runtime = args.max_mc_runtime
     start_time = time.time()
+    frames = ["⣾", "⣷", "⣯", "⣟", "⡿", "⢿", "⣻", "⣽"]
+    spinner = itertools.cycle(frames)
+    spin_time = start_time
     for batch in range(num_batches):
         batch_good = True
         for _ in range(int(num_simulations/num_batches)):
+            if spin_time + 0.1 < time.time():
+                print(next(spinner), end="\b", flush=True)
+                spin_time = time.time()
             simulations_done += 1
             failed_osds = set()
             if args.domains_down > 0:
@@ -324,19 +340,20 @@ def simulate(args):
                 simulations_with_loss += 1
                 batch_good = False
         if batch_good:
-            vprint(".",end="",flush=True)
+            vprint("⣿",end="",flush=True)
         else:
-            vprint("x",end="",flush=True)
+            vprint("⣀",end="",flush=True)
         # Exit early if taking too long
         elapsed = time.time() - start_time
         if elapsed > sim_runtime:
-            vprint(" taking too long, exiting early. ",end="",flush=True)
+            vprint(" timeout, exiting early. ",end="",flush=True)
             num_simulations = simulations_done
             break
 
-    vprint(f"|")
-    vprint(f"  |{'-' * num_batches}|")
-    vprint()
+    if not args.skip_monte_carlo:
+        vprint(f"│")
+        vprint(f"  ╰{'─' * num_batches}╯")
+        vprint()
 
     vprint(f"  Theoretical loss rate calculation:")
     if args.k is not None:
@@ -348,15 +365,19 @@ def simulate(args):
         vprint(f"    OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)}")
         vprint(f"    OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}")
         vprint(f"    Loss rate: {num_pgs} PGs / {comb(num_osds, num_failures)} combinations")
+        assert num_failures == pg_size, "For replicated pools, num_failures should equal pg_size"
         loss_rate = num_pgs / comb(num_osds, num_failures)
     loss_rate = min(1.0, loss_rate)
     vprint(f"  Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)")
-    if simulations_with_loss > 0 or not args.skip_monte_carlo:
+    if simulations_with_loss > 0:
         loss_rate = simulations_with_loss / num_simulations
         vprint(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
+    elif args.skip_monte_carlo:
+        vprint(f"  Skipped Monte Carlo simulation as per user request. Using theoretical estimate.")
+        total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
     else:
-        vprint(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
-        vprint("  Warning: No data loss observed in simulations, using theoretical estimate.")
+#        vprint(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
+        vprint("  !! No data loss observed in simulations, using theoretical estimate.")
         total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
 
     confidence = confidence_interval(simulations_with_loss, num_simulations)
@@ -445,8 +466,14 @@ def simulate(args):
         vprint(f"  Probability that {num_failures}x failures cause data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})")
         vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to 1 in {format_int(1/min_loss_rate)})")
     except ZeroDivisionError:
-        vprint(f"  Probability that {num_failures}x failures cause data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})")
-        vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)")
+        try:
+            vprint(f"  Probability that {num_failures}x failures cause data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})")
+        except ZeroDivisionError:
+            vprint(f"  Probability that {num_failures}x failures cause data loss: 0%")
+        try:
+            vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)")
+        except ZeroDivisionError:
+            vprint(f"    95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}%")
     vprint("")
     vprint(f"Combining the above to estimate annual data loss:")
 
     # Compute MTTDL
     expected_losses_per_year = expected_annual_events * loss_rate
     min_expected_losses_per_year = expected_annual_events * min_loss_rate
     max_expected_losses_per_year = expected_annual_events * max_loss_rate
-    mttdl_sec = 31536000/expected_losses_per_year
-    min_mttdl_sec = 31536000/max_expected_losses_per_year
+    try:
+        mttdl_sec = 31536000/expected_losses_per_year
+    except ZeroDivisionError:
+        mttdl_sec = float('inf')
+    try:
+        min_mttdl_sec = 31536000/max_expected_losses_per_year
+    except ZeroDivisionError:
+        min_mttdl_sec = float('inf')
     try:
         max_mttdl_sec = 31536000/min_expected_losses_per_year
         print(f"  MTTDL: {format_time(mttdl_sec)}")
@@ -660,6 +693,7 @@ def main():
     sim.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
     sim.add_argument('--domains-down', type=int, default=0, help='number of failure domains to take down in simulation (default: 0)')
     sim.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead')
+    sim.add_argument('--max-mc-runtime', type=int, default=30, help='Maximum simulation runtime in seconds (default: 30s)')
     sim.set_defaults(func=simulate)
 
     an = subparsers.add_parser('analyze', help='Calculate durability of the local pools')
@@ -673,6 +707,7 @@ def main():
     an.add_argument('--recovery-rate', type=int, default=50, help='Per-OSD data recovery rate (default: 50MBps)')
     an.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)')
     an.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead')
+    an.add_argument('--max-mc-runtime', type=int, default=30, help='Maximum Monte Carlo simulation runtime in seconds per pool (default: 30s)')
     an.set_defaults(func=analyze)
 
     args = parser.parse_args()

From 666110e369fc515beaf8655e9678214c50abde6d Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Tue, 4 Nov 2025 15:54:58 -0800
Subject: [PATCH 05/28] clyso-nines: correct the theoretical calculation for EC and replication

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 94 +++++++++++++++------
 1 file changed, 67 insertions(+), 27 deletions(-)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index c8d8e08..cb8e99b 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -295,6 +295,64 @@ def simulate(args):
     vprint(f"---------------------------")
     vprint()
 
+    vprint(f"Theoretical loss rate calculation:")
+    H = num_domains     # num hosts
+    f = num_failures    # normally m+1
+    n = osds_per_domain # osds per host
+    r = pg_size         # normally k + m
+    N = num_osds        # total osds
+
+    vprint("  PG loss rate calculation:")
+    vprint(f"    H (failure domains): {H}")
+    vprint(f"    f (failures to cause data loss): {f}")
+    vprint(f"    n (OSDs per failure domain): {n}")
+    vprint(f"    r (PG size): {r}")
+    vprint(f"    N (total OSDs): {N}")
+    vprint(f"    Number of PGs: {num_pgs}")
+
+    # Assume we have f concurrent failures. They will cause data loss if:
+    #  - they land on f different failure domains
+    #  - per-PG probability that those domains are hit
+    #  - at least one PG is lost
+
+    # 1. Eligibility: Do the f failures land on f different domains?
+    elig = comb(H, f) * (n ** f) / comb(N, f)
+    vprint("  1. Eligibility (Do the f failures land on f different domains): C(H, f) * n^f / C(N, f)")
+    vprint(f"     C({H}, {f}) * {n}^{f} / C({N}, {f}) = {comb(H, f)} * {n**f} / {comb(N, f)} = {elig:.3g}")
+
+    # 2. Per-PG hit probability
+    #    PG picks r domains uniformly: probability it includes those f failed domains is
+    p_hosts_in_pg = comb(H - f, r - f) / comb(H, r)
+    vprint("  2. Per-PG hit probability: C(H - f, r - f) / C(H, r)")
+    vprint(f"     C({H} - {f}, {r} - {f}) / C({H}, {r}) = {comb(H - f, r - f)} / {comb(H, r)} = {p_hosts_in_pg:.3g}")
+
+    # 3. Within those f domains, PG must pick the failed OSD in each:
+    p_osds_match = (1 / n) ** f
+    vprint("  3. PG picks failed OSDs within those domains: (1 / n)^f")
+    vprint(f"     (1 / {n})^{f} = {p_osds_match:.3g}")
+
+    # 4. Combine to get per-PG loss probability
+    p_pg_loss = p_hosts_in_pg * p_osds_match
+    vprint("  4. Per-PG loss probability: #2 * #3")
+    vprint(f"     {p_pg_loss:.3g}")
+
+    # 5. Probability that at least one PG is lost
+    p_at_least_one_lost = 1 - (1 - p_pg_loss) ** num_pgs
+    vprint("  5. Probability that at least one PG is lost: 1 - (1 - #4)^num_pgs")
+    vprint(f"     1 - (1 - {p_pg_loss:.3g})^{num_pgs} = {p_at_least_one_lost:.3g}")
+
+    loss_rate = elig * p_at_least_one_lost
+    vprint("  6. Overall data loss probability: #1 * #5")
+
+    assert 0 <= loss_rate <= 1, "Calculated loss rate is out of bounds!"
+
+    vprint()
+    vprint(f"  Finally, P. 
≥1 PG lost: {loss_rate * 100:.3g}%") + + vprint() + vprint(f"---------------------------") + vprint() + # simulate the major incidents vprint(f"Monte Carlo Sim: n={num_simulations} {num_failures}x failures") threshold = pg_size - min_size @@ -309,6 +367,7 @@ def simulate(args): if not args.skip_monte_carlo: vprint(f" ╭{'─' * num_batches}╮") vprint(f" │", end="", flush=True) + else: num_simulations = 1000000000 simulations_done = 0 sim_runtime = args.max_mc_runtime @@ -320,7 +379,7 @@ def simulate(args): batch_good = True for _ in range(int(num_simulations/num_batches)): if spin_time + 0.1 < time.time(): - print(next(spinner), end="\b", flush=True) + vprint(next(spinner), end="\b", flush=True) spin_time = time.time() simulations_done += 1 failed_osds = set() @@ -355,20 +414,6 @@ def simulate(args): vprint(f" ╰{'─' * num_batches}╯") vprint() - vprint(f" Theoretical loss rate calculation:") - if args.k is not None: - vprint(f" OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)} ") - vprint(f" OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}") - vprint(f" Loss rate: {num_pgs} PGs * {comb(pg_size, num_failures)} / {comb(num_osds, num_failures)} combinations") - loss_rate = num_pgs / comb(num_osds, num_failures) * comb(pg_size, num_failures) - else: - vprint(f" OSD failure combinations: C({num_osds}, {num_failures}) = {comb(num_osds, num_failures)}") - vprint(f" OSD to PG failure ratio: C({pg_size}, {num_failures}) = {comb(pg_size, num_failures)}") - vprint(f" Loss rate: {num_pgs} PGs / {comb(num_osds, num_failures)} combinations") - assert num_failures == pg_size, "For replicated pools, num_failures should equal pg_size" - loss_rate = num_pgs / comb(num_osds, num_failures) - loss_rate = min(1.0, loss_rate) - vprint(f" Theoretical Incidents with ≥1 PG lost: {loss_rate*num_simulations:.4g}/{num_simulations} ({loss_rate * 100:.3g}%)") if simulations_with_loss > 0: loss_rate = simulations_with_loss / num_simulations vprint(f" Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)") @@ -460,20 +505,15 @@ def simulate(args): min_loss_rate = confidence[0] max_loss_rate = confidence[1] + inv_loss_rate = 1/loss_rate if loss_rate > 0 else float('inf') + inv_min_loss_rate = 1/min_loss_rate if min_loss_rate > 0 else float('inf') + inv_max_loss_rate = 1/max_loss_rate if max_loss_rate > 0 else float('inf') + vprint(f"MTTDL/Durability Estimates:") vprint(f" Mean time to {num_failures}x failure: {format_time(mtt_multi_failure)}") - try: - vprint(f" Probability that {num_failures}x failures causes data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})") - vprint(f" 95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to 1 in {format_int(1/min_loss_rate)})") - except ZeroDivisionError: - try: - vprint(f" Probability that {num_failures}x failures causes data loss: {loss_rate*100:.3g}% (1 in {format_int(1/loss_rate)})") - except ZeroDivisionError: - vprint(f" Probability that {num_failures}x failures causes data loss: 0%") - try: - vprint(f" 95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(1/max_loss_rate)} to ∞)") - except ZeroDivisionError: - vprint(f" 95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}%") + vprint(f" Probability that {num_failures}x failures causes data loss: {loss_rate*100:.3g}% (1 in {format_int(inv_loss_rate)})") + vprint(f" 95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in 
{format_int(inv_max_loss_rate)} to 1 in {format_int(inv_min_loss_rate)})") + vprint("") vprint(f"Combining the above to estimate annual data loss:") From a03490d5e7bbc6c2389f760953f1ca5c406f9ca3 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Wed, 12 Nov 2025 12:54:43 -0800 Subject: [PATCH 06/28] clyso-nines: add --osds-per-domain, remove --num-osds Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index cb8e99b..1b3348c 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -243,16 +243,13 @@ def simulate(args): if not hasattr(args, 'pgs'): # simulating a randomized pool - num_domains = args.num_domains pgs_per_osd = args.pgs_per_osd - num_osds = args.num_osds + num_domains = args.num_domains + osds_per_domain = args.osds_per_domain + num_osds = num_domains * osds_per_domain failure_domain = args.failure_domain - if num_osds % args.num_domains != 0: - raise ValueError("Number of OSDs must be divisible by number of domains.") - osds_per_domain = int(num_osds / args.num_domains) - num_pgs = round_to_nearest_power_of_two((num_osds * pgs_per_osd) / pg_size) osds = list(range(1, num_osds + 1)) vprint(f"Generating {num_pgs} PGs with size {pg_size} over {num_osds} OSDs...") @@ -716,9 +713,9 @@ def main(): sim = subparsers.add_parser('simulate', help='Calculate durability of a randomized pool') sim.add_argument('--verbose', action='store_true', help='enable verbose output') - sim.add_argument('--num-osds', type=int, default=100, help="number of OSDs in the Cluster (default: 100)") sim.add_argument('--failure-domain', type=str, default="host", help="Type of failure domain (host/rack/...) (default: host)") sim.add_argument('--num-domains', type=int, default=10, help="number of failure domains (default: 10)") + sim.add_argument('--osds-per-domain', type=int, default=16, help="number of OSDs per failure domain (default: 16)") sim.add_argument('--tb-per-osd', type=int, default=10, help="raw data per OSD in TB (default: 10)") sim.add_argument('--object-size', type=int, default=4*1024*1024, help="average object size in bytes (default: 4MB)") sim.add_argument('--pgs-per-osd', type=int, default=100, help="target PGs per OSD (default: 100)") From 00ded19fa3f55842b8814536509ef4ff4a50d875 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 08:48:43 -0800 Subject: [PATCH 07/28] clyso-nines: infer failure domains from PGs Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 91 +++++++++++++++++++--- 1 file changed, 79 insertions(+), 12 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 1b3348c..a6c1765 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -181,16 +181,16 @@ def confidence_interval(successes, trials, confidence=0.95): def effective_recovery_rate(object_size_bytes, recovery_bw=50*1024*1024, # 50 MB/s osd_recovery_sleep=0.1, - objects_per_chunk=30): + osd_recovery_max_active=1): """ Calculate Ceph effective recovery throughput considering chunking and sleep. 
object_size_bytes: size of one object in bytes recovery_bw_MBps: raw recovery bandwidth (MB/s) osd_recovery_sleep: sleep time after each chunk (seconds) - objects_per_chunk: number of objects recovered per batch (default 30) + osd_recovery_max_active: number of concurrent recovery operations per OSD """ - chunk_size_bytes = (object_size_bytes * objects_per_chunk) + chunk_size_bytes = (object_size_bytes * osd_recovery_max_active) # Time to recover chunk at given bandwidth (seconds) transfer_time = chunk_size_bytes / recovery_bw @@ -202,6 +202,58 @@ def effective_recovery_rate(object_size_bytes, return chunk_size_bytes / total_time +def infer_num_failure_domains(pgs): + """Infer number of failure domains from OSD and PG distribution.""" + + # discover all 'up' OSDs + osds = set() + for pg in pgs: + osds.update(pg) + + osd_tree = [list(osds)] # [[osd1, osd2, ...], [osd16, osd17, ...], ...] + + for pg in pgs: + pg_size = len(pg) + osd_to_host = {} + host_size = {} + host_index = 0 + for host in osd_tree: + for osd in pg: + if osd in host: + osd_to_host[osd] = host_index + try: + host_size[host_index] += 1 + except KeyError: + host_size[host_index] = 1 + host_index += 1 + + # split OSDs from this PG across different hosts + for host_index in range(len(osd_tree) + pg_size - 1): + if host_index not in host_size: + continue + if host_size[host_index] > 1: + first = True + j = 1 + for osd in list(osd_tree[host_index]): + if osd in pg: + if first: + first = False + continue + osd_tree[host_index].remove(osd) + try: + osd_tree[host_index + j].append(osd) + except IndexError: + osd_tree.append(list([osd])) + j += 1 + + return len(osd_tree) + + + + + + + # ---------------------------- Simulate Subcommand ---------------------------- def simulate(args): @@ -226,7 +278,7 @@ def simulate(args): else: raise ValueError("Specify either --size or both --k and --m.") - if args.domains_down > 0: + if not hasattr(args, 'pgs') and args.domains_down > 0: num_failures -= args.domains_down num_failures = max(0, num_failures) @@ -260,20 +312,23 @@ def simulate(args): # Print simulation summary print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})") - vprint(f" PGs: {num_pgs} total, averaging {num_pgs * pg_size / num_osds:.3g} per OSD") + print(f" PGs: {num_pgs} total, averaging {num_pgs * pg_size / num_osds:.3g} per OSD") else: - # simulating failures on a given pool + # analyzing simulated failures on a given pool pgs = args.pgs osds = args.osds + num_domains = args.num_domains + failure_domain = args.failure_domain + osds_per_domain = float(len(osds)) / num_domains num_pgs = len(pgs) num_osds = len(osds) pgs_per_osd = num_pgs * pg_size / num_osds # Print analyze summary print(f"Analyzing '{args.pool_name}':") - print(f" Info: {profile} pool with {num_osds} OSDs and {num_pgs} PGs") - vprint(f" PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD") + print(f" Info: {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s") + print(f" PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD") # Re-derive the stored bytes/objects per PG from tb_per_osd bytes_per_osd = args.tb_per_osd * 1024**4 @@ -345,6 +400,7 @@ def simulate(args): vprint() vprint(f" Finally, P. 
≥1 PG lost: {loss_rate * 100:.3g}%")
+    print(f"  Calculated PG Loss Rate: {loss_rate * 100:.3g}%")
 
     vprint()
     vprint(f"---------------------------")
@@ -414,12 +470,15 @@ def simulate(args):
     if simulations_with_loss > 0:
         loss_rate = simulations_with_loss / num_simulations
         vprint(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
+        print(f"  Monte Carlo PG Loss Rate: {loss_rate * 100:.3g}%")
     elif args.skip_monte_carlo:
         vprint(f"  Skipped Monte Carlo simulation as per user request. Using theoretical estimate.")
+        print(f"  Monte Carlo PG Loss Rate: - skipped; using theoretical estimate.")
         total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
     else:
 #        vprint(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
         vprint("  !! No data loss observed in simulations, using theoretical estimate.")
+        print(f"  Monte Carlo PG Loss Rate: - none found; using theoretical estimate.")
         total_pg_losses = simulations_with_loss = loss_rate * num_simulations # use theoretical estimate
 
     confidence = confidence_interval(simulations_with_loss, num_simulations)
@@ -443,7 +502,7 @@ def simulate(args):
     recovery_rate = effective_recovery_rate(args.object_size,
                                             args.recovery_rate*1024*1024,
                                             args.osd_recovery_sleep,
-                                            objects_per_chunk=30)
+                                            args.osd_recovery_max_active)
 
     try:
         vprint(f"  Estimated per-OSD recovery rate: {format_bytes(recovery_rate)}ps ({format_int(recovery_rate/args.object_size)} objects/s)")
@@ -580,7 +639,7 @@ def analyze_pool(args, pool, pg_dump):
         try:
             with open('ceph-osd-erasure-code-profile-get.json', 'r') as f:
                 ec_profile_data = json.load(f)
-            print("Using local ceph-osd-erasure-code-profile-get.json file.")
+            print("Reading ceph-osd-erasure-code-profile-get.json ...")
             args.k = int(ec_profile_data['k'])
             args.m = int(ec_profile_data['m'])
         except Exception as e2:
@@ -613,8 +672,13 @@ def analyze_pool(args, pool, pg_dump):
     args.num_osds = pg_dump['pg_map']['osd_stats_sum']['num_osds']
     assert args.num_osds == len(osds), "OSD count mismatch!" 
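+    # Assumption behind the inference below (heuristic): CRUSH never maps
+    # two OSDs of the same PG into one failure domain, so OSDs that
+    # co-appear in a PG must sit in different domains. The inferred count
+    # is therefore a conservative lower-bound estimate, not a reading of
+    # the actual CRUSH map.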
+ # infer the number of failure domains + args.num_domains = infer_num_failure_domains(pgs) + args.failure_domain = "domain" + vprint(f" Number of PGs: {args.pg_num}") vprint(f" Number of OSDs: {len(osds)}") + vprint(f" Number of {args.failure_domain}s: {args.num_domains}") # Derive tb_per_osd from pool size and pg stat_sum num_bytes bytes_stored = sum(pg['stat_sum']['num_bytes'] for pg in pg_stats if pg['pgid'].startswith(f"{pool['pool_id']}.")) @@ -660,7 +724,7 @@ def analyze(args): try: with open('ceph-osd-pool-ls-detail.json', 'r') as f: pool_data = json.load(f) - print("Using local ceph-osd-pool-ls-detail.json file.") + print("Reading ceph-osd-pool-ls-detail.json ...") except Exception as e2: print(f"Error loading local pool data: {e2}") return @@ -678,7 +742,7 @@ def analyze(args): try: with open('ceph-pg-dump.json', 'r') as f: pg_dump_data = json.load(f) - print("Using local ceph-pg-dump.json file.") + print("Reading ceph-pg-dump.json ...") vprint("") vprint("========================================") vprint("") @@ -728,6 +792,7 @@ def main(): sim.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)') sim.add_argument('--recovery-rate', type=float, default=50, help='Recovery rate per OSD (default: 50MBps)') sim.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)') + sim.add_argument('--osd-recovery-max-active', type=int, default=1, help='OSD Recovery Max Active (default: 1)') sim.add_argument('--domains-down', type=int, default=0, help='number of failure domains to take down in simulation (default: 0)') sim.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead') sim.add_argument('--max-mc-runtime', type=int, default=30, help='Maximum simulation runtime in seconds (default: 30s)') @@ -743,6 +808,8 @@ def main(): an.add_argument('--afr', type=float, default=0.02, help='Device AFR (default: 0.02)') an.add_argument('--recovery-rate', type=int, default=50, help='Per-OSD data recovery rate (default: 50MBps)') an.add_argument('--osd-recovery-sleep', type=float, default=0.1, help='OSD Recovery Sleep in seconds (default: 0.1s)') + an.add_argument('--osd-recovery-max-active', type=int, default=1, help='OSD Recovery Max Active (default: 1)') + an.add_argument('--domains-down', type=int, default=0, help='number of failure domains to take down in simulation (default: 0)') an.add_argument('--skip-monte-carlo', action='store_true', help='Skip the Monte Carlo sims and use theoretical estimates instead') an.add_argument('--max-mc-runtime', type=int, default=30, help='Maximum Monte Carlo simulation runtime in seconds per pool (default: 30s)') an.set_defaults(func=analyze) From 64f9f855c76e23e4c82f0bb633684582b51c9602 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 10:50:56 -0800 Subject: [PATCH 08/28] clyso-nines: readability Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 60 +++++++++++----------- 1 file changed, 31 insertions(+), 29 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index a6c1765..483a442 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -205,54 +205,56 @@ def effective_recovery_rate(object_size_bytes, def infer_num_failure_domains(pgs): """Infer number of failure domains from OSD and PG distribution.""" - # discover all 'up' OSDs + # discover all 
'up' OSDs -- those are outputs of the crush map osds = set() for pg in pgs: osds.update(pg) - osd_tree = [list(osds)] # [[osd1, osd2, ...], [osd16, osd17, ...], ...] + # start with a single host containing all OSDs + osd_tree = [list(osds)] for pg in pgs: pg_size = len(pg) + + # map OSDs to hosts, and count how many PG OSDs fall into each host osd_to_host = {} host_size = {} - host_index = 0 - for host in osd_tree: + + for host_idx, host in enumerate(osd_tree): + count = 0 for osd in pg: if osd in host: - osd_to_host[osd] = host_index - try: - host_size[host_index] += 1 - except KeyError: - host_size[host_index] = 1 - host_index += 1 + osd_to_host[osd] = host_idx + count += 1 + if count > 0: + host_size[host_idx] = count # split OSDs from this PG across different hosts - for host_index in range(len(osd_tree) + pg_size - 1): - if host_index not in host_size: + for host_idx in range(len(osd_tree) + pg_size - 1): + if host_size[host_index] < 1: continue - if host_size[host_index] > 1: - first = True - j = 1 - for osd in list(osd_tree[host_index]): - if osd in pg: - if first: - first = False - continue - osd_tree[host_index].remove(osd) - try: - osd_tree[host_index + j].append(osd) - except IndexError: - osd_tree.append(list([osd])) - j += 1 - - return len(osd_tree) - + if host_idx not in host_size: + continue + first = True + new_host_offset = 1 + for osd in list(osd_tree[host_idx]): + if osd not in pg: + continue + if first: + first = False + continue + osd_tree[host_idx].remove(osd) + try: + osd_tree[host_idx + j].append(osd) + except IndexError: + osd_tree.append(list([osd])) + new_host_offset += 1 + return len(osd_tree) # ---------------------------- Simulate Subcommand ---------------------------- def simulate(args): From 1e50906cee256b30be53c437fcd6ba2f181ac7ba Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 11:00:19 -0800 Subject: [PATCH 09/28] clyso-nines: oops Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 483a442..a509e4a 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -231,10 +231,10 @@ def infer_num_failure_domains(pgs): # split OSDs from this PG across different hosts for host_idx in range(len(osd_tree) + pg_size - 1): - if host_size[host_index] < 1: + if host_idx not in host_size: continue - if host_idx not in host_size: + if host_size[host_idx] < 1: continue first = True @@ -249,7 +249,7 @@ def infer_num_failure_domains(pgs): osd_tree[host_idx].remove(osd) try: - osd_tree[host_idx + j].append(osd) + osd_tree[host_idx + new_host_offset].append(osd) except IndexError: osd_tree.append(list([osd])) new_host_offset += 1 From 9e1b11b412ea3d292da37deac1e785e119baeb29 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 12:47:29 -0800 Subject: [PATCH 10/28] clyso-nines: allow floating point --tb-per-osd Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index a509e4a..323016a 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -782,7 +782,7 @@ def main(): sim.add_argument('--failure-domain', type=str, default="host", help="Type of failure domain 
(host/rack/...) (default: host)") sim.add_argument('--num-domains', type=int, default=10, help="number of failure domains (default: 10)") sim.add_argument('--osds-per-domain', type=int, default=16, help="number of OSDs per failure domain (default: 16)") - sim.add_argument('--tb-per-osd', type=int, default=10, help="raw data per OSD in TB (default: 10)") + sim.add_argument('--tb-per-osd', type=float, default=10.0, help="raw data per OSD in TB (default: 10.0)") sim.add_argument('--object-size', type=int, default=4*1024*1024, help="average object size in bytes (default: 4MB)") sim.add_argument('--pgs-per-osd', type=int, default=100, help="target PGs per OSD (default: 100)") sim.add_argument('--size', type=int, default=3, help="simulate a replicated pool with SIZE replicas (default: 3)") From 12f4c9f0f96f55189dbc06aa07b2d18c0dabd324 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 13:24:13 -0800 Subject: [PATCH 11/28] clyso-nines: add some test cases Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 98 +++++++++++++++++++++- 1 file changed, 95 insertions(+), 3 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 323016a..4c45550 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -67,7 +67,7 @@ def count_nines(value, max_digits=20): s = f"{rounded:.{max_digits}f}" if not s.startswith("0."): - return f"{max_digits} nines" # Treat 1.0 as "max nines" + return f"{max_digits}-nines" # Treat 1.0 as "max nines" decimal_part = s.split('.')[1] count = 0 @@ -258,7 +258,6 @@ def infer_num_failure_domains(pgs): # ---------------------------- Simulate Subcommand ---------------------------- def simulate(args): - # Set random seed for reproducibility random.seed(args.seed) @@ -767,6 +766,95 @@ def analyze(args): return +def run_tests(args): + # run some known simulate commands + # add all args + test_args = [ + [ + argparse.Namespace( + verbose=False, + failure_domain="host", + num_domains=10, + osds_per_domain=16, + tb_per_osd=10.0, + object_size=4*1024*1024, + pgs_per_osd=100, + size=3, + k=None, + m=None, + simulations=1000, + seed=42, + fudge=100, + afr=0.02, + recovery_rate=50, + osd_recovery_sleep=0.1, + osd_recovery_max_active=1, + domains_down=0, + skip_monte_carlo=False, + max_mc_runtime=30 + ), + ("197.1 thousand years", "10-nines") + ], + [ + argparse.Namespace( + verbose=False, + failure_domain="host", + num_domains=20, + osds_per_domain=16, + tb_per_osd=10.0, + object_size=4*1024*1024, + pgs_per_osd=100, + k=6, + m=3, + size=None, + simulations=1000, + seed=42, + fudge=100, + afr=0.02, + recovery_rate=50, + osd_recovery_sleep=0.1, + osd_recovery_max_active=1, + domains_down=0, + skip_monte_carlo=False, + max_mc_runtime=30 + ), + ("15.4 thousand years", "10-nines") + ], + [ + argparse.Namespace( + verbose=False, + failure_domain="host", + num_domains=500, + osds_per_domain=40, + tb_per_osd=0.0001, + object_size=4*1024*1024, + pgs_per_osd=0.01, + size=3, + k=None, + m=None, + simulations=1000, + seed=42, + fudge=100, + afr=0.02, + recovery_rate=50, + osd_recovery_sleep=0.1, + osd_recovery_max_active=1, + domains_down=0, + skip_monte_carlo=False, + max_mc_runtime=30 + ), + ("80.8 billion years", "20-nines") + ] + ] + for i, test_arg in enumerate(test_args): + print(f"Running test case {i+1}...") + (mttdl, nines) = simulate(test_arg[0]) + assert test_arg[1][0] == mttdl, f"MTTDL test case {i+1} failed! 
{mttdl} != {test_arg[1][0]}" + assert test_arg[1][1] == nines, f"Durability test case {i+1} failed! {nines} != {test_arg[1][1]}" + print(f"\nTest case {i+1} completed.") + print("========================================") + + # ---------------------------- Main CLI ---------------------------- def main(): parser = argparse.ArgumentParser( @@ -784,7 +872,7 @@ def main(): sim.add_argument('--osds-per-domain', type=int, default=16, help="number of OSDs per failure domain (default: 16)") sim.add_argument('--tb-per-osd', type=float, default=10.0, help="raw data per OSD in TB (default: 10.0)") sim.add_argument('--object-size', type=int, default=4*1024*1024, help="average object size in bytes (default: 4MB)") - sim.add_argument('--pgs-per-osd', type=int, default=100, help="target PGs per OSD (default: 100)") + sim.add_argument('--pgs-per-osd', type=float, default=100, help="target PGs per OSD (default: 100)") sim.add_argument('--size', type=int, default=3, help="simulate a replicated pool with SIZE replicas (default: 3)") sim.add_argument('--k', type=int, help="simulate an erasure coded pool with K data shards") sim.add_argument('--m', type=int, help="simulate an erasure coded pool with M parity shards") @@ -816,6 +904,10 @@ def main(): an.add_argument('--max-mc-runtime', type=int, default=30, help='Maximum Monte Carlo simulation runtime in seconds per pool (default: 30s)') an.set_defaults(func=analyze) + test = subparsers.add_parser('test', help='run internal tests') + test.add_argument('--verbose', action='store_true', help='enable verbose output') + test.set_defaults(func=run_tests) + args = parser.parse_args() if not args.command: parser.print_help() From 6ab8ccb234736fc0f28b62cbf1127d0a50fd4d5d Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 14:49:48 -0800 Subject: [PATCH 12/28] clyso-nines: use log-sum-exp trick to avoid overflow Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 68 +++++++++++++++++++--- 1 file changed, 61 insertions(+), 7 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 4c45550..54cf043 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -97,14 +97,41 @@ def mttdl(N, k, afr, fudge, recovery_time_sec): mttdl_seconds = recovery_time_sec / prob_k_failures return mttdl_seconds -def prob_exact_k_failures(n, p, k): - return comb(n, k) * (p ** k) * ((1 - p) ** (n - k)) +# https://en.wikipedia.org/wiki/Binomial_distribution +def log_prob_exact_k_failures(n, p, k): + if p == 0: + return float('-inf') if k > 0 else 0.0 + if p == 1: + return float('-inf') if k < n else 0.0 + return ( + math.log(comb(n, k)) + + k * math.log(p) + + (n - k) * math.log1p(-p) + ) + +def prob_k_or_more_failures(n, p, k, cutoff=1e-30): + """Stable + fast binomial tail probability P(X>=k).""" + logs = [] + max_log = -float('inf') + + # Only go forward until terms become negligible + for i in range(k, n + 1): + lp = log_prob_exact_k_failures(n, p, i) + + # Track maximum for log-sum-exp normalization + if lp > max_log: + max_log = lp + + logs.append(lp) + + # Stop when next term is extremely small + if len(logs) > 5: + # Compare last 5 terms with peak + if all((max_log - x) > -math.log(cutoff) for x in logs[-5:]): + break -def prob_k_or_more_failures(n, p, k): - cumulative = 0 - for i in range(k): # 0, 1, 2, ..., k-1 - cumulative += prob_exact_k_failures(n, p, i) - return 1 - cumulative + # log-sum-exp + return math.exp(max_log) * 
sum(math.exp(x - max_log) for x in logs) def expected_max_osd_load(n_osds, n_pgs): if n_pgs == 0 or n_osds == 0: @@ -765,6 +792,31 @@ def analyze(args): return +def test_prob_k_or_more_failures(): + # (n, p, k, expected_P_X_ge_k) + test_cases = [ + # Small n, moderate p (exact binomial) + (10, 0.1, 2, 0.263901), + (20, 0.05, 3, 0.0754837), + (50, 0.02, 2, 0.264229), + + # Ceph-like: moderate n, very small p (exact binomial) + (1000, 1e-6, 2, 4.99168e-07), + (1000, 1e-5, 2, 4.96189e-05), + (5000, 2e-7, 2, 4.99567e-07), + (10000, 1e-8, 2, 4.99917e-09), + (10000, 1e-7, 3, 1.66492e-10), + ] + + for n, p, k, expected in test_cases: + got = prob_k_or_more_failures(n, p, k) + if not math.isclose(got, expected, rel_tol=0.00001): + raise AssertionError( + f"Failed for n={n}, p={p}, k={k}: " + f"got {got:.18g}, expected {expected:.18g}" + ) + + print("\nAll prob_k_or_more_failures tests passed.") def run_tests(args): # run some known simulate commands @@ -854,6 +906,8 @@ def run_tests(args): print(f"\nTest case {i+1} completed.") print("========================================") + test_prob_k_or_more_failures() + # ---------------------------- Main CLI ---------------------------- def main(): From 9859a5e2dc31a63f5a23cbbd455b077f123f0a05 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Tue, 25 Nov 2025 14:56:40 -0800 Subject: [PATCH 13/28] clyso-nines: restore cursor on exit Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 54cf043..7990eba 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -24,9 +24,7 @@ import random import time import shlex import itertools - -# hide cursor -print('\033[?25l', end="") +import sys VERBOSE = False @@ -984,4 +982,15 @@ def main(): vprint() if __name__ == "__main__": - main() + # Hide cursor + print('\033[?25l', end='', flush=True) + + try: + main() + except KeyboardInterrupt: + print("\nInterrupted.", file=sys.stderr) + except Exception: + traceback.print_exc() + finally: + # Restore cursor + print('\033[?25h', end='', flush=True) From 4ee3ed78706049cf80ac04b414fb65e47c840d4a Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 8 Dec 2025 08:33:17 -0800 Subject: [PATCH 14/28] clyso-nines: add --debug Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 7990eba..ba8f8c5 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -1,5 +1,5 @@ #!/usr/bin/env python3 - +# # clyso-nines - A tool to estimate data loss risk in Ceph clusters # # Copyright (C) 2025 Dan van der Ster @@ -27,11 +27,16 @@ import itertools import sys VERBOSE = False +DEBUG = False # ---------------------------- Utilities ---------------------------- def vprint(*args, **kwargs): - if VERBOSE: + if VERBOSE or DEBUG: + print(*args, **kwargs) + +def dprint(*args, **kwargs): + if DEBUG: print(*args, **kwargs) def reconstruct_command_line(parser, args): @@ -919,6 +924,7 @@ def main(): sim = subparsers.add_parser('simulate', help='Calculate durability of a randomized pool') sim.add_argument('--verbose', action='store_true', help='enable verbose output') + sim.add_argument('--debug', 
action='store_true', help='enable debug output') sim.add_argument('--failure-domain', type=str, default="host", help="Type of failure domain (host/rack/...) (default: host)") sim.add_argument('--num-domains', type=int, default=10, help="number of failure domains (default: 10)") sim.add_argument('--osds-per-domain', type=int, default=16, help="number of OSDs per failure domain (default: 16)") @@ -942,6 +948,7 @@ def main(): an = subparsers.add_parser('analyze', help='Calculate durability of the local pools') an.add_argument('--verbose', action='store_true', help='enable verbose output') + an.add_argument('--debug', action='store_true', help='enable debug output') an.add_argument('--cluster', type=str, default='ceph', help='Ceph cluster name (default: ceph)') an.add_argument('--pool', type=str, help='analyze only the given pool (default: all pools)') an.add_argument('--simulations', type=int, default=1000, help="number of incidents to simulate (default: 1000)") @@ -958,6 +965,7 @@ def main(): test = subparsers.add_parser('test', help='run internal tests') test.add_argument('--verbose', action='store_true', help='enable verbose output') + test.add_argument('--debug', action='store_true', help='enable debug output') test.set_defaults(func=run_tests) args = parser.parse_args() @@ -965,7 +973,9 @@ def main(): parser.print_help() else: global VERBOSE + global DEBUG VERBOSE = args.verbose + DEBUG = args.debug # Reconstruct command cmdline = reconstruct_command_line(parser, args) # Print header @@ -990,6 +1000,7 @@ if __name__ == "__main__": except KeyboardInterrupt: print("\nInterrupted.", file=sys.stderr) except Exception: + import traceback traceback.print_exc() finally: # Restore cursor From 49590bfd6d1f2514374d8e88355d5c011db4c78f Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 8 Dec 2025 08:37:07 -0800 Subject: [PATCH 15/28] clyso-nines: fix comb() for k > n Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index ba8f8c5..4593cba 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -85,8 +85,10 @@ def comb(n, k): """Return C(n, k) with math.comb-like semantics.""" if not (isinstance(n, int) and isinstance(k, int)): raise TypeError("n and k must be integers") - if n < 0 or k < 0 or k > n: - raise ValueError("n must be >= 0 and 0 <= k <= n") + if n < 0: + raise ValueError("n must be non-negative") + if k < 0 or k > n: + return 0 k = min(k, n - k) # symmetry c = 1 for i in range(1, k + 1): From 5b8abb31f7dc48a9f11fbf0e0e8d140de60efced Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 8 Dec 2025 08:38:28 -0800 Subject: [PATCH 16/28] clyso-nines: use 99.9% confidence interval Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 4593cba..b7e044e 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -201,11 +201,12 @@ def simple_crush(osd_ids, number_of_domains, size, num_pgs): return pgs -def confidence_interval(successes, trials, confidence=0.95): +def confidence_interval(successes, trials): if trials == 0: return (0, 0) p_hat = successes / trials - z = 1.96 # For 95% confidence + # 99.9% 
confidence interval using normal approximation + z = 3.2905 # z-score for 99.9% confidence margin_of_error = z * ((p_hat * (1 - p_hat)) / trials) ** 0.5 return (max(0, p_hat - margin_of_error), min(1, p_hat + margin_of_error)) From 0317382a18b8e19732273d63fb332671a568f2da Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 8 Dec 2025 08:42:46 -0800 Subject: [PATCH 17/28] clyso-nines: retain failure domains Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index b7e044e..e1b6712 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -188,6 +188,8 @@ def simple_crush(osd_ids, number_of_domains, size, num_pgs): domains[domain].append(osd) domain_ids = list(domains.keys()) + for domain in domain_ids: + dprint(f"Domain {domain}: OSDs {domains[domain]}") pgs = [] for _ in range(num_pgs): @@ -198,8 +200,9 @@ def simple_crush(osd_ids, number_of_domains, size, num_pgs): # Pick a random OSD from the selected domain pg_osds.append(random.choice(domains[domain])) pgs.append(tuple(pg_osds)) + dprint(f"Generated PG: {pgs[-1]} from domains {chosen_domains}") - return pgs + return domains, pgs def confidence_interval(successes, trials): if trials == 0: @@ -338,11 +341,8 @@ def simulate(args): num_pgs = round_to_nearest_power_of_two((num_osds * pgs_per_osd) / pg_size) osds = list(range(1, num_osds + 1)) - vprint(f"Generating {num_pgs} PGs with size {pg_size} over {num_osds} OSDs...") - if not args.skip_monte_carlo: - pgs = simple_crush(osds,num_domains,pg_size,num_pgs) # [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)] - else: - pgs = [tuple(random.sample(osds, pg_size)) for _ in range(num_pgs)] + vprint(f"Generating {num_pgs} PGs using simple CRUSH-like algorithm...") + domains, pgs = simple_crush(osds,num_domains,pg_size,num_pgs) # Print simulation summary print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})") @@ -358,7 +358,7 @@ def simulate(args): num_pgs = len(pgs) num_osds = len(osds) pgs_per_osd = num_pgs * pg_size / num_osds - + # Print analyze summary print(f"Analyzing '{args.pool_name}':") print(f" Info: {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s") @@ -707,7 +707,8 @@ def analyze_pool(args, pool, pg_dump): assert args.num_osds == len(osds), "OSD count mismatch!" 
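+    # Sketch of the design choice: keeping the per-domain OSD lists (not
+    # just their count) lets later code fail an entire inferred domain at
+    # once, e.g. when simulating --domains-down incidents.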
# infer the number of failure domains - args.num_domains = infer_num_failure_domains(pgs) + domains = infer_failure_domains(pgs) + args.num_domains = len(domains) args.failure_domain = "domain" vprint(f" Number of PGs: {args.pg_num}") From 7cba895f6b45917abf7c0305f5a47b15391ea534 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 15 Dec 2025 15:31:46 -0800 Subject: [PATCH 18/28] clyso-nines: optimize simple_crush Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index e1b6712..7862584 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -177,30 +177,31 @@ def format_time(seconds): return f"{round(seconds / thresholds[i], 1)} {units[i]}" return f"{round(seconds)} s" -def simple_crush(osd_ids, number_of_domains, size, num_pgs): - if size > number_of_domains: - raise ValueError("Replication size cannot be greater than the number of failure domains.") +def simple_crush(osd_ids, num_domains, pg_size, num_pgs): + if pg_size > num_domains: + raise ValueError("PG size cannot be greater than num failure domains.") - # Group OSDs into failure domains evenly + # Group OSDs into failure domains domains = defaultdict(list) for i, osd in enumerate(osd_ids): - domain = i % number_of_domains + domain = i % num_domains domains[domain].append(osd) domain_ids = list(domains.keys()) - for domain in domain_ids: - dprint(f"Domain {domain}: OSDs {domains[domain]}") + if DEBUG: + for domain in domain_ids: + dprint(f"Domain {domain}: OSDs {domains[domain]}") pgs = [] for _ in range(num_pgs): # Randomly choose failure domains for this PG - chosen_domains = random.sample(domain_ids, size) + chosen_domains = random.sample(domain_ids, pg_size) pg_osds = [] for domain in chosen_domains: # Pick a random OSD from the selected domain pg_osds.append(random.choice(domains[domain])) pgs.append(tuple(pg_osds)) - dprint(f"Generated PG: {pgs[-1]} from domains {chosen_domains}") + if DEBUG: dprint(f"Generated PG: {pgs[-1]} from domains {chosen_domains}") return domains, pgs From da02104688dd019067f8b642a77581744d5a56fb Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 15 Dec 2025 15:40:37 -0800 Subject: [PATCH 19/28] clyso-nines: time simple_crush method Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 7862584..8100bc0 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -343,7 +343,10 @@ def simulate(args): num_pgs = round_to_nearest_power_of_two((num_osds * pgs_per_osd) / pg_size) osds = list(range(1, num_osds + 1)) vprint(f"Generating {num_pgs} PGs using simple CRUSH-like algorithm...") - domains, pgs = simple_crush(osds,num_domains,pg_size,num_pgs) + time_start = time.time() + domains, pgs = simple_crush(osds, num_domains, pg_size, num_pgs) + time_end = time.time() + dprint(f" Generated {num_pgs} PGs in {time_end - time_start:.2f} seconds.") # Print simulation summary print(f"Simulating a {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s ({osds_per_domain} OSDs per {failure_domain})") From 4da96d5cbb25ff1707e9f779d00b6834a2832882 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Mon, 
15 Dec 2025 15:41:38 -0800
Subject: [PATCH 20/28] clyso-nines: print PG distribution

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index 8100bc0..ca69f82 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -368,6 +368,28 @@ def simulate(args):
     print(f"  Info: {profile} pool with {num_osds} OSDs and {num_domains} {failure_domain}s")
     print(f"  PGs: {num_pgs} total, ~{num_pgs * pg_size / num_osds:.3g} per OSD")
 
+    # --- Calculate OSD distribution stats ---
+    osd_counts = defaultdict(int)
+    for pg in pgs:
+        for osd in pg:
+            osd_counts[osd] += 1
+
+    # PG counts for the full OSD list; OSDs absent from every PG count as 0
+    counts = [osd_counts.get(osd, 0) for osd in osds]
+
+    if counts:
+        min_pgs = min(counts)
+        max_pgs = max(counts)
+        mean_pgs = sum(counts) / len(counts)
+        variance = sum((x - mean_pgs) ** 2 for x in counts) / len(counts)
+        std_dev = math.sqrt(variance)
+
+        print(f"  OSD Distribution (PGs per OSD):")
+        print(f"    Min: {min_pgs}")
+        print(f"    Max: {max_pgs}")
+        print(f"    Mean: {mean_pgs:.2f}")
+        print(f"    Std: {std_dev:.2f}")
+
     # Re-derive the stored bytes/objects per PG from tb_per_osd
     bytes_per_osd = args.tb_per_osd * 1024**4
     total_bytes = num_osds * bytes_per_osd

From 59f2eee20540f37945ddb24cf49a7d865c0411e3 Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Mon, 15 Dec 2025 15:42:40 -0800
Subject: [PATCH 21/28] clyso-nines: print backtrace on interrupt

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index ca69f82..321cdf3 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -1029,6 +1029,8 @@ if __name__ == "__main__":
         main()
     except KeyboardInterrupt:
         print("\nInterrupted.", file=sys.stderr)
+        import traceback
+        traceback.print_exc()
     except Exception:
         import traceback
         traceback.print_exc()

From 5bfb1c88a999fdbd41e4875e843f5f7b75cefb82 Mon Sep 17 00:00:00 2001
From: Dan van der Ster
Date: Tue, 16 Dec 2025 09:26:45 -0800
Subject: [PATCH 22/28] clyso-nines: infer failure domains

Signed-off-by: Dan van der Ster
---
 otto/src/clyso/ceph/otto/tools/clyso-nines | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines
index 321cdf3..5b19587 100755
--- a/otto/src/clyso/ceph/otto/tools/clyso-nines
+++ b/otto/src/clyso/ceph/otto/tools/clyso-nines
@@ -239,8 +239,8 @@ def effective_recovery_rate(object_size_bytes,
 
     return chunk_size_bytes / total_time
 
-def infer_num_failure_domains(pgs):
-    """Infer number of failure domains from OSD and PG distribution."""
+def infer_failure_domains(pgs):
+    """Infer failure domains from OSD and PG distribution."""
 
     # discover all 'up' OSDs -- those are outputs of the crush map
     osds = set()
@@ -291,7 +291,14 @@ def infer_num_failure_domains(pgs):
                     osd_tree.append(list([osd]))
                 new_host_offset += 1
 
-    return len(osd_tree)
+    dprint("Inferred failure domains:")
+    for idx, host in enumerate(osd_tree):
+        dprint(f"  Domain {idx}: OSDs {host}")
+
+    return osd_tree
+
+
 
 # ---------------------------- Simulate Subcommand ----------------------------
 def simulate(args):

From 6fc994cc60ab33a68a9dba5b8faa65d618a10cb7 Mon Sep 
17 00:00:00 2001 From: Dan van der Ster Date: Tue, 16 Dec 2025 12:07:56 -0800 Subject: [PATCH 23/28] clyso-nines: vprint the pg dist Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 5b19587..ac288f1 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -391,11 +391,11 @@ def simulate(args): variance = sum((x - mean_pgs) ** 2 for x in counts) / len(counts) std_dev = math.sqrt(variance) - print(f" OSD Distribution (PGs per OSD):") - print(f" Min: {min_pgs}") - print(f" Max: {max_pgs}") - print(f" Mean: {mean_pgs:.2f}") - print(f" Std: {std_dev:.2f}") + vprint(f" OSD Distribution (PGs per OSD):") + vprint(f" Min: {min_pgs}") + vprint(f" Max: {max_pgs}") + vprint(f" Mean: {mean_pgs:.2f}") + vprint(f" Std: {std_dev:.2f}") # Re-derive the stored bytes/objects per PG from tb_per_osd bytes_per_osd = args.tb_per_osd * 1024**4 From 82972eb994e42fd262e0e4a5f6b9a253119f623a Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Thu, 29 Jan 2026 16:09:33 -0800 Subject: [PATCH 24/28] clyso-nines: improve time print Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index ac288f1..b104b35 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -169,7 +169,7 @@ def format_int(val): return f"~{round(val)}" def format_time(seconds): - units = ["s", "min", "hr", "days", "years", "thousand years", "million years", "billion years"] + units = ["s", "m", "h", "d", "y", "thousand years", "million years", "billion years"] thresholds = [1, 60, 3600, 86400, 86400*365, 86400*365*1000, 86400*365*10**6, 86400*365*10**9] for i in reversed(range(1, len(units))): From d7e0003ae5820dc9cbec3d20e67f5620931b4916 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Thu, 29 Jan 2026 16:10:43 -0800 Subject: [PATCH 25/28] clyso-nines: print command line for test output Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 29 ++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index b104b35..53e03c3 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -936,10 +936,39 @@ def run_tests(args): max_mc_runtime=30 ), ("80.8 billion years", "20-nines") + ], + [ + argparse.Namespace( + verbose=False, + failure_domain="host", + num_domains=500, + osds_per_domain=40, + tb_per_osd=10, + object_size=4*1024, + pgs_per_osd=500, + size=None, + k=8, + m=3, + simulations=1000, + seed=42, + fudge=100, + afr=0.02, + recovery_rate=50, + osd_recovery_sleep=0.1, + osd_recovery_max_active=1, + domains_down=0, + skip_monte_carlo=True, + max_mc_runtime=30 + ), + ("11.4 thousand years", "20-nines") ] ] for i, test_arg in enumerate(test_args): print(f"Running test case {i+1}...") + parser = argparse.ArgumentParser(prog='clyso-nines') + test_arg[0].command = 'simulate' + cmdline = reconstruct_command_line(parser, test_arg[0]) + print(f"Command line: {cmdline}") (mttdl, nines) = simulate(test_arg[0]) assert test_arg[1][0] == mttdl, f"MTTDL test case {i+1} failed! 
{mttdl} != {test_arg[1][0]}" assert test_arg[1][1] == nines, f"Durability test case {i+1} failed! {nines} != {test_arg[1][1]}" From 0a67848a8e69c10329760c1ee89cfd471b8fd59d Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Thu, 29 Jan 2026 16:13:22 -0800 Subject: [PATCH 26/28] clyso-nines: when a host is down for maintenance, do not amplify the likelihood of additional failures Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 53e03c3..8cc0215 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -1042,6 +1042,11 @@ def main(): global DEBUG VERBOSE = args.verbose DEBUG = args.debug + + # FIXME: The math breaks without this workaround + if args.domains_down > 0: + args.fudge = 1 + # Reconstruct command cmdline = reconstruct_command_line(parser, args) # Print header From 1c0fd452574ebfdcf6d8c55c7b13bc8f11bf4d9a Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Thu, 29 Jan 2026 16:20:53 -0800 Subject: [PATCH 27/28] clyso-nines: adjust the fudge factor for simulate and analyze subcommands Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 8cc0215..43c9501 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -305,6 +305,10 @@ def simulate(args): # Set random seed for reproducibility random.seed(args.seed) + # FIXME: The math breaks without this workaround + if args.domains_down > 0: + args.fudge = 1 + raw_to_stored_factor = 0 profile = "" # Derive PG size and min_size @@ -686,6 +690,10 @@ def analyze_pool(args, pool, pg_dump): import subprocess import json + # FIXME: The math breaks without this workaround + if args.domains_down > 0: + args.fudge = 1 + vprint(f"Analyzing pool '{pool['pool_name']}' (id {pool['pool_id']}, size {pool['size']}) ") if 'erasure_code_profile' in pool and pool['erasure_code_profile']: vprint(f" Type: erasure-coded ({pool['erasure_code_profile']})") @@ -1043,10 +1051,6 @@ def main(): VERBOSE = args.verbose DEBUG = args.debug - # FIXME: The math breaks without this workaround - if args.domains_down > 0: - args.fudge = 1 - # Reconstruct command cmdline = reconstruct_command_line(parser, args) # Print header From a76b622f0ac1eb7eeb922eed95d706920d43cb67 Mon Sep 17 00:00:00 2001 From: Dan van der Ster Date: Thu, 29 Jan 2026 16:22:49 -0800 Subject: [PATCH 28/28] clyso-nines: improve the calculation, which was broken for domains-down > 0 Signed-off-by: Dan van der Ster --- otto/src/clyso/ceph/otto/tools/clyso-nines | 321 ++++++++++++++------- 1 file changed, 219 insertions(+), 102 deletions(-) diff --git a/otto/src/clyso/ceph/otto/tools/clyso-nines b/otto/src/clyso/ceph/otto/tools/clyso-nines index 43c9501..109dbce 100755 --- a/otto/src/clyso/ceph/otto/tools/clyso-nines +++ b/otto/src/clyso/ceph/otto/tools/clyso-nines @@ -298,10 +298,135 @@ def infer_failure_domains(pgs): return osd_tree +def calculate_pg_loss_rate(num_domains, + osds_per_domain, + pg_size, + num_pgs, + num_failures_for_data_loss, + num_osds, + prefailed_domains=0): + + vprint(f"---------------------------") + vprint(f"Theoretical loss rate calculation (with {prefailed_domains} pre-failed domains):") + + H = 
int(num_domains) + f_total_threshold = int(num_failures_for_data_loss) + f_new = max(0, f_total_threshold - int(prefailed_domains)) + + H_dead = int(prefailed_domains) + n = int(osds_per_domain) + r = int(pg_size) + N = int(num_osds) + + if H_dead >= H: return 1.0 + + H_alive = H - H_dead + N_alive = H_alive * n + + # 1. Eligibility (Global) + if f_new == 0: + eligibility = 1.0 + elif f_new > H_alive: + eligibility = 0.0 + else: + try: + eligibility = (comb(H_alive, f_new) * (n ** f_new)) / comb(N_alive, f_new) + except ValueError: + eligibility = 0.0 + + vprint(f" 1. Eligibility (New failures on distinct alive hosts): {eligibility:.3g}") + + # 2. Iterate through PG "Vulnerability Buckets" + prob_pool_survives = 1.0 + + for k in range(r + 1): + # Safety checks + if k > H_dead: continue + if (r - k) > H_alive: continue + + # Proportion of PGs in this state + try: + prob_pg_state_k = (comb(H_dead, k) * comb(H_alive, r - k)) / comb(H, r) + except ValueError: continue + + if prob_pg_state_k == 0: continue + + num_pgs_in_state = num_pgs * prob_pg_state_k + + # The number of *additional* shards we need to lose to kill this PG + needed = max(0, f_total_threshold - k) + + p_kill_pg = 0.0 + + if needed <= 0: + p_kill_pg = 1.0 + elif f_new < needed: + p_kill_pg = 0.0 + else: + # For EC, we might lose 'x' shards, where 'needed' <= x <= 'f_new'. + # Any overlap >= needed is fatal. + # We sum the probabilities of all fatal overlap sizes. + + r_alive = r - k # Healthy legs of this PG + + # The denominator for picking hosts (how many ways to pick f_new hosts from H_alive) + total_host_combos = comb(H_alive, f_new) + + # We iterate x (the number of hits on this PG's healthy legs) + # Max hits is min(f_new, r_alive) because we can't hit more hosts than we failed, + # and we can't hit more legs than the PG has. + for x in range(needed, min(f_new, r_alive) + 1): + # 1. Choose 'x' hosts that are part of the PG (from r_alive) + ways_hit_pg = comb(r_alive, x) + + # 2. Choose the remaining 'f_new - x' failures from non-PG hosts (H_alive - r_alive) + ways_miss_pg = comb(H_alive - r_alive, f_new - x) + + if total_host_combos > 0: + prob_hosts = (ways_hit_pg * ways_miss_pg) / total_host_combos + else: + prob_hosts = 0 + + # 3. Probability that we hit the specific OSDs on those 'x' hosts + # Since we assume 1 failure per host (eligibility), simply (1/n)^x + prob_osds = (1 / n) ** x + + p_kill_pg += prob_hosts * prob_osds + + # Aggregate Risk for this bucket + if p_kill_pg >= 1.0: + prob_survive_bucket = math.exp(-num_pgs_in_state) + elif p_kill_pg <= 0.0: + prob_survive_bucket = 1.0 + else: + # High precision aggregation + prob_survive_bucket = math.exp(num_pgs_in_state * math.log1p(-p_kill_pg)) + + vprint(f" Bucket [Overlap={k}]: {prob_pg_state_k*100:.1f}% of PGs. 
+
+    # 2. Iterate through PG "Vulnerability Buckets"
+    prob_pool_survives = 1.0
+
+    for k in range(r + 1):
+        # Safety checks
+        if k > H_dead: continue
+        if (r - k) > H_alive: continue
+
+        # Proportion of PGs in this state
+        try:
+            prob_pg_state_k = (comb(H_dead, k) * comb(H_alive, r - k)) / comb(H, r)
+        except ValueError: continue
+
+        if prob_pg_state_k == 0: continue
+
+        num_pgs_in_state = num_pgs * prob_pg_state_k
+
+        # The number of *additional* shards we need to lose to kill this PG
+        needed = max(0, f_total_threshold - k)
+
+        p_kill_pg = 0.0
+
+        if needed <= 0:
+            p_kill_pg = 1.0
+        elif f_new < needed:
+            p_kill_pg = 0.0
+        else:
+            # For EC, we might lose 'x' shards, where 'needed' <= x <= 'f_new'.
+            # Any overlap >= needed is fatal.
+            # We sum the probabilities of all fatal overlap sizes.
+
+            r_alive = r - k  # Healthy legs of this PG
+
+            # The denominator for picking hosts (how many ways to pick f_new hosts from H_alive)
+            total_host_combos = comb(H_alive, f_new)
+
+            # We iterate x (the number of hits on this PG's healthy legs)
+            # Max hits is min(f_new, r_alive) because we can't hit more hosts than we failed,
+            # and we can't hit more legs than the PG has.
+            for x in range(needed, min(f_new, r_alive) + 1):
+                # 1. Choose 'x' hosts that are part of the PG (from r_alive)
+                ways_hit_pg = comb(r_alive, x)
+
+                # 2. Choose the remaining 'f_new - x' failures from non-PG hosts (H_alive - r_alive)
+                ways_miss_pg = comb(H_alive - r_alive, f_new - x)
+
+                if total_host_combos > 0:
+                    prob_hosts = (ways_hit_pg * ways_miss_pg) / total_host_combos
+                else:
+                    prob_hosts = 0
+
+                # 3. Probability that we hit the specific OSDs on those 'x' hosts
+                # Since we assume 1 failure per host (eligibility), simply (1/n)^x
+                prob_osds = (1 / n) ** x
+
+                p_kill_pg += prob_hosts * prob_osds
+
+        # Aggregate Risk for this bucket
+        if p_kill_pg >= 1.0:
+            prob_survive_bucket = math.exp(-num_pgs_in_state)
+        elif p_kill_pg <= 0.0:
+            prob_survive_bucket = 1.0
+        else:
+            # High precision aggregation
+            prob_survive_bucket = math.exp(num_pgs_in_state * math.log1p(-p_kill_pg))
+
+        vprint(f"    Bucket [Overlap={k}]: {prob_pg_state_k*100:.1f}% of PGs. Kill Prob: {p_kill_pg:.4g}")
+        prob_pool_survives *= prob_survive_bucket
+
+    # Final Risk Calculation
+    risk_raw = 1.0 - prob_pool_survives
+
+    # Handle the Dead-On-Arrival edge case (where eligibility shouldn't reduce risk)
+    prob_survive_doa = 1.0
+    for k in range(r + 1):
+        if k > H_dead: continue
+        needed = max(0, f_total_threshold - k)
+        if needed <= 0:
+            try:
+                p_state = (comb(H_dead, k) * comb(H_alive, r - k)) / comb(H, r)
+                prob_survive_doa *= math.exp(- (num_pgs * p_state))
+            except ValueError: continue
+
+    risk_doa = 1.0 - prob_survive_doa
+    total_risk = max(risk_doa, eligibility * risk_raw)
+
+    return min(1.0, max(0.0, total_risk))
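+# Sanity check for the bucket weights above (arbitrary numbers): with H=20
+# hosts, H_dead=1 and r=9, the chance a given PG overlaps the dead host is
+# C(1,1)*C(19,8)/C(20,9) = 75582/167960 = 0.45, which matches the intuitive
+# expected overlap r/H = 9/20.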
 
 # ---------------------------- Simulate Subcommand ----------------------------
 
-def simulate(args):
+def simulate(args, domains=None):
     # Set random seed for reproducibility
     random.seed(args.seed)
@@ -311,26 +436,25 @@
     raw_to_stored_factor = 0
     profile = ""
-    # Derive PG size and min_size
+    # Derive PG size
     if args.k is not None and args.m is not None:
         pg_size = args.k + args.m
-        min_size = args.k
         raw_to_stored_factor = args.k / (args.k + args.m)
         profile = f"{args.k}+{args.m} erasure"
-        num_failures = args.m + 1
+        num_failures_for_data_loss = args.m + 1
     elif args.size is not None:
         pg_size = args.size
-        min_size = 1
         raw_to_stored_factor = 1 / (args.size)
         profile = f"{args.size}x replicated"
-        num_failures = args.size
+        num_failures_for_data_loss = args.size
     else:
         raise ValueError("Specify either --size or both --k and --m.")
 
-    if not hasattr(args, 'pgs') and args.domains_down > 0:
-        num_failures -= args.domains_down
+    num_failures_for_data_loss = max(0, num_failures_for_data_loss)
 
-    num_failures = max(0, num_failures)
+    # how many (additional) OSD failures would cause data loss?
+    # adjust for pre-failed domains
+    num_osds_to_fail = max(0, num_failures_for_data_loss - args.domains_down)
 
     num_simulations = args.simulations
     if num_simulations < 100:
@@ -415,71 +539,25 @@
     print(f"  Data per PG: {format_bytes(bytes_per_pg)}, {format_int(objects_per_pg)} objects")
     vprint(f"  Example PG: {pgs[0]}\n")
 
-    vprint(f"---------------------------")
-    vprint()
-    vprint(f"Theoretical loss rate calculation:")
-    H = num_domains        # num hosts
-    f = num_failures       # normally m+1
-    n = osds_per_domain    # osds per host
-    r = pg_size            # normally k + m
-    N = num_osds           # total osds
-
-    vprint("  PG loss rate calculation:")
-    vprint(f"    H (failure domains): {H}")
-    vprint(f"    f (failures to cause data loss): {f}")
-    vprint(f"    n (OSDs per failure domain): {n}")
-    vprint(f"    r (PG size): {r}")
-    vprint(f"    N (total OSDs): {N}")
-    vprint(f"    Number of PGs: {num_pgs}")
-
-    # Assume we have f concurrent failures. They will cause data loss if:
-    #   - they land on f different failure domains
-    #   - per-PG probability that those domains are hit
-    #   - at least one PG is lost
-
-    # 1. Eligibility: Do the f failures land on f different domains?
-    elig = comb(H, f) * (n ** f) / comb(N, f)
-    vprint("  1. Eligibility (Do the f failures land on f different domains): C(H, f) * n^f / C(N, f)")
-    vprint(f"     C({H}, {f}) * {n}^{f} / C({N}, {f}) = {comb(H, f)} * {n**f} / {comb(N, f)} = {elig:.3g}")
-
-    # 2. Per-PG hit probability
-    # PG picks r domains uniformly: probability it includes those f failed domains is
-    p_hosts_in_pg = comb(H - f, r - f) / comb(H, r)
-    vprint("  2. Per-PG hit probability: C(H - f, r - f) / C(H, r)")
-    vprint(f"     C({H} - {f}, {r} - {f}) / C({H}, {r}) = {comb(H - f, r - f)} / {comb(H, r)} = {p_hosts_in_pg:.3g}")
-
-    # 3. Within those f domains, PG must pick the failed OSD in each:
-    p_osds_match = (1 / n) ** f
-    vprint("  3. PG picks failed OSDs within those domains: (1 / n)^f")
-    vprint(f"     (1 / {n})^{f} = {p_osds_match:.3g}")
-
-    # 4. Combine to get per-PG loss probability
-    p_pg_loss = p_hosts_in_pg * p_osds_match
-    vprint("  4. Per-PG loss probability: #2 * #3")
-    vprint(f"     {p_pg_loss:.3g}")
-
-    # 5. Probability that at least one PG is lost
-    p_at_least_one_lost = 1 - (1 - p_pg_loss) ** num_pgs
-    vprint("  5. Probability that at least one PG is lost: 1 - (1 - #4)^num_pgs")
-    vprint(f"     1 - (1 - {p_pg_loss:.3g})^{num_pgs} = {p_at_least_one_lost:.3g}")
-
-    loss_rate = elig * p_at_least_one_lost
-    vprint("  6. Overall data loss probability: #1 * #5")
-
-    assert 0 <= loss_rate <= 1, "Calculated loss rate is out of bounds!"
+    # Compute theoretical loss rate
+    # i.e. how often do random failures cause data loss
+    calculated_loss_rate = loss_rate = calculate_pg_loss_rate(
+        num_domains,
+        osds_per_domain,
+        pg_size,
+        num_pgs,
+        num_failures_for_data_loss,
+        num_osds,
+        args.domains_down
+    )
 
-    vprint()
-    vprint(f"  Finally, P. ≥1 PG lost: {loss_rate * 100:.3g}%")
-    print(f"  Calculated PG Loss Rate: {loss_rate * 100:.3g}%")
+    print(f"  Calculated PG Loss Rate (with {args.domains_down} pre-failed domains): {loss_rate * 100:.4g}%")
 
-    vprint()
-    vprint(f"---------------------------")
-    vprint()
     # simulate the major incidents
-    vprint(f"Monte Carlo Sim: n={num_simulations} {num_failures}x failures")
-    threshold = pg_size - min_size
+    vprint(f"Monte Carlo Sim: n={num_simulations} {num_failures_for_data_loss}x failures")
+
     num_batches = 50
     batch_size = num_simulations // num_batches
     if args.skip_monte_carlo:
@@ -507,17 +585,25 @@
             spin_time = time.time()
         simulations_done += 1
         failed_osds = set()
+
         if args.domains_down > 0:
             # pre-fail args.domains_down * osds_per_domain OSDs
-            failed_osds = set([osds[i] for i in range(args.domains_down * osds_per_domain)])
+            failed_osds = set()
+            failed_domains = random.sample(range(num_domains), args.domains_down)
+            for domain in failed_domains:
+                failed_osds.update(domains[domain])
 
-        failed_osds.update(random.sample(list(set(osds) - failed_osds), num_failures))
+            dprint(f"Pre-failed OSDs: {failed_osds}")
+        failed_osds.update(random.sample(list(set(osds) - failed_osds), num_osds_to_fail))
+        dprint(f"Failed OSDs: {failed_osds}")
+
+        dead_pgs = 0
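+        # A PG dies once at least num_failures_for_data_loss of its shards
+        # sit on failed OSDs: m+1 shards for a k+m EC pool, or all `size`
+        # replicas for a replicated pool.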
+        for pg in pgs:
+            if sum(1 for osd in pg if osd in failed_osds) >= num_failures_for_data_loss:
+                dead_pgs += 1
+                dprint(f"PG {pg} lost due to failed OSDs {failed_osds}")
 
-        assert len(failed_osds) == num_failures + args.domains_down * (osds_per_domain), f"Failed OSD count mismatch! {len(failed_osds)} != {num_failures + args.domains_down * osds_per_domain}"
-        dead_pgs = sum(
-            1 for pg in pgs if sum(1 for osd in pg if osd in failed_osds) > threshold
-        )
         total_pg_losses += dead_pgs
         if dead_pgs > 0:
             simulations_with_loss += 1
@@ -538,23 +624,28 @@
     vprint(f"  ╰{'─' * num_batches}╯")
     vprint()
 
-    if simulations_with_loss > 0:
+    if args.skip_monte_carlo or simulations_with_loss < 1:
+        # use theoretical estimate
+        total_pg_losses = simulations_with_loss = calculated_loss_rate * num_simulations
+        confidence = (loss_rate, loss_rate)
+        plus_minus = 0.0
+        vprint(f"  Monte Carlo skipped or no PGs lost; using theoretical estimate.")
+    else:
+        # use simulated result
         loss_rate = simulations_with_loss / num_simulations
+        confidence = confidence_interval(simulations_with_loss, num_simulations)
+        plus_minus = confidence[1] - loss_rate
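+        # confidence_interval() is a Wald 95% interval:
+        #   p_hat ± 1.96 * sqrt(p_hat*(1-p_hat)/n)
+        # e.g. (arbitrary numbers) 12 loss events in 10,000 trials gives
+        # 0.12% ± 0.068%.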
         vprint(f"  Observed Incidents with ≥1 PG lost: {simulations_with_loss}/{num_simulations} ({100 * loss_rate:.3f}%)")
-        print(f"  Monte Carlo PG Loss Rate: {loss_rate * 100:.3g}%")
-    elif args.skip_monte_carlo:
-        vprint(f"  Skipped Monte Carlo simulation as per user request. Using theoretical estimate.")
-        print(f"  Monte Carlo PG Loss Rate: - skipped. using theoretical estimate.")
-        total_pg_losses = simulations_with_loss = loss_rate * num_simulations  # use theoretical estimate
-    else:
-#        vprint(f"  Observed Incidents with ≥1 PG lost: 0/{num_simulations} (0%)")
-        vprint("  !! No data loss observed in simulations, using theoretical estimate.")
-        print(f"  Monte Carlo PG Loss Rate: - none found. using theoretical estimate.")
-        total_pg_losses = simulations_with_loss = loss_rate * num_simulations  # use theoretical estimate
-
-    confidence = confidence_interval(simulations_with_loss, num_simulations)
-    vprint(f"  95% Confidence Interval: {confidence[0] * 100:.3g}% - {confidence[1] * 100:.3g}%")
-
+    print(f"  Monte Carlo Data Loss Rate: {loss_rate * 100:.3g}% ± {plus_minus * 100:.3g}%")
+
+    # warn if theoretical and simulated differ significantly
+    if calculated_loss_rate < confidence[0] or calculated_loss_rate > confidence[1]:
+        vprint()
+        vprint("  Warning: Theoretical and simulated loss rates differ significantly!")
+        vprint(f"  Theoretical: {calculated_loss_rate * 100:.3g}%, Simulated: {100 * loss_rate:.3g}% ± {plus_minus * 100:.3g}%")
+        vprint()
+
     pgs_lost_per_event = total_pg_losses / num_simulations
     pct_data_loss = total_pg_losses / (num_simulations * num_pgs)
     vprint(f"  Expected Data Loss Per Incident: {pgs_lost_per_event:.3g} PGs, {format_bytes(pgs_lost_per_event*bytes_per_pg)} ({pct_data_loss * 100:.3g}%)")
@@ -564,7 +655,7 @@
     vprint("---------------------------")
     vprint("")
 
-    vprint(f"Estimating the probability of {num_failures}x failures")
+    vprint(f"Estimating the probability of {num_osds_to_fail}x failures")
     afr = args.afr
     if pg_size > 1:
         vprint(f"  Estimated per-drive AFR: {afr}")
@@ -590,34 +681,35 @@
         # Estimate max OSD load during recovery when things are critical
         # e.g. for 3x repl: how long to recover 2 OSDs failed.
        # e.g. for 6+3 EC: how long to recover 3 OSDs failed.
+        # TODO: check this part
         if args.k is not None:
             critical_osds = args.m
         else:
             critical_osds = args.size - 1
-        num_osds_left = num_osds - num_failures + 1
+        num_osds_left = num_osds - args.domains_down*osds_per_domain - num_osds_to_fail + 1
         # How many PGs need to be recovered
         max_osd_load = expected_max_osd_load(num_osds_left, critical_osds*pgs_per_osd)
-        vprint(f"  Expected most loaded OSD when recovering {format_int((num_failures-1)*pgs_per_osd)} PGs to {num_osds-num_failures+1} OSDs = {max_osd_load} PGs")
+        vprint(f"  Expected most loaded OSD when recovering {format_int(critical_osds*pgs_per_osd)} PGs to {num_osds_left} OSDs = {max_osd_load} PGs")
         recovery_time = max(max_osd_load * recovery_time_per_pg, 60) # minimum 1 minutes per PG
         vprint(f"  Expected critical period ({max_osd_load} PGs * {format_time(recovery_time_per_pg)}) = {format_time(recovery_time)}")
         critical_windows_per_year = 31536000 / recovery_time
-        vprint(f"  Expected critical periods annually (1yr / {format_time(recovery_time)}) = {critical_windows_per_year:.1f}")
+        vprint(f"  Expected critical periods annually (1 yr / {format_time(recovery_time)}) = {critical_windows_per_year:.1f}")
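+        # e.g. (arbitrary numbers) max_osd_load=12 PGs at 600 s per PG gives
+        # a 7200 s critical window, 31,536,000 / 7200 = 4380 windows per year,
+        # and a per-OSD in-window failure probability of 0.02 * 7200/31536000
+        # ~= 4.6e-06 at a 2% AFR.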
         failure_prob_per_disk = recovery_time / 31536000 * afr
         vprint(f"  Expected per-OSD critical period failure rate = {failure_prob_per_disk:.2g} (1 in {format_int(1/failure_prob_per_disk)})")
         vprint(f"  Failure correlation factor = {args.fudge}")
-        failure_prob = args.fudge * prob_k_or_more_failures(num_osds, failure_prob_per_disk, num_failures)
-        vprint(f"  Probability of {num_failures}+ failures in any critical period ({format_time(recovery_time)}) = 1 in {format_int(1/failure_prob)}")
+        failure_prob = args.fudge * prob_k_or_more_failures(num_osds, failure_prob_per_disk, num_osds_to_fail)
+        vprint(f"  Probability of {num_osds_to_fail}+ failures in any critical period ({format_time(recovery_time)}) = 1 in {format_int(1/failure_prob)}")
         expected_annual_events = critical_windows_per_year * failure_prob
-        vprint(f"  Expected annual {num_failures}+ failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
-        mtt_multi_failure = mttdl(num_osds, num_failures, afr, args.fudge, recovery_time)
-        vprint(f"  Alternate calculation (mean time to {num_failures}+ concurrent failures) = {format_time(mtt_multi_failure)}")
+        vprint(f"  Expected annual {num_osds_to_fail}+ failure events: {expected_annual_events:.3g} (1 every {format_time(31536000/expected_annual_events)})")
+        mtt_multi_failure = mttdl(num_osds, num_failures_for_data_loss, afr, args.fudge, recovery_time)
+        vprint(f"  Alternate calculation (mean time to {num_osds_to_fail}+ concurrent failures) = {format_time(mtt_multi_failure)}")
     else:
         # size == 1 (no redundancy)
         vprint("  No redundancy configured, data loss is certain on any failure.")
         expected_annual_events = num_osds * afr
@@ -637,8 +729,8 @@
     inv_max_loss_rate = 1/max_loss_rate if max_loss_rate > 0 else float('inf')
 
     vprint(f"MTTDL/Durability Estimates:")
-    vprint(f"  Mean time to {num_failures}x failure: {format_time(mtt_multi_failure)}")
-    vprint(f"  Probability that {num_failures}x failures causes data loss: {loss_rate*100:.3g}% (1 in {format_int(inv_loss_rate)})")
+    vprint(f"  Mean time to {num_osds_to_fail}x failure: {format_time(mtt_multi_failure)}")
+    vprint(f"  Probability that {num_osds_to_fail}x failures causes data loss: {loss_rate*100:.3g}% (1 in {format_int(inv_loss_rate)})")
     vprint(f"  95% CI: {min_loss_rate*100:.3g}%-{max_loss_rate*100:.3g}% (1 in {format_int(inv_max_loss_rate)} to 1 in {format_int(inv_min_loss_rate)})")
     vprint("")
@@ -774,7 +866,7 @@
     vprint(f"  Data per OSD: {format_bytes(args.tb_per_osd * 1024**4)} ({format_int(objects_raw / len(osds))} objects)")
     vprint(f"  Average object size: {format_bytes(args.object_size)}")
     vprint("")
-    [mttdl, nines] = simulate(args)
+    [mttdl, nines] = simulate(args, domains=domains)
 
 
 def analyze(args):
@@ -920,6 +1012,31 @@ def run_tests(args):
             ),
             ("15.4 thousand years", "10-nines")
         ],
+        [
+            argparse.Namespace(
+                verbose=False,
+                failure_domain="host",
+                num_domains=20,
+                osds_per_domain=16,
+                tb_per_osd=10.0,
+                object_size=4*1024*1024,
+                pgs_per_osd=100,
+                k=6,
+                m=3,
+                size=None,
+                simulations=1000,
+                seed=42,
+                fudge=100,
+                afr=0.02,
+                recovery_rate=50,
+                osd_recovery_sleep=0.1,
+                osd_recovery_max_active=1,
+                domains_down=1,
+                skip_monte_carlo=False,
+                max_mc_runtime=30
+            ),
+            ("173.4 years", "7-nines")
+        ],
         [
             argparse.Namespace(
                 verbose=False,