LohseLab · hossam26644 · May 16, 2025 · May 20, 2025 · May 20, 2025 · May 20, 2025
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+.venv
+.vscode
diff --git a/figure_one/archive/draw-figure.py b/figure_one/archive/draw-figure.py
@@ -0,0 +1,41 @@
+import msprime
+import numpy as np
+import matplotlib.pyplot as plt
+import concurrent.futures
+
+# Parameters
+sample_size = 10
+sequence_length = 1e6  # total length of genome
+recombination_rate = 1e-2
+num_replicates = 100
+
+def simulate_tmrcas(model):
+    ts = msprime.sim_ancestry(
+        samples=sample_size,
+        sequence_length=sequence_length,
+        recombination_rate=recombination_rate,
+        model=model,
+    )
+    # Return the TMRCA for all trees in this replicate
+    return [tree.time(tree.root) for tree in ts.trees()]
+
+def parallel_tmrcas(model):
+    tmrcas = []
+
+    with concurrent.futures.ProcessPoolExecutor() as executor:
+        results = executor.map(simulate_tmrcas, [model] * num_replicates)
+        for tmrcas_per_replicate in results:
+            tmrcas.extend(tmrcas_per_replicate)
+    return tmrcas
+
+# Standard model
+tmrcas_standard = parallel_tmrcas(msprime.StandardCoalescent())  # standard coalescent
+print(f"Standard model finished")
+# SMC' model
+tmrcas_smck = parallel_tmrcas(msprime.SmcKApproxCoalescent())
+
+# Box plot
+plt.boxplot([tmrcas_standard, tmrcas_smck], labels=["Standard", "SMC'"], showfliers=False)
+plt.ylabel("TMRCA")
+plt.title("Distribution of Coalescence Times")
+plt.show()
diff --git a/figure_one/archive/generate-data.py b/figure_one/archive/generate-data.py
@@ -0,0 +1,82 @@
+import msprime
+import numpy as np
+
+# Parameters
+sample_size = 10
+sequence_length = 1e5  # total length of genome
+recombination_rate = 1e-3
+mutation_rate = 0  # not needed here
+distance = 1000  # distance between sites (in bp)
+num_replicates = 1000  # to average over simulations
+
+def average_coalescence_time_at_distance_d(ts, d):
+    times = []
+    positions = np.arange(0, ts.sequence_length - d, d)
+    for pos in positions:
+        left_ts = ts.at(pos)
+        right_ts = ts.at(pos + d)
+
+        for tree in left_ts.trees():
+            if tree in right_ts:
+                print(f"Tree {tree} is in both left and right trees")
+
+        # Ensure both positions are covered by trees
+        if left_tree is None or right_tree is None:
+            continue
+
+        # Pick pairs of samples
+        for i in range(0, ts.num_samples, 2):
+            if i + 1 >= ts.num_samples:
+                break
+            n1, n2 = i, i + 1
+
+            # Get TMRCA at the two positions
+            t1 = left_tree.tmrca(n1, n2)
+            t2 = right_tree.tmrca(n1, n2)
+
+            # Average TMRCA for the pair of positions
+            times.append((t1 + t2) / 2)
+
+    return np.mean(times) if times else np.nan
+
+def _average_coalescence_time_at_distance_d(ts, d):
+    times = []
+    positions = np.arange(0, ts.sequence_length - d, d)
+    for pos in positions:
+        left_tree = ts.at(pos)
+        right_tree = ts.at(pos + d)
+
+        # Ensure both positions are covered by trees
+        if left_tree is None or right_tree is None:
+            continue
+
+        # Pick pairs of samples
+        for i in range(0, ts.num_samples, 2):
+            if i + 1 >= ts.num_samples:
+                break
+            n1, n2 = i, i + 1
+
+            # Get TMRCA at the two positions
+            t1 = left_tree.tmrca(n1, n2)
+            t2 = right_tree.tmrca(n1, n2)
+
+            # Average TMRCA for the pair of positions
+            times.append((t1 + t2) / 2)
+
+    return np.mean(times) if times else np.nan
+
+# Run simulation
+avg_tmrcas = []
+for _ in range(num_replicates):
+    ts = msprime.sim_ancestry(
+        samples=sample_size,
+        sequence_length=sequence_length,
+        recombination_rate=recombination_rate,
+        random_seed=None
+    )
+    avg_tmrcas.append(average_coalescence_time_at_distance_d(ts, distance))
+
+# Filter out None values
+avg_tmrcas = [tmrca for tmrca in avg_tmrcas if tmrca is not None]
+overall_average = np.nanmean(avg_tmrcas)
+print(f"Average coalescence time for site pairs {distance} bp apart: {overall_average}")
diff --git a/figure_one/extant_lineages_per_time.py b/figure_one/extant_lineages_per_time.py
@@ -0,0 +1,125 @@
+'''
+blue two panels never used
+'''
+
+import msprime
+import concurrent
+import pandas as pd
+import matplotlib.pyplot as plt
+import time
+import numpy as np
+import ast
+import warnings
+warnings.filterwarnings("ignore")
+
+sample_size = 2
+r = 1e-10
+Ne = 1e6
+L = 1e6
+max_workers=14
+max_time = 32497565 * 1.5
+filename = f'lineages_per_generation_{sample_size}samples_segments.csv'
+replicates  = 1
+rs = [1e-10]
+end_time = 100000
+
+def csv(x):
+    return ",".join(map(str, x)) + "\n"
+
+def save(name):
+    plt.tight_layout()
+    #plt.savefig(f"figures/{name}.png")
+    plt.savefig(f"figures/{name}.pdf")
+
+def get_lineages_per_generation(params):
+    model = params[0]; _sample_size = params[1]; _r = params[2]
+    if model=='smc_k': inner_model=msprime.SmcKApproxCoalescent()
+    else: inner_model=model
+
+    if type(model) == int: inner_model= msprime.SmcKApproxCoalescent(hull_offset=model)
+    ts = msprime.sim_ancestry(
+        samples=_sample_size,
+        recombination_rate=_r,
+        population_size=Ne,
+        sequence_length=L,
+        model=inner_model,
+        #additional_nodes=(msprime.NodeType.COMMON_ANCESTOR),
+        coalescing_segments_only=False,
+        #stop_at_local_mrca=False,
+        end_time=end_time
+
+
+    )
+    lineages = np.zeros(end_time + 1, dtype=int)
+    for tree in ts.trees():
+        for i in range(lineages.size):
+            lineages[i] += tree.num_lineages(i)
+    to_print = lambda x: f'\"{str(list(x))}\"'
+
+    return [Ne, L, _r, _sample_size, str(model), to_print(lineages)]
+
+def generate_data():
+
+    models = ['smc_k', 'Hudson']
+    tasks = []
+
+    for _r in rs:
+        for model in models:
+            for replicate in range(replicates):
+                tasks.append((model, sample_size, _r))
+
+    with open(filename, "w") as f:
+        f.write(csv(['N', 'L', 'r','num_samples', 'model', 'lineages']))
+
+        with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
+            # Submit all tasks and get futures
+            future_results = {executor.submit(get_lineages_per_generation, params): params for params in tasks}
+
+            # Process results as they complete
+            for future in concurrent.futures.as_completed(future_results):
+                try:
+                    result = future.result()
+                    f.write(csv(result))
+                    f.flush()
+                except Exception as exc:
+                    params = future_results[future]
+                    print(f"Task {params} generated an exception: {exc}")
+
+def plot_lineages_by_model(infile=filename):
+    # Read CSV
+    df = pd.read_csv(infile)
+
+    # Models we care about
+    models = ["Hudson", "smc_k"]
+
+    # Parse list-like strings into numeric arrays
+    df["lineages"] = df["lineages"].apply(lambda x: np.array(ast.literal_eval(x), dtype=int))
+
+    # Create figure: 1 row × 2 columns
+    fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
+    fig.suptitle("Lineages along the genome", fontsize=14)
+
+    for ax, model in zip(axes, models):
+        sub = df[df["model"] == model]
+        if len(sub) == 0:
+            continue
+
+        # Assuming the positions are equally spaced along the sequence
+        # Create an x-axis: divide the genome into len(lineages) segments
+        L = sub.iloc[0]["L"]
+        lineage_array = sub.iloc[0]["lineages"]
+        positions = np.linspace(1, len(lineage_array), len(lineage_array))
+
+        ax.plot(positions, lineage_array, marker="o", markersize=3, lw=1)
+        ax.set_title(model)
+        ax.set_xlabel("generation")
+        ax.grid(True, linestyle="--", alpha=0.5)
+
+    axes[0].set_ylabel("Number of lineages")
+
+    plt.tight_layout(rect=[0, 0, 1, 0.96])
+    plt.show()
+if __name__ == "__main__":
+    generate_data()
+    plot_lineages_by_model()
+    #plot_ratio_bars('avg_adj_hap_len')