Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
.venv
.vscode
41 changes: 41 additions & 0 deletions figure_one/archive/draw-figure.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import msprime
import numpy as np
import matplotlib.pyplot as plt
import concurrent.futures

# Parameters
sample_size = 10
sequence_length = 1e6 # total length of genome
recombination_rate = 1e-2
num_replicates = 100

def simulate_tmrcas(model):
ts = msprime.sim_ancestry(
samples=sample_size,
sequence_length=sequence_length,
recombination_rate=recombination_rate,
model=model,
)
# Return the TMRCA for all trees in this replicate
return [tree.time(tree.root) for tree in ts.trees()]

def parallel_tmrcas(model):
tmrcas = []

with concurrent.futures.ProcessPoolExecutor() as executor:
results = executor.map(simulate_tmrcas, [model] * num_replicates)
for tmrcas_per_replicate in results:
tmrcas.extend(tmrcas_per_replicate)
return tmrcas

# Standard model
tmrcas_standard = parallel_tmrcas(msprime.StandardCoalescent()) # standard coalescent
print(f"Standard model finished")
# SMC' model
tmrcas_smck = parallel_tmrcas(msprime.SmcKApproxCoalescent())

# Box plot
plt.boxplot([tmrcas_standard, tmrcas_smck], labels=["Standard", "SMC'"], showfliers=False)
plt.ylabel("TMRCA")
plt.title("Distribution of Coalescence Times")
plt.show()
82 changes: 82 additions & 0 deletions figure_one/archive/generate-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import msprime
import numpy as np

# Parameters
sample_size = 10
sequence_length = 1e5 # total length of genome
recombination_rate = 1e-3
mutation_rate = 0 # not needed here
distance = 1000 # distance between sites (in bp)
num_replicates = 1000 # to average over simulations

def average_coalescence_time_at_distance_d(ts, d):
times = []
positions = np.arange(0, ts.sequence_length - d, d)
for pos in positions:
left_ts = ts.at(pos)
right_ts = ts.at(pos + d)

for tree in left_ts.trees():
if tree in right_ts:
print(f"Tree {tree} is in both left and right trees")

# Ensure both positions are covered by trees
if left_tree is None or right_tree is None:
continue

# Pick pairs of samples
for i in range(0, ts.num_samples, 2):
if i + 1 >= ts.num_samples:
break
n1, n2 = i, i + 1

# Get TMRCA at the two positions
t1 = left_tree.tmrca(n1, n2)
t2 = right_tree.tmrca(n1, n2)

# Average TMRCA for the pair of positions
times.append((t1 + t2) / 2)

return np.mean(times) if times else np.nan

def _average_coalescence_time_at_distance_d(ts, d):
times = []
positions = np.arange(0, ts.sequence_length - d, d)
for pos in positions:
left_tree = ts.at(pos)
right_tree = ts.at(pos + d)

# Ensure both positions are covered by trees
if left_tree is None or right_tree is None:
continue

# Pick pairs of samples
for i in range(0, ts.num_samples, 2):
if i + 1 >= ts.num_samples:
break
n1, n2 = i, i + 1

# Get TMRCA at the two positions
t1 = left_tree.tmrca(n1, n2)
t2 = right_tree.tmrca(n1, n2)

# Average TMRCA for the pair of positions
times.append((t1 + t2) / 2)

return np.mean(times) if times else np.nan

# Run simulation
avg_tmrcas = []
for _ in range(num_replicates):
ts = msprime.sim_ancestry(
samples=sample_size,
sequence_length=sequence_length,
recombination_rate=recombination_rate,
random_seed=None
)
avg_tmrcas.append(average_coalescence_time_at_distance_d(ts, distance))

# Filter out None values
avg_tmrcas = [tmrca for tmrca in avg_tmrcas if tmrca is not None]
overall_average = np.nanmean(avg_tmrcas)
print(f"Average coalescence time for site pairs {distance} bp apart: {overall_average}")
125 changes: 125 additions & 0 deletions figure_one/extant_lineages_per_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
'''
blue two panels never used
'''

import msprime
import concurrent
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import ast
import warnings
warnings.filterwarnings("ignore")

sample_size = 2
r = 1e-10
Ne = 1e6
L = 1e6
max_workers=14
max_time = 32497565 * 1.5
filename = f'lineages_per_generation_{sample_size}samples_segments.csv'
replicates = 1
rs = [1e-10]
end_time = 100000

def csv(x):
return ",".join(map(str, x)) + "\n"

def save(name):
plt.tight_layout()
#plt.savefig(f"figures/{name}.png")
plt.savefig(f"figures/{name}.pdf")

def get_lineages_per_generation(params):
model = params[0]; _sample_size = params[1]; _r = params[2]
if model=='smc_k': inner_model=msprime.SmcKApproxCoalescent()
else: inner_model=model

if type(model) == int: inner_model= msprime.SmcKApproxCoalescent(hull_offset=model)
ts = msprime.sim_ancestry(
samples=_sample_size,
recombination_rate=_r,
population_size=Ne,
sequence_length=L,
model=inner_model,
#additional_nodes=(msprime.NodeType.COMMON_ANCESTOR),
coalescing_segments_only=False,
#stop_at_local_mrca=False,
end_time=end_time


)
lineages = np.zeros(end_time + 1, dtype=int)
for tree in ts.trees():
for i in range(lineages.size):
lineages[i] += tree.num_lineages(i)
to_print = lambda x: f'\"{str(list(x))}\"'

return [Ne, L, _r, _sample_size, str(model), to_print(lineages)]

def generate_data():

models = ['smc_k', 'Hudson']
tasks = []

for _r in rs:
for model in models:
for replicate in range(replicates):
tasks.append((model, sample_size, _r))

with open(filename, "w") as f:
f.write(csv(['N', 'L', 'r','num_samples', 'model', 'lineages']))

with concurrent.futures.ProcessPoolExecutor(max_workers=max_workers) as executor:
# Submit all tasks and get futures
future_results = {executor.submit(get_lineages_per_generation, params): params for params in tasks}

# Process results as they complete
for future in concurrent.futures.as_completed(future_results):
try:
result = future.result()
f.write(csv(result))
f.flush()
except Exception as exc:
params = future_results[future]
print(f"Task {params} generated an exception: {exc}")

def plot_lineages_by_model(infile=filename):
# Read CSV
df = pd.read_csv(infile)

# Models we care about
models = ["Hudson", "smc_k"]

# Parse list-like strings into numeric arrays
df["lineages"] = df["lineages"].apply(lambda x: np.array(ast.literal_eval(x), dtype=int))

# Create figure: 1 row × 2 columns
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
fig.suptitle("Lineages along the genome", fontsize=14)

for ax, model in zip(axes, models):
sub = df[df["model"] == model]
if len(sub) == 0:
continue

# Assuming the positions are equally spaced along the sequence
# Create an x-axis: divide the genome into len(lineages) segments
L = sub.iloc[0]["L"]
lineage_array = sub.iloc[0]["lineages"]
positions = np.linspace(1, len(lineage_array), len(lineage_array))

ax.plot(positions, lineage_array, marker="o", markersize=3, lw=1)
ax.set_title(model)
ax.set_xlabel("generation")
ax.grid(True, linestyle="--", alpha=0.5)

axes[0].set_ylabel("Number of lineages")

plt.tight_layout(rect=[0, 0, 1, 0.96])
plt.show()
if __name__ == "__main__":
generate_data()
plot_lineages_by_model()
#plot_ratio_bars('avg_adj_hap_len')
Loading