diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index 1e6bb06..be2b184 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -36,3 +36,19 @@ jobs: with: environments: py-phylo2vec - run: pixi run -e py-phylo2vec benchmark + notebooks: + name: Test main demo notebook + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v6 + - uses: prefix-dev/setup-pixi@v0.9.3 + with: + environments: py-phylo2vec + - name: Install package and dependencies + run: | + pixi run -e py-phylo2vec install-python + pixi run -e py-phylo2vec pip install nbconvert + - name: Execute demo notebook + run: | + cd docs + pixi run -e py-phylo2vec jupyter nbconvert --to notebook --execute demo.ipynb --output demo_executed.ipynb diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c48681d..04ec4d4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -52,6 +52,13 @@ repos: entry: cargo fmt pass_filenames: false + - repo: https://github.com/kynan/nbstripout + rev: "0.8.1" + hooks: + - id: nbstripout + args: [--keep-output] + files: ^docs/ + - repo: https://github.com/python-jsonschema/check-jsonschema rev: "0.36.0" hooks: diff --git a/docs/demo.ipynb b/docs/demo.ipynb index fe0d237..b247627 100644 --- a/docs/demo.ipynb +++ b/docs/demo.ipynb @@ -11,6 +11,8 @@ "* How to convert Phylo2Vec vectors to Newick format and vice versa\n", "* How to sample random trees with branch lengths (phylograms) as Phylo2Vec matrices\n", "* How to convert these matrices to Newick format and vice versa\n", + "* Tree traversal utilities: common ancestors and node depths\n", + "* Tree comparison metrics: Robinson-Foulds distance\n", "* Other useful operations on Phylo2Vec vectors\n", "\n", "Note that the current version of Phylo2Vec (1.x) relies on a core written in Rust, with bindings to Python and R. This comes with significant speed-ups, allowing manipulation large trees (up to ~100,000 to 1 million leaves). 
To become more familiar with Rust, we recommend this [interactive book](https://rust-book.cs.brown.edu/experiment-intro.html)." @@ -34,7 +36,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -175,7 +177,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -184,7 +186,7 @@ "'((((0,2)9,4)10,(1,3)8)11,(5,6)7)12;'" ] }, - "execution_count": 4, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -214,7 +216,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -223,7 +225,7 @@ "[(1, 6), (4, 5), (2, 3), (0, 1), (0, 4), (0, 2)]" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -245,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -254,7 +256,7 @@ "'(((0,(1,6)7)10,(4,5)8)11,(2,3)9)12;'" ] }, - "execution_count": 8, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -283,7 +285,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -354,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -394,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -420,7 +422,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -444,7 +446,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -475,7 +477,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -501,7 +503,7 @@ }, { "cell_type": 
"code", - "execution_count": 18, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -527,7 +529,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -553,7 +555,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -563,7 +565,7 @@ "" ] }, - "execution_count": 2, + "execution_count": null, "metadata": { "image/png": { "width": 600 @@ -587,7 +589,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -614,7 +616,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -633,7 +635,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -731,30 +733,124 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### 3.2 Optimisation\n", + "Another important tree distance metric is the [Robinson-Foulds (RF) distance](https://en.wikipedia.org/wiki/Robinson%E2%80%93Foulds_metric), which counts the number of bipartitions (splits) that differ between two tree topologies.\n", + "\n", + "Use `robinson_foulds` to compute the RF distance between two trees." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from phylo2vec.stats import robinson_foulds\n", + "\n", + "# Sample two random trees with 10 leaves\n", + "v1 = p2v.sample_vector(10)\n", + "v2 = p2v.sample_vector(10)\n", + "\n", + "print(f\"Tree 1: {repr(v1)}\")\n", + "print(f\"Tree 2: {repr(v2)}\")\n", + "\n", + "# Compute RF distance\n", + "rf_dist = robinson_foulds(v1, v2)\n", + "print(f\"Robinson-Foulds distance: {rf_dist}\")\n", + "\n", + "# Normalized RF distance (range [0, 1])\n", + "rf_norm = robinson_foulds(v1, v2, normalize=True)\n", + "print(f\"Normalized RF distance: {rf_norm:.3f}\")\n", + "\n", + "# RF distance of a tree with itself is always 0\n", + "rf_same = robinson_foulds(v1, v1)\n", + "print(f\"RF distance (same tree): {rf_same}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3.2 Tree traversal: common ancestors and node depths\n", + "\n", + "Phylo2Vec provides functions for tree traversal, such as finding the most recent common ancestor (MRCA) between two nodes and computing node depths.\n", + "\n", + "Use `get_common_ancestor` to find the MRCA between two nodes (similar to ape's `getMRCA` in R or ETE's `get_common_ancestor` in Python), and `get_node_depth` / `get_node_depths` to compute depths.\n", "\n", - "In the Phylo2Vec paper, we showcased a hill-climbing optimisation scheme to demonstrate the potential of phylo2vec for maximum likelihood-based phylogenetic inference.\n", + "- For **vectors**: topological depth is returned (all branch lengths = 1)\n", + "- For **matrices**: actual branch lengths are used\n", "\n", - "These optimisation schemes (to be written in ```opt```) are not thoroughly maintained as difficult to test. 
One notable goal is to integrate [GradME](https://github.com/Neclow/GradME) into phylo2vec" + "**Tip:** Use `get_node_depths` (plural) to compute all node depths at once, which is more efficient than calling `get_node_depth` repeatedly." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from phylo2vec.utils.vector import get_common_ancestor, get_node_depth, get_node_depths\n", + "\n", + "# Using v_fixed from earlier: [0, 2, 2, 5, 4, 1]\n", + "# Tree structure (7 leaves, so internal nodes are 7-12, root is 12):\n", + "# ╭─┬╴0\n", + "# ╭─┤ ╰─┬╴1\n", + "# │ │ ╰╴6\n", + "# ─┤ ╰─┬╴4\n", + "# │ ╰╴5\n", + "# ╰─┬╴2\n", + "# ╰╴3\n", + "\n", + "# Find MRCA of leaves 1 and 6 (they form a cherry, so MRCA is node 7)\n", + "mrca_1_6 = get_common_ancestor(v_fixed, 1, 6)\n", + "print(f\"MRCA of leaves 1 and 6: node {mrca_1_6}\")\n", + "assert mrca_1_6 == 7 # node 7 is the parent of leaves 1 and 6\n", + "\n", + "# Get the depth of this MRCA (topological, since v_fixed is a vector)\n", + "mrca_depth = get_node_depth(v_fixed, mrca_1_6)\n", + "print(f\"Depth of MRCA (node {mrca_1_6}): {mrca_depth}\")\n", + "assert mrca_depth == 3 # root(12) → 11 → 10 → 7, so depth is 3\n", + "\n", + "# Find MRCA of leaves 2 and 5 (MRCA is root, node 12)\n", + "mrca_2_5 = get_common_ancestor(v_fixed, 2, 5)\n", + "mrca_2_5_depth = get_node_depth(v_fixed, mrca_2_5)\n", + "print(f\"MRCA of leaves 2 and 5: node {mrca_2_5}, depth: {mrca_2_5_depth}\")\n", + "assert mrca_2_5 == 12 # root node\n", + "assert mrca_2_5_depth == 0 # root has depth 0\n", + "\n", + "# Get all node depths at once\n", + "all_depths = get_node_depths(v_fixed)\n", + "print(f\"All node depths: {all_depths}\")\n", + "assert all_depths[12] == 0 # root depth is 0" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### 3.3 Other utility functions" + "### 3.3 Optimisation\n", + "\n", + "In the Phylo2Vec paper, we showcased a hill-climbing optimisation scheme to 
demonstrate the potential of phylo2vec for maximum likelihood-based phylogenetic inference. We also contributed to [GradME](https://github.com/Neclow/GradME), a continuous extension of phylo2vec for gradient-based minimum evolution.\n", + "\n", + "These optimisation schemes (written in `phylo2vec.opt`) are demonstrated in the [demo_opt notebook](demo_opt.ipynb)." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.3.1 Finding the number of leaves in a Newick" + "### 3.4 Other utility functions" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### 3.4.1 Finding the number of leaves in a Newick" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -767,7 +863,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.3.2 Removing and adding a leaf in a tree\n", + "#### 3.4.2 Removing and adding a leaf in a tree\n", "\n", "One might want to prune or add nodes in an existing tree (a common example is the subtree-prune-and-regraft operation).\n", "\n", @@ -776,7 +872,7 @@ }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -790,7 +886,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -833,7 +929,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -842,7 +938,7 @@ "True" ] }, - "execution_count": 36, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -863,14 +959,14 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "#### 3.3.3 Applying and create an integer mapping from a Newick string\n", + "#### 3.4.3 Creating and applying an integer mapping from a Newick string\n", "\n", "* Newick strings usually do not contain integers but real-life taxa (e.g., animal species, languages...). 
So it is important to provide another layer of conversion, where we can take in a Newick with string taxa, and convert it to a Newick with integer taxa, with a unique integer → taxon mapping." ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -903,7 +999,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -942,7 +1038,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -951,7 +1047,7 @@ "True" ] }, - "execution_count": 6, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -984,7 +1080,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -997,10 +1093,10 @@ } ], "source": [ - "from glob import glob\n", - "\n", "import tempfile\n", "\n", + "from glob import glob\n", + "\n", "from phylo2vec.io._validation import FILE_EXTENSIONS\n", "\n", "\n", diff --git a/docs/demo_opt.ipynb b/docs/demo_opt.ipynb index 5467aa0..360c148 100644 --- a/docs/demo_opt.ipynb +++ b/docs/demo_opt.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "2ee97f3a", + "id": "0", "metadata": {}, "source": [ "# Phylogenetic tree inference using Phylo2Vec\n", @@ -14,7 +14,7 @@ }, { "cell_type": "markdown", - "id": "a9382e6e", + "id": "1", "metadata": {}, "source": [ "## 0. 
Imports & data" @@ -22,8 +22,8 @@ }, { "cell_type": "code", - "execution_count": 1, - "id": "a6126425", + "execution_count": null, + "id": "2", "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "markdown", - "id": "00d58991", + "id": "3", "metadata": {}, "source": [ "All available optimisation schemes can be found using `list_models`" @@ -54,8 +54,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "id": "8f92bcd5", + "execution_count": null, + "id": "4", "metadata": {}, "outputs": [ { @@ -64,7 +64,7 @@ "['GradME', 'HillClimbing']" ] }, - "execution_count": 2, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -77,7 +77,7 @@ }, { "cell_type": "markdown", - "id": "7247f975", + "id": "5", "metadata": {}, "source": [ "\n", @@ -86,8 +86,8 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "f36acbfa", + "execution_count": null, + "id": "6", "metadata": {}, "outputs": [ { @@ -97,7 +97,7 @@ " )" ] }, - "execution_count": 3, + "execution_count": null, "metadata": {}, "output_type": "execute_result" } @@ -114,7 +114,7 @@ }, { "cell_type": "markdown", - "id": "60667e96", + "id": "7", "metadata": {}, "source": [ "## 1. Hill-climbing\n", @@ -126,8 +126,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "c6460eae", + "execution_count": null, + "id": "8", "metadata": {}, "outputs": [ { @@ -242,7 +242,7 @@ }, { "cell_type": "markdown", - "id": "a9e3e499", + "id": "9", "metadata": {}, "source": [ "`hc_result` contains several objects:\n", @@ -254,8 +254,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "657dc4e3", + "execution_count": null, + "id": "10", "metadata": {}, "outputs": [ { @@ -279,8 +279,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "94412310", + "execution_count": null, + "id": "11", "metadata": {}, "outputs": [ { @@ -319,7 +319,7 @@ }, { "cell_type": "markdown", - "id": "c7de1d99", + "id": "12", "metadata": {}, "source": [ "## 2. 
GradME\n", @@ -339,8 +339,8 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": "8aeab120", + "execution_count": null, + "id": "13", "metadata": {}, "outputs": [ { @@ -372,7 +372,7 @@ }, { "cell_type": "markdown", - "id": "039922d6", + "id": "14", "metadata": {}, "source": [ "`gradme_result` contains several objects:\n", @@ -385,8 +385,8 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "256caad2", + "execution_count": null, + "id": "15", "metadata": {}, "outputs": [ { @@ -410,8 +410,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "28072223", + "execution_count": null, + "id": "16", "metadata": {}, "outputs": [ { @@ -450,8 +450,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "74fc53c7", + "execution_count": null, + "id": "17", "metadata": {}, "outputs": [ { @@ -460,7 +460,7 @@ "" ] }, - "execution_count": 7, + "execution_count": null, "metadata": {}, "output_type": "execute_result" },