diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile new file mode 100644 index 0000000..fbd43e3 --- /dev/null +++ b/.devcontainer/Dockerfile @@ -0,0 +1,150 @@ +FROM r-base:latest + +############################### +# System dependencies +############################### +RUN apt-get update && apt-get install -y \ + git=1:2.51.0-1 \ + build-essential=12.12 \ + libcurl4-openssl-dev=8.17.0-2 \ + libssl-dev=3.5.4-1 \ + libxml2-dev=2.15.1+dfsg-0.4 \ + libicu-dev=76.1-4 \ + libxt-dev=1:1.2.1-1.3 \ + curl=8.17.0-2 \ + bash=5.3-1 \ + ca-certificates=20250419 \ + gnupg=2.4.8-4 \ + openssh-client=1:10.2p1-2 \ + ruby-full=1:3.3 \ + zlib1g-dev=1:1.3.dfsg+really1.3.1-1+b1 \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +############################### +# Ensure global R library +############################### +RUN mkdir -p /usr/local/lib/R/site-library && \ + chmod -R 777 /usr/local/lib/R/site-library + +############################### +# Install remotes +############################### +RUN R -q -e "install.packages('remotes', repos='https://cloud.r-project.org')" + +############################### +# Install specific R packages +############################### +RUN R -q -e "\ + remotes::install_version('ps', version = '1.9.1'); \ + remotes::install_version('evaluate', version = '1.0.5'); \ + remotes::install_version('highr', version = '0.11'); \ + remotes::install_version('xfun', version = '0.54'); \ + remotes::install_version('yaml', version = '2.3.10'); \ + remotes::install_version('lazyeval', version = '0.2.2'); \ + remotes::install_version('lifecycle', version = '1.0.4'); \ + remotes::install_version('pkgbuild', version = '1.4.8'); \ + remotes::install_version('R.methodsS3', version = '1.8.2'); \ + remotes::install_version('R.oo', version = '1.27.1'); \ + remotes::install_version('R.utils', version = '2.13.0'); \ + remotes::install_version('processx', version = '3.8.6'); \ + remotes::install_version('backports', version = '1.5.0'); \ + remotes::install_version('cli', version = '3.6.5'); \ + remotes::install_version('digest', version = '0.6.39'); \ + remotes::install_version('glue', version = '1.8.0'); \ + remotes::install_version('knitr', version = '1.50'); \ + remotes::install_version('rex', version = '1.2.1'); \ + remotes::install_version('brew', version = '1.0-10'); \ + remotes::install_version('commonmark', version = '2.0.0'); \ + remotes::install_version('desc', version = '1.4.3'); \ + remotes::install_version('pkgload', version = '1.4.1'); \ + remotes::install_version('purrr', version = '1.2.0'); \ + remotes::install_version('rlang', version = '1.1.6'); \ + remotes::install_version('stringr', version = '1.6.0'); \ + remotes::install_version('withr', version = '3.0.2'); \ + remotes::install_version('cpp11', version = '0.5.2'); \ + remotes::install_version('magrittr', version = '2.0.4'); \ + remotes::install_version('R.cache', version = '0.17.0'); \ + remotes::install_version('rprojroot', version = '2.1.1'); \ + remotes::install_version('vctrs', version = '0.6.5'); \ + remotes::install_version('callr', version = '3.7.6'); \ + remotes::install_version('collections', version = '0.3.9'); \ + remotes::install_version('fs', version = '1.6.6'); \ + remotes::install_version('jsonlite', version = '2.0.0'); \ + remotes::install_version('lintr', version = '3.2.0'); \ + remotes::install_version('R6', version = '2.6.1'); \ + remotes::install_version('roxygen2', version = '7.3.3'); \ + remotes::install_version('stringi', version = '1.8.7'); \ + remotes::install_version('styler', version = '1.11.0'); \ + remotes::install_version('xml2', version = '1.5.0'); \ + remotes::install_version('xmlparsedata', version = '1.0.5'); \ + remotes::install_version('languageserver', version = '0.3.16'); \ + remotes::install_version('checkpoint', version = '0.4.10'); \ + remotes::install_version('fastmap', version = '1.2.0'); \ + remotes::install_version('bit', version = '4.6.0'); \ + remotes::install_version('cachem', version = '1.1.0'); \ + remotes::install_version('bit64', version = '4.6.0-1'); \ + remotes::install_version('blob', version = '1.2.4'); \ + remotes::install_version('DBI', version = '1.2.3'); \ + remotes::install_version('memoise', version = '2.0.1'); \ + remotes::install_version('pkgconfig', version = '2.0.3'); \ + remotes::install_version('plogr', version = '0.2.0'); \ + remotes::install_version('Rcpp', version = '1.1.0'); \ + remotes::install_version('data.table', version = '1.17.8'); \ + remotes::install_version('RSQLite', version = '2.4.4'); \ + remotes::install_version('RcppEigen', version = '0.3.4.0.2'); \ + remotes::install_version('qtl2', version = '0.38'); \ +" + +############################### +# Install GitHub CLI +############################### +RUN mkdir -p /etc/apt/keyrings && \ + curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg | \ + gpg --dearmor -o /etc/apt/keyrings/githubcli-archive-keyring.gpg && \ + chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg && \ + echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" \ + > /etc/apt/sources.list.d/github-cli.list && \ + apt-get update && apt-get install -y gh && \ + rm -rf /var/lib/apt/lists/* + +############################### +# Create docker user +############################### +RUN usermod -d /home/docker docker || true +RUN mkdir -p /home/docker && chown -R docker:docker /home/docker +ENV HOME=/home/docker + +############################### +# Workspace setup +############################### +RUN mkdir -p /workspace && chown -R docker:docker /workspace +WORKDIR /workspace + +############################### +# Add colored Git PS1 +############################### +RUN printf '\n\ +parse_git_branch() {\n\ + branch=$(git branch --show-current 2>/dev/null)\n\ + if [ -n "$branch" ]; then echo -e " (\\e[0;33m$branch\\e[0m)"; fi\n\ +}\n\ +export PS1="\\[\\e[0;32m\\]\\u@\\h\\[\\e[0m\\]->\\[\\e[0;34m\\]\\w\\[\\e[0m\\]\\$(parse_git_branch) $ "' >> /home/docker/.bashrc + +############################### +# Install Jekyll and Bundler +############################### +RUN gem install jekyll bundler + +############################### +# Install Python and Pip +############################### +RUN apt-get update && apt-get install -y \ + python3 \ + python3-pip \ + python3-venv \ + python-is-python3 \ + python3-yaml \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +USER docker + diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json new file mode 100644 index 0000000..e04061f --- /dev/null +++ b/.devcontainer/devcontainer.json @@ -0,0 +1,20 @@ +{ + "name": "R Base Dev Container", + "dockerFile": "Dockerfile", + "workspaceFolder": "/workspace", + "remoteUser": "docker", + "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=cached", + "customizations": { + "vscode": { + "extensions": [ + "reditorsupport.r" + ], + "settings": { + "terminal.integrated.shell.linux": "bash" + } + } + }, + "remoteEnv": { + "SHELL": "/bin/bash" + } +} \ No newline at end of file diff --git a/.github/workflows/deploy-gh-pages.yml b/.github/workflows/deploy-gh-pages.yml new file mode 100644 index 0000000..fdecb96 --- /dev/null +++ b/.github/workflows/deploy-gh-pages.yml @@ -0,0 +1,67 @@ +name: Deploy to GitHub Pages + +on: + workflow_dispatch: + +jobs: + build-and-deploy: + runs-on: ubuntu-latest + permissions: + contents: write + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build Docker image (cached) + uses: docker/build-push-action@v5 + with: + context: . + file: .devcontainer/Dockerfile + tags: jekyll-site:latest + load: true # makes image available for docker run + cache-from: type=gha + cache-to: type=gha,mode=max + + - name: Download required data + run: | + docker run --rm \ + -u root \ + -v ${{ github.workspace }}:/workspace \ + -w /workspace \ + jekyll-site:latest \ + Rscript -e "options(timeout=1900); \ + dir.create('data', showWarnings=FALSE); \ + download.file('https://ndownloader.figshare.com/files/18533342', './data/cc_variants.sqlite'); \ + download.file('https://ndownloader.figshare.com/files/24607961', './data/mouse_genes.sqlite'); \ + download.file('https://ndownloader.figshare.com/files/24607970', './data/mouse_genes_mgi.sqlite'); \ + download.file('ftp://ftp.jax.org/dgatti/qtl2_workshop/qtl2_demo.Rdata', './data/qtl2_demo.Rdata');" + + + - name: Run Jekyll build inside container + run: | + docker run --rm \ + -u root \ + -v ${{ github.workspace }}:/workspace \ + -w /workspace \ + jekyll-site:latest \ + make site + + - name: Remove oversized data files before deploy + run: | + sudo chown -R $USER:$USER _site + rm -f _site/data/cc_multitissue_qtlviewer.Rdata + rm -f _site/data/qtl2_demo.Rdata + rm -f _site/data/cc_variants.sqlite + rm -f _site/data/mouse_genes.sqlite + rm -f _site/data/mouse_genes_mgi.sqlite + + - name: Deploy to GitHub Pages + uses: peaceiris/actions-gh-pages@v3 + with: + github_token: ${{ secrets.GITHUB_TOKEN }} + publish_dir: ./_site + publish_branch: gh-pages + force_orphan: true diff --git a/.gitignore b/.gitignore index 201fd3e..7ec2a34 100644 --- a/.gitignore +++ b/.gitignore @@ -13,4 +13,6 @@ data/mouse_genes_mgi.sqlite fig/.Rhistory files/Gatti_MammGen_2018.pptx scripts/ - +venv +fig/rmd-*.png +_episodes diff --git a/_episodes/01-introduction.md b/Introduction.md similarity index 100% rename from _episodes/01-introduction.md rename to Introduction.md diff --git a/_episodes/.gitkeep b/_episodes/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/_episodes/02-input-file-format.md b/_episodes/02-input-file-format.md deleted file mode 100644 index 79200f8..0000000 --- a/_episodes/02-input-file-format.md +++ /dev/null @@ -1,127 +0,0 @@ ---- -title: "Input File Format" -teaching: 15 -exercises: 30 -questions: -- "How are the data files formatted for qtl2?" -- "Which data files are required for qtl2?" -- "Where can I find sample data for mapping with the qtl2 package?" -objectives: -- To specify which input files are required for qtl2 and how they should be formatted. -- To locate sample data for qtl mapping. -keypoints: -- "QTL mapping data consists of a set of tables of data: marker genotypes, phenotypes, marker maps, etc." -- "These different tables are in separate comma-delimited (CSV) files." -- "In each file, the first column is a set of IDs for the rows, and the first row is a set of IDs for the columns." -- "In addition to primary data, a separate file with control parameters (or metadata) in either [YAML](http://www.yaml.org) or [JSON](http://json.org) format is required." -- "Published and public data already formatted for QTL mapping are available on the web." -- "These data can be used as a model for formatting your own QTL data." -source: Rmd ---- - - - -QTL mapping data consists of a set of tables of data: marker -genotypes, phenotypes, marker maps, etc. These different tables are in different comma-separated value (CSV) files. In each file, the first column is a set of IDs for the rows, and the first row is a set of IDs for the columns. For example, the genotype data file will have individual IDs in the first column, marker names for the rest of the column headers. - -![](../fig/iron-geno-sample.png) - -The sample genotype file above shows two alleles: S and B. These represent the founder strains for an intercross, which are C57BL/6 (BB) and SWR (SS) [(Grant et al. (2006) Hepatology 44:174-185)](https://pubmed.ncbi.nlm.nih.gov/16799992/). The B and S alleles themselves represent the haplotypes inherited from the parental strains C57BL/6 and SWR. - -In the Diversity Outbred (DO) and Collaborative Cross (CC), alleles A to H represent haplotypes of the 8 founder strains. - -![](../fig/cc-founder-alleles.png) - -CC lines have very low heterozygosity throughout their genomes. For most loci, -CC lines will be homozygous for one of the founder strains A-H above, and as -such will have one of only 8 genotypes (*e.g.* AA, BB, CC, ...). In contrast, -DO *mice* (not lines) have high heterozygosity throughout their genomes. Each -locus will have one of 36 possible genotypes (*e.g.* AA, AB, AC, ..., BB, BC, BD,...). - -![](../fig/cc-do-genome-comparison.png) -For the purposes of learning QTL mapping, this lesson begins with the simplest -cases: backcrosses (2 possible genotypes) and intercrosses (3 possible genotypes). -Once we have learned how to use `qtl2` for these simpler cases, we will advance -to the most complex case involving mapping in DO mice. - -R/qtl2 accepts the following files: -1. genotypes -2. phenotypes -3. phenotype covariates (*i.e.* tissue type, time points) -4. genetic map -5. physical map (optional) -6. control file (YAML or JSON format, not CSV) - -We use both a genetic marker map and a physical map (if available). A sample from a genetic map of MIT markers is shown here. - -![](../fig/iron-geno-map-sample.png) - -A physical marker map provides location in bases rather than centiMorgans. - -![](../fig/iron-phys-map-sample.png) - -Numeric phenotypes are separate from the often non-numeric covariates. - -![](../fig/iron-pheno-sample.png) - -Phenotype covariates are [metadata](https://en.wikipedia.org/wiki/Metadata) describing the phenotypes. For example, in the case of a phenotype measured over time, one column in the phenotype covariate data could be the time of measurement. For gene expression data, we would have columns representing chromosome and physical position of genes, as well as gene IDs. - -![](../fig/iron-phenocovar-sample.png) - -In addition to the set of CSV files with the primary data, we need a separate control file with various control parameters -(or metadata), including the names of all of the other data files and the genotype codes used in the genotype data file. The control file is in a specific format using either [YAML](http://www.yaml.org) or -[JSON](http://json.org); these are human-readable text files for -representing relatively complex data. - -![](../fig/iron-control-file-sample.png) - - -A big advantage of this control file scheme is that it greatly -simplifies the function for reading in the data. That function, -`read_cross2()`, has a _single_ argument: the name (with path) of the control file. - -For further details, see the separate [vignette on the input file format](http://kbroman.org/qtl2/assets/vignettes/input_files.html). - -> ## Challenge 1 -> 1. Which data files are required by `qtl2`? -> 2. Which ones are optional? -> 3. How should they be formatted? -> -> > ## Solution to Challenge 1 -> > 1. marker genotypes, phenotypes, genetic map -> > 2. physical map -> > 1. csv; JSON or YAML for control file -> {: .solution} -{: .challenge} - -## Sample data sets - -In this lesson, we'll work with data sets included in the `qtl2` package. You can find out more about the [sample data files](http://kbroman.org/qtl2/pages/sampledata.html) from the R/qtl2 web site. Zipped versions of these datasets are included with the [qtl2geno](https://github.com/rqtl/qtl2geno) package and can be loaded into R using the `read_cross2()` function. -Additional sample data sets, including data on Diversity Outbred (DO) mice, are available at . - -> ## Challenge 2 -> Go to to view additional sample data. -> 1). Find the Recla data and locate the phenotype data file. Open the file by clicking on the file name. What is in the first column? the first row? -> 2). Locate the genotype data file, click on the file name, and view the raw data. What is in the first column? the first row? -> 3). Locate the covariates file and open it by clicking on the file name. What kind of information does this file contain? -> 4). Locate the control file (YAML or JSON format) and open it. What kind of information does this file contain? -> -> > ## Solution to Challenge 2 -> > -> > 1). What is in the first column of the phenotype file? Animal ID. The first row? Phenotype variable names - OF_distance_first4, OF_distance, OF_corner_pct, OF_periphery_pct, ... -> > 2). What is in the first column of the genotype file? marker ID. the first row? Animal ID - 1,4,5,6,7,8,9,10, ... -> 3). Locate the covariates file and open it. What kind of information does this file contain? Animal ID, sex, cohort, group, subgroup, ngen, and coat color. -> 4). Locate the control file (YAML or JSON format) and open it. What kind of information does this file contain? Names of primary data files, genotype and allele codes, cross type, description, and other metadata. -> {: .solution} -{: .challenge} - - -## Preparing your Diversity Outbred (DO) data for qtl2 - -Karl Broman provides detailed instructions for [preparing DO mouse data for R/qtl2](https://kbroman.org/qtl2/pages/prep_do_data.html). - - - - - - diff --git a/_episodes/03-calc-genoprob.md b/_episodes/03-calc-genoprob.md deleted file mode 100644 index f1e785e..0000000 --- a/_episodes/03-calc-genoprob.md +++ /dev/null @@ -1,443 +0,0 @@ ---- -title: "Calculating Genotype Probabilities" -teaching: 30 -exercises: 30 -questions: -- "How do I calculate QTL at positions between genotyped markers?" -- "How do I calculate QTL genotype probabilities?" -- "How do I calculate allele probabilities?" -- "How can I speed up calculations if I have a large data set?" -objectives: -- To explain why the first step in QTL analysis is to calculate genotype probabilities. -- To insert pseudomarkers between genotyped markers. -- To calculate genotype probabilities. -keypoints: -- "The first step in QTL analysis is to calculate genotype probabilities." -- "Insert pseudomarkers to calculate QTL at positions between genotyped markers." -- "Calculate genotype probabilities between genotyped markers with calc_genoprob()." -source: Rmd ---- - - - -The first task in QTL analysis is to calculate conditional genotype probabilities, given the observed marker data, at each putative QTL position. For example, the first step would be to determine the probabilities for genotypes SS and SB at the locus indicated below. - -![adapted from Broman & Sen, 2009](../fig/unknown_genotype.png) - -The `calc_genoprob()` function calculates QTL genotype probabilities, conditional on the available marker data. These are needed for most of the QTL mapping functions. The result is returned as a list of three-dimensional arrays (one per chromosome). Each 3d array of probabilities is arranged as individuals × genotypes × positions. - -![](../fig/threeD_array.png) - -[See this page for a graphical review of data structures in R](http://venus.ifca.unican.es/Rintro/_images/dataStructuresNew.png). - -To find QTL at positions between markers (so called "pseudomarkers"), first insert pseudomarkers into the genetic map with the function `insert_pseudomarkers()`. - -We'll use the -[iron dataset](https://github.com/kbroman/qtl2/tree/gh-pages/assets/sampledata/iron) -from [Grant et al. (2006) Hepatology 44:174-185](https://www.ncbi.nlm.nih.gov/pubmed/16799992) -(an intercross) as an example. In this study spleen and liver iron levels were measured in an F2 cross between mouse strains C57BL/6J and SWR. C57BL/6J mice exhibit low levels of non-heme iron, while SWR mice exhibit high levels. Iron levels between spleen and liver in the F2s were poorly correlated, indicating tissue-specific regulation. Significant QTL were found on chromosomes 2 and 16 for liver, and on chromosomes 8 and 9 in spleen. Candidate genes included transferrin near the chromosome 9 peak, and β2-microglobulin near the chromosome 2 peak. - -We first load the data using the function `system.file`, which finds files located in packages. The iron data are built into the qtl2 package, so we can use the `system.file()` function to load them directly from the package. - - -~~~ -library(qtl2) -iron <- read_cross2(file = system.file("extdata", "iron.zip", package="qtl2") ) -~~~ -{: .r} - -To load your own data from your machine, you would use the file path to your data files. For example, if the file path to your data files is `/Users/myUserName/qtlProject/data`, the command to load your data would look like this: - - -~~~ -myQTLdata <- read_cross2(file = "/Users/myUserName/qtlProject/data/myqtldata.yaml" ) -~~~ -{: .r} - -The YAML file contains all control information for your data, including names of data files, cross type, column specifications for sex and cross information, and more. This can also be in JSON format. Alternatively, all data files can be zipped together for loading. - - -~~~ -myQTLdata <- read_cross2(file = "/Users/myUserName/qtlProject/data/myqtldata.zip" ) -~~~ -{: .r} - -Back to the iron data. Now look at a summary of the cross data and the names of each variable within the data. - - -~~~ -summary(iron) -~~~ -{: .r} - - - -~~~ -Object of class cross2 (crosstype "f2") - -Total individuals 284 -No. genotyped individuals 284 -No. phenotyped individuals 284 -No. with both geno & pheno 284 - -No. phenotypes 2 -No. covariates 2 -No. phenotype covariates 1 - -No. chromosomes 20 -Total markers 66 - -No. markers by chr: - 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X - 3 5 2 2 2 2 7 8 5 2 7 2 2 2 2 5 2 2 2 2 -~~~ -{: .output} - - - -~~~ -names(iron) -~~~ -{: .r} - - - -~~~ - [1] "crosstype" "geno" "gmap" "pmap" "pheno" - [6] "covar" "phenocovar" "is_x_chr" "is_female" "cross_info" -[11] "alleles" -~~~ -{: .output} - -Have a look at the markers listed in the genetic map, `gmap`. Markers are listed by chromosome and described by cM position. View only the markers on the first several chromosomes. - - -~~~ -head(iron$gmap) -~~~ -{: .r} - - - -~~~ -$`1` -D1Mit18 D1Mit80 D1Mit17 - 27.3 51.4 110.4 - -$`2` -D2Mit379 D2Mit75 D2Mit17 D2Mit304 D2Mit48 - 38.3 48.1 56.8 58.0 73.2 - -$`3` -D3Mit22 D3Mit18 - 25.1 54.6 - -$`4` - D4Mit2 D4Mit352 - 10.9 53.6 - -$`5` -D5Mit11 D5Mit30 - 17.5 62.3 - -$`6` -D6Mit104 D6Mit15 - 41.5 66.7 -~~~ -{: .output} - -We then use `insert_pseudomarkers()` to insert pseudomarkers into the -genetic map, which we grab from the `iron` object as `iron$gmap`: - - -~~~ -map <- insert_pseudomarkers(map=iron$gmap, step=1) -~~~ -{: .r} - -Pseudomarkers are inserted at regular intervals, and genotype probabilities will be calculated at each of these intervals. Now have a look at the new object called `map`. View only the first two chromosomes. - - -~~~ -head(map, n=2) -~~~ -{: .r} - - - -~~~ -$`1` - D1Mit18 c1.loc28 c1.loc29 c1.loc30 c1.loc31 c1.loc32 c1.loc33 c1.loc34 - 27.3 28.3 29.3 30.3 31.3 32.3 33.3 34.3 - c1.loc35 c1.loc36 c1.loc37 c1.loc38 c1.loc39 c1.loc40 c1.loc41 c1.loc42 - 35.3 36.3 37.3 38.3 39.3 40.3 41.3 42.3 - c1.loc43 c1.loc44 c1.loc45 c1.loc46 c1.loc47 c1.loc48 c1.loc49 c1.loc50 - 43.3 44.3 45.3 46.3 47.3 48.3 49.3 50.3 - c1.loc51 D1Mit80 c1.loc52 c1.loc53 c1.loc54 c1.loc55 c1.loc56 c1.loc57 - 51.3 51.4 52.3 53.3 54.3 55.3 56.3 57.3 - c1.loc58 c1.loc59 c1.loc60 c1.loc61 c1.loc62 c1.loc63 c1.loc64 c1.loc65 - 58.3 59.3 60.3 61.3 62.3 63.3 64.3 65.3 - c1.loc66 c1.loc67 c1.loc68 c1.loc69 c1.loc70 c1.loc71 c1.loc72 c1.loc73 - 66.3 67.3 68.3 69.3 70.3 71.3 72.3 73.3 - c1.loc74 c1.loc75 c1.loc76 c1.loc77 c1.loc78 c1.loc79 c1.loc80 c1.loc81 - 74.3 75.3 76.3 77.3 78.3 79.3 80.3 81.3 - c1.loc82 c1.loc83 c1.loc84 c1.loc85 c1.loc86 c1.loc87 c1.loc88 c1.loc89 - 82.3 83.3 84.3 85.3 86.3 87.3 88.3 89.3 - c1.loc90 c1.loc91 c1.loc92 c1.loc93 c1.loc94 c1.loc95 c1.loc96 c1.loc97 - 90.3 91.3 92.3 93.3 94.3 95.3 96.3 97.3 - c1.loc98 c1.loc99 c1.loc100 c1.loc101 c1.loc102 c1.loc103 c1.loc104 c1.loc105 - 98.3 99.3 100.3 101.3 102.3 103.3 104.3 105.3 -c1.loc106 c1.loc107 c1.loc108 c1.loc109 c1.loc110 D1Mit17 - 106.3 107.3 108.3 109.3 110.3 110.4 - -$`2` -D2Mit379 c2.loc39 c2.loc40 c2.loc41 c2.loc42 c2.loc43 c2.loc44 c2.loc45 - 38.3 39.3 40.3 41.3 42.3 43.3 44.3 45.3 -c2.loc46 c2.loc47 D2Mit75 c2.loc48 c2.loc49 c2.loc50 c2.loc51 c2.loc52 - 46.3 47.3 48.1 48.3 49.3 50.3 51.3 52.3 -c2.loc53 c2.loc54 c2.loc55 c2.loc56 D2Mit17 c2.loc57 D2Mit304 c2.loc58 - 53.3 54.3 55.3 56.3 56.8 57.3 58.0 58.3 -c2.loc59 c2.loc60 c2.loc61 c2.loc62 c2.loc63 c2.loc64 c2.loc65 c2.loc66 - 59.3 60.3 61.3 62.3 63.3 64.3 65.3 66.3 -c2.loc67 c2.loc68 c2.loc69 c2.loc70 c2.loc71 c2.loc72 D2Mit48 - 67.3 68.3 69.3 70.3 71.3 72.3 73.2 -~~~ -{: .output} - -Notice that pseudomarkers are now spaced at 1 cM intervals from genotyped markers. The argument `step=1` generated pseudomarkers at these intervals. - -Next we use `calc_genoprob()` to calculate the QTL genotype probabilities. - - -~~~ -pr <- calc_genoprob(cross=iron, map=map, error_prob=0.002) -~~~ -{: .r} - -The argument `error_prob` supplies an assumed genotyping error probability of 0.002. If a value for `error_prob` is not supplied, the default probability is 0.0001. - -Recall that the result of `calc_genoprob`, `pr`, is a list of three-dimensional arrays (one per chromosome). - - -~~~ -names(pr) -~~~ -{: .r} - - - -~~~ - [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" "15" -[16] "16" "17" "18" "19" "X" -~~~ -{: .output} - -Each three-dimensional array of probabilities is arranged as individuals × genotypes × positions. Have a look at the names of each of the three dimensions for chromosome 19. - - -~~~ -dimnames(pr$`19`) -~~~ -{: .r} - - - -~~~ -[[1]] - [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" - [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24" - [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36" - [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48" - [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60" - [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72" - [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84" - [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96" - [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108" -[109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120" -[121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132" -[133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144" -[145] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156" -[157] "157" "158" "159" "160" "161" "162" "163" "164" "165" "166" "167" "168" -[169] "169" "170" "171" "172" "173" "174" "175" "176" "177" "178" "179" "180" -[181] "181" "182" "183" "184" "185" "186" "187" "188" "189" "190" "191" "192" -[193] "193" "194" "195" "196" "197" "198" "199" "200" "201" "202" "203" "204" -[205] "205" "206" "207" "208" "209" "210" "211" "212" "213" "214" "215" "216" -[217] "217" "218" "219" "220" "221" "222" "223" "224" "225" "226" "227" "228" -[229] "229" "230" "231" "232" "233" "234" "235" "236" "237" "238" "239" "240" -[241] "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252" -[253] "253" "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264" -[265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276" -[277] "277" "278" "279" "280" "281" "282" "283" "284" - -[[2]] -[1] "SS" "SB" "BB" - -[[3]] - [1] "D19Mit68" "c19.loc4" "c19.loc5" "c19.loc6" "c19.loc7" "c19.loc8" - [7] "c19.loc9" "c19.loc10" "c19.loc11" "c19.loc12" "c19.loc13" "c19.loc14" -[13] "c19.loc15" "c19.loc16" "c19.loc17" "c19.loc18" "c19.loc19" "c19.loc20" -[19] "c19.loc21" "c19.loc22" "c19.loc23" "c19.loc24" "c19.loc25" "c19.loc26" -[25] "c19.loc27" "c19.loc28" "c19.loc29" "c19.loc30" "c19.loc31" "c19.loc32" -[31] "c19.loc33" "c19.loc34" "c19.loc35" "c19.loc36" "c19.loc37" "D19Mit37" -~~~ -{: .output} - -View the first three rows of genotype probabilities for the first genotyped marker on chromosome 19, and the two adjacent pseudomarkers located at 1 cM intervals away. Compare the probabilities for each pseudomarker genotype with those of the genotyped marker. - - -~~~ -(pr$`19`)[1:3,,"D19Mit68"] # genotyped marker -~~~ -{: .r} - - - -~~~ - SS SB BB -1 0.0009976995 0.003298162 0.9957041387 -2 0.2500000000 0.500000000 0.2500000000 -3 0.0003029243 0.999394151 0.0003029243 -~~~ -{: .output} - - - -~~~ -(pr$`19`)[1:3,,"c19.loc4"] # pseudomarker 1 cM away -~~~ -{: .r} - - - -~~~ - SS SB BB -1 0.001080613 0.03581825 0.963101136 -2 0.250000000 0.50000000 0.250000000 -3 0.006141104 0.98771779 0.006141104 -~~~ -{: .output} - - - -~~~ -(pr$`19`)[1:3,,"c19.loc5"] # the next pseudomarker -~~~ -{: .r} - - - -~~~ - SS SB BB -1 0.001342511 0.06759555 0.93106194 -2 0.250000000 0.50000000 0.25000000 -3 0.011589058 0.97682188 0.01158906 -~~~ -{: .output} - -We can also view the genotype probabilities using [plot_genoprob](https://github.com/rqtl/qtl2/blob/master/R/plot_genoprob.R). The arguments to this function specify: - -1. pr: the genotype probabilities, -1. map: the marker map, -1. ind: the index of the individual to plot, -1. chr: the index of the chromosome to plot. - - -~~~ -plot_genoprob(pr, map, ind = 1, chr = 19) -~~~ -{: .r} - -plot of chunk plot_genoprob - -The coordinates along chromosome 19 are shown on the horizontal axis and the three genotypes are shown on the vertical axis. Higher genotype probabilities are plotted in darker shades. This mouse has a BB genotype on the proximal end of the chromosome and transitions to SB toward the distal end. - -> ## Challenge 1 -> Find a partner and explain why calculating genotype probabilities is the first step in QTL analysis. Why do you need to insert pseudomarkers first? Listen to your partner's explanation, then write your responses in the collaborative document. -> -> > ## Solution to Challenge 1 -> > -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> 1). Load a second dataset from Arabidopsis recombinant inbred lines -([Moore et al, Genetics, 2013](https://www.genetics.org/content/195/3/1077)) -in a study of plant root response to gravity (gravitropism). -> `grav <- read_cross2(file = system.file('extdata', 'grav2.zip', package = 'qtl2'))` -> 2). How many individuals were in the study? How many phenotypes? -How many chromosomes? -> 3). Insert pseudomarkers at 1cM intervals and save the results -to an object called `gravmap`. Have a look at the first chromosome. -> 4). Calculate genotype probabilities and save the results to an object -called `gravpr`. View the genotypes for the first three markers and -pseudomarkers on chromosome 1 for the first five individuals. -> -> > ## Solution to Challenge 2 -> > -> > 1). `grav <- read_cross2(file = system.file('extdata', 'grav2.zip', package = 'qtl2'))` -> > 2). `summary(grav)` -> > 3). `gravmap <- insert_pseudomarkers(map = grav$gmap, step = 1)` -> > followed by `head(gravmap, n=1)` -> > 4). `gravpr <- calc_genoprob(cross = grav, map = gravmap)` followed by -> > `(gravpr$``1``)[1:5,,"PVV4"]`, `(gravpr$`1`)[1:5,,"c1.loc1"]`, and -> > `(gravpr$`1`)[1:5,,"c1.loc2"]` -> {: .solution} -{: .challenge} - -> ## Challenge 3 -> Calculate genotype probabilities for a different data set from the -[qtl2 data repository](https://github.com/rqtl/qtl2data), this one from a study -of obesity and diabetes in a C57BL/6 (B6) × BTBR intercross. -> 1). Create a new script in RStudio with File -> New File -> R Script. -> 2) Download the B6 x BTBR zip file from the -> [qtl2 data repository](https://github.com/rqtl/qtl2data) into an object -called `b6btbr` by running this code: -> `b6btbr <- read_cross2(file = "https://raw.githubusercontent.com/rqtl/qtl2data/master/B6BTBR/b6btbr.zip")` -> 3). View a summary of the `b6btbr` data. How many individuals? phenotypes? -chromosomes? markers? -> 4). View the genetic map for the `b6btbr` data. -> 5). Insert pseudomarkers at 2 cM intervals. Assign the results to an object -called `b6btbrmap`. -> 6). Calculate genotype probabilities assuming a genotyping error probability -of 0.001. Assign the results to an object called `b6btbrpr`. -> 7). View the first several rows of genotype probabilities for -> any marker on chromosome 18. -> -> > ## Solution to Challenge 3 -> > -> > 1). Create a new script in RStudio with File -> New File -> R Script. -> > 2). `b6btbr <- read_cross2(file = "https://raw.githubusercontent.com/rqtl/qtl2data/master/B6BTBR/b6btbr.zip` -> > 3). `summary(b6btbr)` shows 544 individuals, 3 phenotypes, 20 chromosomes, 2057 markers. -> > 4). `b6btbr$gmap` -> > 5). `b6btbrmap <- insert_pseudomarkers(map=b6btbr$gmap, step=2)` -> > 6). `b6btbrpr <- calc_genoprob(cross=b6btbr, map=b6btbrmap, error_prob=0.001)` -> > 7). `dimnames((b6btbrpr$`18`))` shows all marker names for chromosome 18. `head((b6btbrpr$`18`)[,,"c18.loc48"])` gives genotype probabilities for an example pseudomarker, while `head((b6btbrpr$`18`)[,,"rs6338896"])` gives genotype probabilities for a genotyped marker. -> {: .solution} -{: .challenge} - -**Parallel calculations (optional)** To speed up the calculations with large datasets on a multi-core machine, you can use the argument `cores`. With `cores=0`, the number of available cores will be detected via `parallel::detectCores()`. Otherwise, specify the number of cores as a positive integer. - - -~~~ -pr <- calc_genoprob(cross=iron, map=map, error_prob=0.002, cores=4) -~~~ -{: .r} - -**Allele probabilities (optional)** The genome scan functions use genotype probabilities as well as a matrix of phenotypes. If you wished to perform a genome scan via an additive allele model, you would first convert the genotype probabilities to allele probabilities, using the function `genoprob_to_alleleprob()`. - - -~~~ -apr <- genoprob_to_alleleprob(probs=pr) -~~~ -{: .r} - -The figure below shows genotype and allele probabilities for 3 samples. In the Diversity Outbred, -there are 36 possible genotype states -(AA, AB, AC, ..., BB, BC, BD, ..., CC, CD, CE, ..., DD, DE, DF, ..., EE,...) or 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1. -The first SNP below has genotype BB. In the table describing alleles (8 state founder probabilities), the -probability that this SNP has a B allele is 1. The 2nd SNP has genotype BH, so the allele table shows a -probability of 0.5 for B and 0.5 for H. The third SNP is either BG or BH, and has a probability of 0.5 -for each of these genotypes. The allele table shows a probability of 0.5 for allele B, and 0.25 for both -G and H. - -![](../fig/geno-to-allele-probs.png) diff --git a/_episodes/04-special-x-covar.md b/_episodes/04-special-x-covar.md deleted file mode 100644 index 45b0076..0000000 --- a/_episodes/04-special-x-covar.md +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: "Special covariates for the X chromosome" -teaching: 15 -exercises: 15 -questions: -- "How do I find the chromosome X covariates for a cross?" -objectives: -- Get the X covariates for a cross. -keypoints: -- "The X chromosome requires special treatment in QTL mapping." -- "Special covariates such as sex should be included to avoid spurious evidence of linkage." -source: Rmd ---- - - - - -The X chromosome must be treated differently from the autosomes in the mapping of quantitative trait loci (QTL). If the X chromosome is treated like an autosome, a sex difference in a phenotype, such as weight or height, can lead to spurious linkage on the X chromosome. The X chromosome varies depending on the sex of the animal and the direction of the cross, so accounting for these covariates is important under the null hypothesis of no QTL, to avoid spurious evidence of linkage. (See [Broman et al. (2006) Genetics 174:2151-2158](http://www.genetics.org/content/174/4/2151.long).) Specifically, [see the figures](https://www.genetics.org/content/174/4/2151.figures-only) for the behavior of the X chromosome in a backcross and in an intercross. - -The particular X chromosome covariates depends on the cross, and can be obtained with the function `get_x_covar()`. In the iron data, sex is indicated as 0 for females and 1 for males. For cross direction, see Figure 2 of [Broman et al. (2006) Genetics 174:2151-2158](http://www.genetics.org/content/174/4/2151.long). For cross direction, all samples are filled in with 0 except for females from the *reverse* direction, who are indicated with 1. - - - -~~~ -Xcovar <- get_x_covar(iron) -head(Xcovar) -~~~ -{: .r} - - - -~~~ - sex direction -1 1 0 -2 1 0 -3 1 0 -4 1 0 -5 1 0 -6 1 0 -~~~ -{: .output} diff --git a/_episodes/05-perform-genome-scan.md b/_episodes/05-perform-genome-scan.md deleted file mode 100644 index 8364c76..0000000 --- a/_episodes/05-perform-genome-scan.md +++ /dev/null @@ -1,137 +0,0 @@ ---- -title: "Performing a genome scan" -teaching: 30 -exercises: 30 -questions: -- "How do I perform a genome scan?" -- "How do I plot a genome scan?" -objectives: -- Create a genome scan for a set of traits. -- Plot a genome scan. -keypoints: -- "A qtl2 genome scan requires genotype probabilities and a phenotype matrix." -- "The output from a genome scan contains a LOD score matrix, map positions, and phenotypes." -- "LOD curve plots for a genome scan can be viewed with plot_scan1()." -source: Rmd ---- - - - - - -The freely available chapter on [single-QTL analysis](http://www.rqtl.org/book/rqtlbook_ch04.pdf) from Broman and Sen's [A Guide to QTL Mapping with R/qtl](http://www.rqtl.org/book/) describes different methods for QTL analysis. Two of these methods are described here using data from an experiment on hypertension in the mouse [Sugiyama et al., Genomics 71:70-77, 2001](https://s3.amazonaws.com/academia.edu.documents/45963759/geno.2000.640120160526-29022-36mpgg.pdf?AWSAccessKeyId=AKIAIWOWYYGZ2Y53UL3A&Expires=1513786158&Signature=rtodlYwe0LDmYZFOm1ejvZjZhQ0%3D&response-content-disposition=inline%3B%20filename%3DConcordance_of_murine_quantitative_trait.pdf). The study employed a backcross between two mouse strains resulting in two possible genotypes - AA for homozygotes, and AB for heterozygotes. - -Linear regression can be employed to identify presence of QTL in a cross. To identify QTL using regression, we compare the fit for two models: 1) the null hypothesis that there are no QTL anywhere in the genome; and 2) the alternative hypothesis that there is a QTL near a specific position. A sloped line indicates that there is a difference in mean phenotype between the two genotype groups, and that a QTL is present. A line with no slope indicates that there is no difference in mean phenotype between the two groups, and that no QTL exists. Regression aims to find the line of best fit to the data. In the case of a backcross with only two genotypes, a t-test is performed at the marker to determine whether the difference in phenotype means is zero. - -![adapted from Broman & Sen, 2009](../fig/nullvalt.png) - -To find the line of best fit, the residuals or errors are calculated, then squared for each data point. - -![](../fig/residual.png) -![](../fig/squared-residual.png) -![](../fig/null.png) - -The line of best fit will be the one that minimizes the sum of squared residuals, which maximizes the likelihood of the data. - -Marker regression produces a LOD (logarithm of odds) score comparing the null hypothesis to the alternative. The LOD score is calculated using the sum of squared residuals for the null and alternative hypotheses. The LOD score is the difference between the log10 likelihood of the null hypothesis and the log10 likelihood of the alternative hypothesis. It is related to the regression model above by identifying the line of best fit to the data. A higher LOD score indicates greater likelihood of the alternative hypothesis. A LOD score closer to zero favors the null hypothesis. - -Marker regression can identify the existence and effect of a QTL by comparing means between groups, however, it requires known marker genotypes and can't identify QTL in between typed markers. To identify QTL between typed markers, we use Haley-Knott regression. After [calculating genotype probabilities](https://smcclatchy.github.io/mapping/03-calc-genoprob/), we can regress the phenotypes for animals of unknown genotype on these conditional genotype probabilities (conditional on known marker genotypes). In Haley-Knott regression, phenotype values can be plotted and a regression line drawn through the phenotype mean for the untyped individuals. - -![](../fig/hk-regress.png) - -As shown by the green circle in the figure, an individual of unknown genotype is placed between known genotypes according to the probability of its genotype being AA or AB. In this case, the probability of this individual having genotype AA is 0.6, and the probability of having genotype AB is 0.4. - -To perform a genome scan by Haley-Knott regression -([Haley and Knott 1992](https://www.ncbi.nlm.nih.gov/pubmed/16718932)), -use the function `scan1()`. `scan1()` takes as input the genotype probabilities, a matrix of phenotypes, and then optional additive and interactive covariates, and the special X chromosome covariates. Another option is to provide a vector of weights. - - -~~~ -out <- scan1(genoprobs = pr, pheno = iron$pheno, Xcovar=Xcovar) -~~~ -{: .r} - -On a multi-core machine, you can get some speed-up via the `cores` argument, as with `calc_genoprob()` and `calc_kinship()`. - - -~~~ -out <- scan1(genoprobs = pr, pheno = iron$pheno, Xcovar=Xcovar, cores=4) -~~~ -{: .r} - -The output of `scan1()` is a matrix of LOD scores, positions × phenotypes. - -Take a look at the first ten rows of the scan object. The numerical values are the LOD scores for the marker or pseudomarker named at the beginning of the row. LOD values are given for liver and spleen. - - -~~~ -head(out, n=10) -~~~ -{: .r} - - - -~~~ - liver spleen -D1Mit18 0.2928511 0.4566189 -c1.loc28 0.2815294 0.4378754 -c1.loc29 0.2696594 0.4193363 -c1.loc30 0.2572780 0.4011646 -c1.loc31 0.2444340 0.3835257 -c1.loc32 0.2311884 0.3665798 -c1.loc33 0.2176149 0.3504744 -c1.loc34 0.2037988 0.3353358 -c1.loc35 0.1898359 0.3212614 -c1.loc36 0.1758305 0.3083145 -~~~ -{: .output} - -The function `plot_scan1()` can be used to plot the LOD curves. Use the argument `lodcolumn` to indicate which column to plot. - - -~~~ -plot_scan1(out, map = map, lodcolumn = "liver") -~~~ -{: .r} - - -![](../fig/lod-plot.png) - -The LOD plot for liver clearly shows the largest peak is on chromosome 16. There are smaller peaks on chromosomes 2, 7, and 8. Which of these peaks is significant, and why? We'll evaluate the significance of genome scan results in a later episode on [performing a permutation test](https://smcclatchy.github.io/mapping/10-perform-perm-test/). - -> ## Challenge 1 -> Use the `head()` function to view the first 30 rows of the scan output. What is the next genotyped marker in the scan output? What are its LOD scores for liver and spleen? -> -> > ## Solution to Challenge 1 -> > `head(out, n=30)` -> > D1Mit80 0.02154502 0.2216682 -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> Use the `sort()` function to sort the LOD scores for liver. Hint: run `dim(out)` for the row and column dimensions, or `colnames(out)` for the column names. -Which pseudomarker has the highest LOD score? Which genotyped marker has the highest LOD score? What chromosome number are they on? -> -> > ## Solution to Challenge 2 -> > `sort(out[,1])` for column 1 or `sort(out[,"liver"])` for the column named "liver". -> > The pseudomarker with the largest score is c16.loc29, with a LOD of 7.68. The genotyped marker with the largest LOD is D16Mit30 with a score of 7.28. Both are located on chromosome 16. -> {: .solution} -{: .challenge} - -> ## Challenge 3 -> Use the `sort()` function to sort the LOD scores for spleen. Which pseudomarker has the highest LOD score? Which genotyped marker has the highest LOD score? What chromosome number are they on? -> -> > ## Solution to Challenge 3 -> > `sort(out[,2])` or `sort(out[,"spleen"])` -> > The pseudomarker with the largest score is c9.loc57, with a LOD of 12.1. The genotyped marker with the largest LOD is D9Mit182 with a score of 10.4. Both are located on chromosome 9. -> {: .solution} -{: .challenge} - -> ## Challenge 4 -> Plot the LOD scores for spleen. Does the genome scan for spleen share any large-ish peaks with the scan for liver? -> -> > ## Solution to Challenge 4 -> > `plot_scan1(out, map = map, lodcolumn = "spleen")` -> > Both liver and spleen genome scans share a peak on chromosome 8 with a LOD score near 4. -> {: .solution} -{: .challenge} diff --git a/_episodes/06-perform-perm-test.md b/_episodes/06-perform-perm-test.md deleted file mode 100644 index e246727..0000000 --- a/_episodes/06-perform-perm-test.md +++ /dev/null @@ -1,205 +0,0 @@ ---- -title: "Performing a permutation test" -teaching: 10 -exercises: 20 -questions: -- "How can I evaluate the statistical significance of genome scan results?" -objectives: -- Run a permutation test to establish LOD score thresholds. -keypoints: -- "A permutation test establishes the statistical significance of a genome scan." -source: Rmd ---- - - - - - -To establish the statistical significance of the results of a genome scan, a permutation test can identify the maximum LOD score that can occur by random chance. A permutation tests shuffles genotypes and phenotypes, essentially breaking the relationship between the two. The genome-wide maximum LOD score is then calculated on the permuted data, and this score used as a threshold of statistical significance. A genome-wide maximum LOD on shuffled, or permuted, data serves as the threshold because it represents the highest LOD score generated by random chance. - -The `scan1perm()` function takes the same arguments as `scan1()`, plus additional arguments to control the permutations: - -- `n_perm` is the number of permutation replicates. -- `perm_Xsp` controls whether to perform autosome/X chromosome specific permutations (with `perm_Xsp=TRUE`) or not (the default is FALSE). -- `perm_strata` is a vector that defines the strata for a stratified permutation test. -- `chr_lengths` is a vector of chromosome lengths, used in the case that `perm_Xsp=TRUE`. - -As with `scan1()`, you may provide a kinship matrix (or vector of kinship matrices, for the "leave one chromosome out" (loco) approach), in order to fit linear mixed models. If `kinship` is unspecified, the function performs ordinary Haley-Knott regression. - -To perform a permutation test with the `iron` data, we run `scan1perm()`, provide it with the genotype probabilities, the phenotype data, X covariates and number of permutations. For expediency, we'll use only 10 permutations, although 1000 is recommended. - - -~~~ -operm <- scan1perm(genoprobs = pr, pheno = iron$pheno, Xcovar = Xcovar, n_perm = 1000) # replace 1000 with 10 for expediency -~~~ -{: .r} - -Note the need to specify special covariates for the X chromosome (via `Xcovar`), to be included under the null hypothesis of no QTL. And note that when these are provided, the default is to perform a stratified permutation test, using strata defined by the rows in -`Xcovar`. In general, when the X chromosome is considered, one will wish to stratify at least by sex. - -Also note that, as with `scan1()`, you can speed up the calculations on a multi-core machine by specifying the argument `cores`. With `cores=0`, the number of available cores will be detected via `parallel::detectCores()`. Otherwise, specify the number of cores as a positive integer. For large datasets, be mindful of the amount of memory that will be needed; you may need to use fewer than the maximum number of cores, to avoid going beyond the available memory. - - -~~~ -operm <- scan1perm(pr, iron$pheno, Xcovar=Xcovar, n_perm=1000, cores=0) -~~~ -{: .r} - -`operm` now contains the maximum LOD score for each permutation for the liver and spleen phenotypes. There should be 1000 values for each phenotypes. We can view the liver permutation LOD scores by making a histogram. - - -~~~ -hist(operm[,'liver'], breaks = 50, xlab = "LOD", main = "LOD scores for liver scan with threshold in red") -abline(v = summary(operm)[,'liver'], col = 'red', lwd = 2) -~~~ -{: .r} - -plot of chunk hist_perm - -In the histogram above, you can see that most of the maximum LOD scores fall between 1 and 3. This means that we expect LOD scores less than 3 to occur by chance fairly often. The red line indicates the alpha = 0.05 threshold, which means that, under permutation, we only see LOD values as high or higher, 5% of the time. This is one way of estimating a significance threshold for QTL plots. - -To get estimated significance thresholds, use the function `summary()`. - - -~~~ -summary(operm) -~~~ -{: .r} - - - -~~~ -LOD thresholds (1000 permutations) - liver spleen -0.05 3.34 3.79 -~~~ -{: .output} - -The default is to return the 5% significance thresholds. Thresholds for other (or for multiple) significance levels can be obtained via the `alpha` argument. - - -~~~ -summary(operm, alpha=c(0.2, 0.05)) -~~~ -{: .r} - - - -~~~ -LOD thresholds (1000 permutations) - liver spleen -0.2 2.65 2.73 -0.05 3.34 3.79 -~~~ -{: .output} - -To obtain autosome/X chromosome-specific significance thresholds, specify `perm_Xsp=TRUE`. In this case, you need to provide chromosome lengths, which may be obtained with the function `chr_lengths()`. - - -~~~ -operm2 <- scan1perm(pr, iron$pheno, Xcovar=Xcovar, n_perm=1000, - perm_Xsp=TRUE, chr_lengths=chr_lengths(map)) -~~~ -{: .r} - -Separate permutations are performed for the autosomes and X chromosome, and considerably more permutation replicates are needed for the X chromosome. The computations take about twice as much time. -See [Broman et al. (2006) Genetics -174:2151-2158](https://www.ncbi.nlm.nih.gov/pubmed/17028340). - -The significance thresholds are again derived via `summary()`: - - -~~~ -summary(operm2, alpha=c(0.2, 0.05)) -~~~ -{: .r} - - - -~~~ -Autosome LOD thresholds (1000 permutations) - liver spleen -0.2 2.64 2.62 -0.05 3.33 3.32 - -X chromosome LOD thresholds (28243 permutations) - liver spleen -0.2 3.13 4.01 -0.05 3.74 5.03 -~~~ -{: .output} - -As with `scan1`, we can use `scan1perm` with binary traits, using the argument `model="binary"`. Again, this can't be used with a kinship matrix, but all of the other arguments can be applied. - - -~~~ -operm_bin <- scan1perm(pr, bin_pheno, Xcovar=Xcovar, n_perm=1000, - perm_Xsp=TRUE, chr_lengths=chr_lengths(map), - model="binary") -~~~ -{: .r} - -Here are the estimated 5% and 20% significance thresholds. - - -~~~ -summary(operm_bin, alpha=c(0.2, 0.05)) -~~~ -{: .r} - - - -~~~ -Autosome LOD thresholds (1000 permutations) - liver spleen -0.2 2.68 2.61 -0.05 3.45 3.33 - -X chromosome LOD thresholds (28243 permutations) - liver spleen -0.2 3.09 3.12 -0.05 3.80 3.81 -~~~ -{: .output} - -The code below shuffles the phenotypes so that they no longer match up with the genotypes. The purpose of this is to find out how high the LOD score can be due to random chance alone. - - -~~~ -shuffled_order <- sample(rownames(iron$pheno)) -pheno_permuted <- iron$pheno -rownames(pheno_permuted) <- shuffled_order -xcovar_permuted <- Xcovar -rownames(xcovar_permuted) <- shuffled_order -out_permuted <- scan1(genoprobs = pr, pheno = pheno_permuted, Xcovar = xcovar_permuted) -plot(out_permuted, map) -head(shuffled_order) -~~~ -{: .r} - -> ## Challenge 1 -> Run the preceding code to shuffle the phenotype data and plot a genome -> scan with this shuffled (permuted) data. -> -> What is the maximum LOD score in the scan from this permuted data? -> How does it compare to the maximum LOD scores obtained from the earlier scan? -> How does it compare to the 5% and 20% LOD thresholds obtained earlier? -> Paste the maximum LOD score in the scan from your permuted data into the etherpad. -> -> > ## Solution to Challenge 1 -> > -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> 1) Find the 1% and 10% significance thresholds for the first set of -permutations contained in the object `operm`. -> 2) What do the 1% and 10% significance thresholds say about LOD scores? -> -> > ## Solution to Challenge 2 -> > 1) `summary(operm, alpha=c(0.01, 0.10))` -> > 2) These LOD thresholds indicate maximum LOD scores that can be obtained -by random chance at the 1% and 10% significance levels. We expect to see LOD -values this high or higher 1% and 10% of the time respectively. -> {: .solution} -{: .challenge} diff --git a/_episodes/07-find-lod-peaks.md b/_episodes/07-find-lod-peaks.md deleted file mode 100644 index 60bfa8e..0000000 --- a/_episodes/07-find-lod-peaks.md +++ /dev/null @@ -1,91 +0,0 @@ ---- -title: "Finding LOD peaks" -teaching: 30 -exercises: 30 -questions: -- "How do I locate LOD peaks above a certain threshold value?" -objectives: -- Locate LOD peaks above a threshold value throughout the genome. -- Identify the Bayesian credible interval for a QTL peak. -keypoints: -- "LOD peaks and support intervals can be identified with find_peaks()." -source: Rmd ---- - - - - - -Once we have LOD scores from a genome scan and a significance threshold, we can look for QTL peaks associated with the phenotype. High LOD scores indicate the neighborhood of a QTL but don't give its precise position. To find the exact position of a QTL, we define an interval that is likely to hold the QTL. - -We'll use the Bayesian credible interval, which is a method for defining QTL intervals. It describes the probability that the interval contains the true value. Credible intervals make a probabilistic statement about the true value, for example, a 95% credible interval states that there is a 95% chance that the true value lies within the interval. - -To find peaks above a given threshold LOD value, use the function `find_peaks()`. It can also provide Bayesian credible intervals by using the argument `prob` (the nominal coverage for the Bayes credible intervals). Set the argument `expand2markers = FALSE` to keep from expanding the interval out to typed markers, or exclude this argument if you'd like to include flanking markers. - -You need to provide both the `scan1()` output, the marker/pseudomarker map and a threshold. We will use the 95% threshold from the permutations in the previous lesson. - - -~~~ -operm <- scan1perm(pr, iron$pheno, Xcovar=Xcovar, n_perm=1000) -thr <- summary(operm) -find_peaks(scan1_output = out, map = map, threshold = thr, prob = 0.95, expand2markers = FALSE) -~~~ -{: .r} - - - -~~~ - lodindex lodcolumn chr pos lod ci_lo ci_hi -1 1 liver 2 56.8 4.957564 54.3 70.3 -2 1 liver 7 50.1 4.050766 17.1 53.6 -3 1 liver 8 40.0 3.802511 34.0 55.0 -4 1 liver 16 28.6 7.681569 21.6 32.6 -5 2 spleen 8 13.6 4.302919 5.0 23.0 -6 2 spleen 9 56.6 12.063226 54.6 58.6 -~~~ -{: .output} - -The `find_peaks()` function can also pick out multiple peaks on a chromosome: each peak must exceed the chosen threshold, and the argument `peakdrop` indicates the amount that the LOD curve must drop between the lowest of two adjacent peaks. Use this feature with caution. - - -~~~ -find_peaks(scan1_output = out, map = map, threshold = thr, peakdrop = 1.8, prob = 0.95, expand2markers = FALSE) -~~~ -{: .r} - - - -~~~ - lodindex lodcolumn chr pos lod ci_lo ci_hi -1 1 liver 2 56.8 4.957564 54.3 70.3 -2 1 liver 7 25.1 4.040021 15.1 27.1 -3 1 liver 7 50.1 4.050766 41.1 53.6 -4 1 liver 8 40.0 3.802511 34.0 55.0 -5 1 liver 16 28.6 7.681569 21.6 32.6 -6 2 spleen 8 13.6 4.302919 5.0 23.0 -7 2 spleen 9 56.6 12.063226 54.6 58.6 -~~~ -{: .output} - -Each row shows a different peak; the columns show the peak location, LOD score and the lower and upper interval endpoints. - -> ## Challenge 1 -> Find peaks in the genome scan object called `out` that meet a threshold of 3 and are in the interval described by a 2 point LOD drop from the peak. How many peaks meet the LOD threshold of 3 and lie within the interval defined by a 2 point LOD drop from the maximum peaks on each chromosome? -> -> > ## Solution to Challenge 1 -> > `find_peaks(out, map, threshold=3, drop=2)` produces 7 peaks on 6 different chromosomes that meet a LOD threshold of 3 and are within the interval defined by a 2-LOD drop from the maximum peak on each chromosome. -> {: .solution} -{: .challenge} - - -> ## Challenge 2 -> 1). Calculate the 90% Bayes credible interval. -For chromosome 16 in liver, what is the range of the interval that has a 90% chance of containing the true QTL position? -2). Compare with the 95% Bayes credible interval calculated earlier. How does the interval change as you increase the probability? Why? -> -> > ## Solution to Challenge 2 -> > -> > 1). `find_peaks(out, map, prob=0.90, expand2markers = FALSE)` produces a range from 25.1 to 40.4. -> > 2). `find_peaks(out, map, prob=0.95, expand2markers = FALSE)` produces a range from 6.6 to 40.4, which is much broader than that of a 90% probability. The interval widens because the probability that the interval contains the true QTL position has increased. -> {: .solution} -{: .challenge} diff --git a/_episodes/08-calc-kinship.md b/_episodes/08-calc-kinship.md deleted file mode 100644 index 04e980b..0000000 --- a/_episodes/08-calc-kinship.md +++ /dev/null @@ -1,126 +0,0 @@ ---- -title: "Calculating A Kinship Matrix" -teaching: 30 -exercises: 30 -questions: -- "Why would I calculate kinship between individuals?" -- "How do I calculate kinship between individuals?" -- "What does a kinship matrix look like?" -objectives: -- Explain why and when kinship calculation matters in mapping. -- Create a kinship matrix for individuals. -keypoints: -- "Kinship matrices account for relationships among individuals." -- "Kinship is calculated as the proportion of shared alleles between individuals." -- "Kinship calculation is a precursor to a genome scan via a linear mixed model." -source: Rmd ---- - - - - - -Population structure and kinship are common confounding factors in genome-wide association studies (GWAS), case-control studies, and other study types in genetics. They create false positive associations between genotype and phenotype at genetic markers that differ in genotype frequencies between subpopulations due to genetic relatedness between samples. Simple association tests assume statistical independence between individuals. Population structure and kinship confound associations when phenotype covariance between individuals results from genetic similarity. Accounting for relatedness between individuals helps to distinguish true associations from false positives generated by population structure or kinship. - -As an example see the table below for phenotype and genotype frequencies between two subpopulations in a case-control study. - - -| |subpop1|subpop2|overall pop -|:-----------------------------|:-----:|:-----:|:-----:| -| frequency | 0.5 | 0.5 | 1 | -| probability of AA genotype | 0.1 | 0.9 | 0.5 | -| probability of disease | 0.9 | 0.1 | 0.5 | -| probability of disease & AA | 0.09 | 0.09 | 0.09 | - -The full population consists of two equally represented subpopulations. In the overall population, the probability of the AA genotype is 0.5, and the probability of disease is also 0.5. The joint probability of both disease and AA genotype in the population (0.09) is less than either the probability of disease (0.5) or the probability of the AA genotype (0.5) alone, and is considerably less than the joint probability of 0.25 that would be calculated if subpopulations weren't taken into account. In a case-control study that fails to recognize subpopulations, most of the cases will come from subpopulation 1 since this subpopulation has a disease probability of 0.9. However, this subpopulation also has a low probability of the AA genotype. So a false association between AA genotype and disease would occur because only overall population probabilities would be considered. - -Linear mixed models (LMMs) consider genome-wide similarity between all pairs of individuals to account for population structure, known kinship and unknown relatedness. They model the covariance between individuals. Linear mixed models in association mapping studies can successfully correct for genetic relatedness between individuals in a population by incorporating kinship into the model. To perform a genome scan by a linear mixed model, accounting for the relationships among individuals (in other words, including a random polygenic effect), you'll need to calculate a kinship matrix for the individuals. This is accomplished with the `calc_kinship()` function. It takes the genotype probabilities as input. - - -~~~ -kinship <- calc_kinship(probs = pr) -~~~ -{: .r} - -Take a look at the kinship values calculated for the first 5 individuals. - - -~~~ -kinship[1:5, 1:5] -~~~ -{: .r} - - - -~~~ - 1 2 3 4 5 -1 0.6780378 0.5070425 0.4770823 0.5100762 0.5193062 -2 0.5070425 0.5612483 0.5139017 0.4777593 0.5049052 -3 0.4770823 0.5139017 0.7272491 0.5147456 0.5393396 -4 0.5100762 0.4777593 0.5147456 0.7153455 0.5359428 -5 0.5193062 0.5049052 0.5393396 0.5359428 0.5775571 -~~~ -{: .output} - -We can also look at the first 50 mice in the kinship matrix. - - -~~~ -n_samples <- 25 -heatmap(kinship[1:n_samples, 1:n_samples], symm = TRUE) -~~~ -{: .r} - -plot of chunk plot_kinship - -The mice are listed in the same order on both sides of the matrix. The comb-like structures are called "dendrograms" and they indicate how the mice are clustered together. Each cell represents the degree of allele sharing between mice. Red colors indicate higher kinship and yellow colors indicate lower kinship. Each mouse is closely related to itself, so the cells along the diagonal tend to be darker than the other cells. You can see some evidence of related mice, possibly siblings, in the orange-shaded blocks along the diagonal. - - -By default, the genotype probabilities are converted to allele probabilities, and the kinship matrix is calculated as the proportion of shared alleles. To use genotype probabilities instead, use `use_allele_probs=FALSE` in the call to `calc_kinship()`. Further, by default we omit the X chromosome and only use the autosomes. To include the X chromosome, use `omit_x=FALSE`. - -In calculating the kinship matrix, you can eliminate the effect of varying marker density across the genome, and only use the probabilities along the grid of pseudomarkers (defined by the `step` argument to `insert_pseudomarkers()`. To do so, we need to first use `calc_grid()` to determine the grid of pseudomarkers, and then `probs_to_grid()` to probabilities for positions that are not on the grid. - - -~~~ -grid <- calc_grid(map = iron$gmap, step=1) -pr_grid <- probs_to_grid(probs = pr, grid = grid) -kinship_grid <- calc_kinship(probs = pr_grid) -~~~ -{: .r} - -On a multi-core machine, you can get some speed-up via the `cores` argument, as with `calc_genoprob()`. - - -~~~ -kinship <- calc_kinship(pr, cores=4) -~~~ -{: .r} - -> ## Challenge 1 -> 1). Insert pseudomarkers into a new map called `sparser_map` at -2 cM intervals. -> 2). Calculate genotype probabilities and save as an object called `pr2`. -Leave the error probability at the default value. -> 3). Calculate kinship with these new probabilities. Save as `kinship2`. -> 4). View the first several rows and columns of the `kinship2` matrix and -compare to the original `kinship` matrix with a heatmap. -> -> -> > ## Solution to Challenge 1 -> > -> > 1). `sparser_map <- insert_pseudomarkers(map = map, step = 2)` -> > 2). `pr2 <- calc_genoprob(cross = iron, map = sparser_map)` -> > 3). `kinship2 <- calc_kinship(probs = pr2)` -> > 4). `kinship2[1:5, 1:5]` and `heatmap(kinship2[1:n_samples, 1:n_samples], symm = TRUE)` -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> Think about what a kinship matrix is and what it represents. Share your -understanding with a neighbor. Write your explanation in the collaborative -document or in your own personal notes. -> -> > ## Solution to Challenge 2 -> > -> {: .solution} -{: .challenge} diff --git a/_episodes/09-perform-genome-scan-lmm.md b/_episodes/09-perform-genome-scan-lmm.md deleted file mode 100644 index f53bf1b..0000000 --- a/_episodes/09-perform-genome-scan-lmm.md +++ /dev/null @@ -1,208 +0,0 @@ ---- -title: "Performing a genome scan with a linear mixed model" -teaching: 20 -exercises: 10 -questions: -- "How do I use a linear mixed model in a genome scan?" -- "How do different mapping and kinship calculation methods differ?" -objectives: -- Create a genome scan with a linear mixed model. -- Compare LOD plots for Haley-Knott regression and linear mixed model methods. -- Compare LOD plots for the standard kinship matrix with the leave-one-chromosome-out (LOCO) method. -keypoints: -- "To perform a genome scan with a linear mixed model, supply a kinship matrix." -- "Different mapping and kinship calculation methods give different results." -source: Rmd ---- - - - - - - -Genetic mapping in mice presents a good example of why accounting for population structure is important. Laboratory mouse strains are descended from a small number of founders (fancy mice) and went through several population bottlenecks. Wild-derived strains are not descended from fancy mice and don't share the same history as laboratory strains. Linear mixed models were developed to solve problems with population structure created by differing ancestries, and to handle relatedness between individuals. Linear mixed models (LMMs) consider genome-wide similarity between all pairs of individuals to account for population structure, known kinship and unknown relatedness. Linear mixed models in mapping studies can successfully correct for genetic relatedness between individuals in a population by incorporating kinship into the model. Earlier we [calculated a kinship matrix](https://smcclatchy.github.io/mapping/04-calc-kinship/) for input to a linear mixed model to account for relationships among individuals. For a current review of mixed models in genetics, see this [preprint of Martin and Eskin, 2017](https://www.biorxiv.org/content/early/2017/01/28/092106). - -Simple linear regression takes the form - -y = μ + βX + ε - -which describes a line with slope β and y-intercept μ. The error (or residual) is represented by ε. - -To model data from a cross, we use - -![](../fig/linear-genetic-model.png) - - -where yj is the phenotype of the jth individual, μ -is the mean phenotype value, βk is the effect of the -kth genotype, Xjk is the genotype for individual j, and εj is the error for the jth individual. In the figure -below, μ equals 94.6, and β equals -15.4 for the alternative hypothesis -(QTL exists). This linear model is y = 94.6 + -15.4X + ε. The model intersects the -genotype groups at their group means. In contrast, the null hypothesis would -state that there is no difference in group means (no QTL anywhere). The linear -model for the null hypothesis would be y = 94.6 + 0X + ε. -This states that the phenotype is equal to the combined mean (94.6), plus -some error (ε). In other words, genotype doesn't affect the phenotype. - - -![](../fig/nullvalt.png) - -The linear models above describe the relationship between genotype and phenotype but are inadequate for describing the relationship between genotype and phenotype in large datasets. They don't account for relatedness among individuals. In real populations, the effect of a single genotype is influenced by many other genotypes that affect the phenotype. A true genetic model takes into account the effect of all variants on the phenotype. - -To model the phenotypes of all individuals in the data, we can adapt the simple linear model to include all individuals and their variants so that we capture the effect of all variants shared by individuals on their phenotypes. - -![](../fig/matrix-alg-model.png) - -Now, y represents the phenotypes of all individuals. The effect of the ith genotype on the phenotype is βi, the mean is μ times 1, (mean multiplied by a vector of 1s) and the error is ε. Here, the number of genotypes is M. - -To model the effect of all genotypes and to account for relatedness, we test the effect of a single genotype while bringing all other genotypes into the model. - -![](../fig/all-geno-model.png) - -βk is the effect of the genotype Xk, and Σi≠kβiXi sums the effects of all other genotypes except genotype k. For the leave one chromosome out (LOCO) method, βkXk is the effect of genotypes on chromosome k, and βiXi represents effect of genotypes on all other chromosomes. - - -If the sample contains divergent subpopulations, SNPs on different chromosomes will be correlated because of the difference in allele frequencies between subpopulations caused by relatedness. To correct for correlations between chromosomes, we model all genotypes on the other chromosomes when testing for the association of a SNP. - -To perform a genome scan using a linear mixed model you also use the function `scan1`; you just need to provide the argument `kinship`, a kinship matrix (or, for the LOCO method, a list of kinship matrices). - - -~~~ -out_pg <- scan1(pr, iron$pheno, kinship=kinship, Xcovar=Xcovar) -~~~ -{: .r} - -Again, on a multi-core machine, you can get some speed-up using the `cores` argument. - - -~~~ -out_pg <- scan1(pr, iron$pheno, kinship, Xcovar=Xcovar, cores=4) -~~~ -{: .r} - -If, for your linear mixed model genome scan, you wish to use the "leave one chromosome out" (LOCO) method (scan each chromosome using a kinship matrix that is calculated using data from all other chromosomes), use `type="loco"` in the call to `calc_kinship()`. - - -~~~ -kinship_loco <- calc_kinship(pr, "loco") -~~~ -{: .r} - -For the LOCO (leave one chromosome out) method, provide the list of kinship matrices as obtained from `calc_kinship()` with `method="loco"`. - - -~~~ -out_pg_loco <- scan1(pr, iron$pheno, kinship_loco, Xcovar=Xcovar) -~~~ -{: .r} - -To plot the results, we again use `plot_scan1()`. - -Here is a plot of the LOD scores by Haley-Knott regression and the linear mixed model using either the standard kinship matrix or the LOCO method. - -![](../fig/lod-plot-compare-liver.png) -![](../fig/lod-plot-compare-spleen.png) - -You can use the code below to generate overlaid plots for each method. - - -~~~ -plot_scan1(out_pg_loco, map = map, lodcolumn = "liver", col = "black") -plot_scan1(out_pg, map = map, lodcolumn = "liver", col = "blue", add = TRUE) -plot_scan1(out, map = map, lodcolumn = "liver", col = "green", add = TRUE) -~~~ -{: .r} - -For the liver phenotype (top panel), the three methods give quite different results. The linear mixed model with an overall kinship matrix gives much lower LOD scores than the other two methods. On chromosomes with some evidence of a QTL, the LOCO method gives higher LOD scores than Haley-Knott, except on chromosome 16 where it gives lower LOD scores. - -For the spleen phenotype (bottom panel), the linear mixed model with an overall kinship matrix again gives much lower LOD scores than the other two methods. However, in this case Haley-Knott regression and the LOCO method give quite similar results. - -> ## Challenge 1 -> Earlier you inserted pseudomarkers for the `grav` data and saved the results to an object called `gravmap`. Then you calculated genotype probabilities and saved the results to an object called `gravpr`. -> 1). Calculate kinship for the `grav` data using the LOCO method. -> 2). Run a genome scan with the genotype probabilities and kinship that you calculated. -> 3). Find the maximum LOD score for the scan using -`which(out_grav == maxlod(out_grav), arr.ind = TRUE)`. -> 4). Plot the genome scan for this phenotype (hint: use the column number as lodcolumn). -> > -> > ## Solution to Challenge 1 -> > -> > 1). `grav_kinship <- calc_kinship(gravpr, "loco")` -> > 2). `out_grav <- scan1(genoprobs = gravpr, -pheno = grav$pheno, kinship = grav_kinship)` -> > 3). `which(out_grav == maxlod(out_grav), arr.ind = TRUE)` row 166, col 133 -> > 4). `plot(out_grav, lodcolumn = 133, map = gravmap)` -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> What are the benefits and disadvantages of the three -> methods for genome scanning (Haley-Knott regression, -> kinship matrix, and leave-one-chromosome out (LOCO)?) -> Which method would you use to scan, and why? -> Think about the advantages and disadvantages of each, -> discuss with a neighbor, and share your thoughts in the -> collaborative document. -> > -> > ## Solution to Challenge 2 -> > -> {: .solution} -{: .challenge} - - -~~~ -file <- paste0("https://raw.githubusercontent.com/rqtl/", - "qtl2data/master/B6BTBR/b6btbr.zip") -b6btbr <- read_cross2(file) -~~~ -{: .r} - -> ## Challenge 3 -> Pair programming exercise: with your partner, review and carry -> out all of the steps in QTL mapping that we have covered so far, -> using a new data set. One of you types the code, the other -> explains what needs to happen next, finds the relevant code in -> the lesson, suggests new names for objects (i.e. NOT the ones -> you've already used, such as "map", "pr", "out", etc.). -> -> 1. Run the code above to load the [B6 x BTBR intercross data](https://github.com/rqtl/qtl2data/tree/master/B6BTBR) -> into an object called b6btbr. -> 2. Insert pseudomarkers and calculate genotype probabilities. -> 3. Run a genome scan for the log10_insulin_10wk phenotype. -> 4. Calculate a kinship matrix. -> 5. Calculate a list of kinship matrices with the LOCO method. -> 6. Run genome scans with the regular kinship matrix and with the -> list of LOCO matrices. -> 7. Plot the 3 different genome scans in a single plot in -> different colors. -> 8. Which chromosomes appear to have peaks with a LOD score greater than 4? -> Which methods identify these peaks? Which don't? -> -> > -> > ## Solution to Challenge 3 -> > -> > `file <- paste0("https://raw.githubusercontent.com/rqtl/", "qtl2data/master/B6BTBR/b6btbr.zip")` -> > `b6btbr <- read_cross2(file)` -> > `summary(b6btbr)` -> > `head(b6btbr$pheno)` -> > `colnames(b6btbr$pheno)` -> > `b6bmap <- insert_pseudomarkers(map=b6btbr$gmap, step=1)` -> > `prb6b <- calc_genoprob(cross=b6btbr, map=b6bmap, error_prob=0.002)` -> > `b6bXcovar <- get_x_covar(b6btbr)` -> > `b6bout <- scan1(genoprobs = prb6b, pheno = b6btbr$pheno, Xcovar=b6bXcovar)` -> > `plot(b6bout, map = b6bmap)` -> > `b6bkinship <- calc_kinship(probs = prb6b)` -> > `out_pg_b6b <- scan1(prb6b, b6btbr$pheno, kinship=b6bkinship, Xcovar=b6bXcovar)` -> > `kinship_loco_b6b <- calc_kinship(prb6b, "loco")` -> > `out_pg_loco_b6b <- scan1(prb6b, b6btbr$pheno, kinship_loco_b6b, Xcovar=b6bXcovar)` -> > `plot_scan1(out_pg_loco_b6b, map = b6bmap, lodcolumn = "log10_insulin_10wk", col = "black")` -> > `plot_scan1(out_pg_b6b, map = b6bmap, lodcolumn = "log10_insulin_10wk", col = "blue", add = TRUE)` -> > `plot_scan1(b6bout, map = b6bmap, lodcolumn = "log10_insulin_10wk", col = "green", add = TRUE)` -> > -> {: .solution} -{: .challenge} - - - diff --git a/_episodes/10-perform-genome-scan-bin.md b/_episodes/10-perform-genome-scan-bin.md deleted file mode 100644 index a3741ec..0000000 --- a/_episodes/10-perform-genome-scan-bin.md +++ /dev/null @@ -1,56 +0,0 @@ ---- -title: "Performing a genome scan with binary traits" -teaching: 20 -exercises: 10 -questions: -- "How do I create a genome scan for binary traits?" -objectives: -- Convert phenotypes to binary values. -- Use logistic regression for genome scans with binary traits. -- Plot and compare genome scans for binary traits. -keypoints: -- "A genome scan for binary traits (0 and 1) requires special handling; scans for non-binary traits assume normal variation of the residuals." -- "A genome scan for binary traits is performed with logistic regression." -source: Rmd ---- - - - -The genome scans above were performed assuming that the residual variation followed a normal distribution. This will often provide reasonable results even if the residuals are not normal, but an important special case is that of a binary trait, with values 0 and 1, which is best treated differently. The `scan1` function can perform a genome scan with binary traits by logistic regression, using the argument `model="binary"`. (The default value for the `model` argument is `"normal"`.) At present, we _can not_ account for relationships among individuals in this analysis. - -Let's first turn our two phenotypes into binary traits by thresholding at the median. One would generally _not_ do this in practice; this is just for illustration. - - -~~~ -bin_pheno <- apply(iron$pheno, 2, function(a) as.numeric(a > median(a))) -rownames(bin_pheno) <- rownames(iron$pheno) -~~~ -{: .r} - -We now perform the genome scan as before, including `model="binary"` to indicates that the phenotypes are binary traits with values 0 and 1. - - -~~~ -out_bin <- scan1(pr, bin_pheno, Xcovar=Xcovar, model="binary") -~~~ -{: .r} - -Here is a plot of the two LOD curves. - - -~~~ -par(mar=c(5.1, 4.1, 1.1, 1.1)) -ymx <- maxlod(out_bin) -plot(out_bin, map, lodcolumn=1, col="slateblue", ylim=c(0, ymx*1.02)) -plot(out_bin, map, lodcolumn=2, col="violetred", add=TRUE) -legend("topleft", lwd=2, col=c("slateblue", "violetred"), colnames(out_bin), bg="gray90") -~~~ -{: .r} - -We can use `find_peaks` as before. - - -~~~ -find_peaks(out_bin, map, threshold=3.5, drop=1.5) -~~~ -{: .r} diff --git a/_episodes/11-est-qtl-effects.md b/_episodes/11-est-qtl-effects.md deleted file mode 100644 index 253825c..0000000 --- a/_episodes/11-est-qtl-effects.md +++ /dev/null @@ -1,336 +0,0 @@ ---- -title: "Estimated QTL effects" -teaching: 10 -exercises: 20 -questions: -- "How do I find the estimated effects of a QTL on a phenotype?" -objectives: -- Obtain estimated QTL effects. -- Plot estimated QTL effects. -keypoints: -- "Estimated founder allele effects can be plotted from the mapping model coefficients." -- "Additive and dominance effects can be plotted using contrasts." -source: Rmd ---- - - - - - -Recall that to model data from a cross, we use - -![](../fig/linear-genetic-model.png) - - -where yj is the phenotype of the jth individual, μ -is the mean phenotype value, βk is the effect of the -kth genotype, Xjk is the genotype for individual j, and εj is the error for the jth individual. In the figure -below, μ equals 94.6, and β equals -15.4 for the alternative hypothesis -(QTL exists). - -![](../fig/nullvalt.png) - -This linear model is y = 94.6 + -15.4X + ε. The model intersects the -genotype groups at their group means, and is based on μ and -βk for chromosome 2 marker D2Mit17 located at 56.8 cM. - -The effect of genotype BB (the β for the -BB genotype) at marker D2Mit17 is -15.5, while the effect of the -SS genotype is -15.4 on -the liver phenotype. The effect of the SB genotype is --0.1 relative to μ equals -94.6. - -The `scan1()` function returns only LOD scores. To obtain estimated QTL effects, -use the function `scan1coef()`. This function takes a single phenotype and the -genotype probabilities for a single chromosome and returns a matrix with the -estimated coefficients at each putative QTL location along the chromosome. - -For example, to get the estimated QTL effects on chromosome 2 for the liver -phenotype, we would provide the chromosome 2 genotype probabilities and the -liver phenotype to the function `scan1coef()` as follows: - - -~~~ -c2eff <- scan1coef(pr[,"2"], iron$pheno[,"liver"]) -~~~ -{: .r} - -The result is a matrix of 39 positions × 3 -genotypes. An additional column contains the intercept values (μ). - - -~~~ -dim(c2eff) -~~~ -{: .r} - - - -~~~ -[1] 39 4 -~~~ -{: .output} - - - -~~~ -head(c2eff) -~~~ -{: .r} - - - -~~~ - SS SB BB intercept -D2Mit379 -8.627249 -1.239968 9.867217 95.10455 -c2.loc39 -8.802858 -1.405183 10.208040 95.15023 -c2.loc40 -8.926486 -1.552789 10.479275 95.18752 -c2.loc41 -8.994171 -1.670822 10.664993 95.21294 -c2.loc42 -9.004121 -1.748738 10.752859 95.22372 -c2.loc43 -8.956705 -1.779555 10.736260 95.21841 -~~~ -{: .output} - -To plot the QTL effects, use the function `plot_coef()`. - - -~~~ -plot_coef(c2eff, map, legend = "topright") -~~~ -{: .r} - -plot of chunk plot_effects_liver_simple - -The plot shows effect values on the y-axis and cM values on the x-axis. The -value of the intercept (μ) appears at the top. The effect of the SB genotype is centered around zero, with the effects -of the other two genotypes above and below. - -To plot only the effects, use the argument `columns` to indicate which -coefficients to plot. Add `scan1_output` to include a LOD plot at the bottom. - - -~~~ -plot_coef(c2eff, map, columns = 1:3, scan1_output = out, - main = "Chromosome 2 QTL effects and LOD scores", - legend = "topright") -~~~ -{: .r} - -plot of chunk plot_effects_liver_c2 - -If instead you want additive and dominance effects, you can provide a square matrix of _contrasts_, as follows: - - -~~~ -c2effB <- scan1coef(pr[,"2"], iron$pheno[,"liver"], - contrasts=cbind(mu=c(1,1,1), a=c(-1, 0, 1), d=c(-0.5, 1, -0.5))) -~~~ -{: .r} - -The result will then contain the estimates of `mu`, `a` (the additive effect), and `d` (the dominance effect). - - -~~~ -dim(c2effB) -~~~ -{: .r} - - - -~~~ -[1] 39 3 -~~~ -{: .output} - - - -~~~ -head(c2effB) -~~~ -{: .r} - - - -~~~ - mu a d -D2Mit379 95.10455 9.247233 -1.239968 -c2.loc39 95.15023 9.505449 -1.405183 -c2.loc40 95.18752 9.702880 -1.552789 -c2.loc41 95.21294 9.829582 -1.670822 -c2.loc42 95.22372 9.878490 -1.748738 -c2.loc43 95.21841 9.846482 -1.779555 -~~~ -{: .output} -For marker D2Mit17, `mu`, `a`, and `d` are 94.616915, 15.4026314, -0.1033269. - -Here's a plot of the chromosome 2 additive and dominance effects, which are in the second and third columns. - - -~~~ -plot_coef(c2effB, map["2"], columns=2:3, col=col) -~~~ -{: .r} - -![](../fig/chr2_effects_contrasts.png) - -If you provide a kinship matrix to `scan1coef()`, it fits a linear mixed model (LMM) to account for a residual polygenic effect. Here let's use the kinship matrix from the LOCO method. - - -~~~ -c2eff_pg <- scan1coef(pr[,"2"], iron$pheno[,"liver"], kinship_loco[["2"]]) -dim(c2eff_pg) -~~~ -{: .r} - - - -~~~ -[1] 39 4 -~~~ -{: .output} - - - -~~~ -head(c2eff_pg) -~~~ -{: .r} - - - -~~~ - SS SB BB intercept -D2Mit379 -9.552159 -0.5954469 10.14761 94.54643 -c2.loc39 -9.868033 -0.7068007 10.57483 94.57886 -c2.loc40 -10.130889 -0.8028380 10.93373 94.60481 -c2.loc41 -10.332864 -0.8737574 11.20662 94.62137 -c2.loc42 -10.468249 -0.9112466 11.37950 94.62626 -c2.loc43 -10.533817 -0.9103486 11.44417 94.61832 -~~~ -{: .output} - -Here's a plot of the estimates. - - -~~~ -plot_coef(c2eff_pg, map, columns = 1:3, scan1_output = out_pg_loco, main = "Chromosome 2 QTL effects and LOD scores", legend = "topright") -~~~ -{: .r} - -plot of chunk plot_effects_pg_liver_c2 - -You can also get estimated additive and dominance effects, using a matrix of contrasts. - - -~~~ -c2effB_pg <- scan1coef(pr[,"2"], iron$pheno[,"liver"], kinship_loco[["2"]], - contrasts=cbind(mu=c(1,1,1), a=c(-1, 0, 1), d=c(-0.5, 1, -0.5))) -~~~ -{: .r} - -Here's a plot of the results. - - -~~~ -plot(c2effB_pg, map["2"], columns=2:3, col=col) -~~~ -{: .r} - -![](../fig/chr2_effects_pg_add_dom.png) - -Another option for estimating the QTL effects is to treat them as [random effects](https://stats.stackexchange.com/questions/4700/what-is-the-difference-between-fixed-effect-random-effect-and-mixed-effect-mode#151800) and calculate [Best Linear Unbiased Predictors](https://en.wikipedia.org/wiki/Best_linear_unbiased_prediction) (BLUPs). This is particularly valuable for multi-parent populations such as the Collaborative Cross and Diversity Outbred mice, where the large number of possible genotypes at a QTL leads to considerable variability in the effect estimates. To calculate BLUPs, use `scan1blup()`; it takes the same arguments as `scan1coef()`, including -the option of a kinship matrix to account for a residual polygenic effect. - - -~~~ -c2blup <- scan1blup(pr[,"2"], iron$pheno[,"liver"], kinship_loco[["2"]]) -~~~ -{: .r} - -Here is a plot of the BLUPs (as dashed curves) alongside the standard estimates. - - -~~~ -plot_coef(c2eff, map["2"], columns=1:3) -plot(c2blup, map["2"], columns=1:3, add=TRUE, lty=2, legend = "topright") -~~~ -{: .r} - -plot of chunk plotblup - -The `scan1coef` function can also provide estimated QTL effects for binary traits, with `model="binary"`. (However, `scan1blup` has not yet been implemented for binary traits.) - - -~~~ -c2eff_bin <- scan1coef(pr[,"2"], bin_pheno[,"liver"], model="binary") -~~~ -{: .r} - -Here's a plot of the effects. They're a bit tricky to interpret, as they are basically log odds ratios. - -![](../fig/chr2_effects_binary.png) - -Finally, to plot the raw phenotypes against the genotypes at a single putative QTL position, you can use the function `plot_pxg()`. This takes a vector of genotypes as produced by the `maxmarg()` function, which picks the most likely genotype from a set of genotype probabilities, provided it is greater than some specified value (the argument `minprob`). Note that the “marg” in “maxmarg” stands for “marginal”, as this function is selecting the genotype at each position that has maximum marginal probability. - -For example, we could get inferred genotypes at the chr 2 QTL for the liver phenotype (at 28.6 cM) as follows: - - -~~~ -g <- maxmarg(pr, map, chr=2, pos=28.6, return_char=TRUE) -~~~ -{: .r} - -We use `return_char=TRUE` to have `maxmarg()` return a vector of character strings with the genotype labels. - -We then plot the liver phenotype against these genotypes as follows: - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot_pxg(g, iron$pheno[,"liver"], ylab="Liver phenotype") -~~~ -{: .r} - -plot of chunk plot_pheno_geno - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot_pxg(g, iron$pheno[,"liver"], SEmult=2, swap_axes=TRUE, xlab="Liver phenotype") -~~~ -{: .r} - -plot of chunk plot_pheno_geno_se - - -> ## Challenge 1 -> Find the QTL effects for chromosome 16 for the liver phenotype. -> 1) Create an object `c16eff` to contain the effects. -> 2) Plot the chromosome 16 coefficients and add the LOD plot at bottom. -> -> > -> > ## Solution to Challenge 1 -> > 1) `c16eff <- scan1coef(pr[,"16"], iron$pheno[,"liver"])` -> > 2) `plot_coef(c16eff, map, legend = "topright", scan1_output = out)` -> {: .solution} -{: .challenge} - -> ## Challenge 2 -> In the code block above, we use `swap_axes=TRUE` to place the phenotype -> on the x-axis. -> We can use `SEmult=2` to include the mean ± 2 SE intervals. -> 1) How would you decide which chromosome to plot? Discuss this with -> your neighbor, then write your responses in the collaborative document. -> 2) Calculate and plot the best linear unbiased predictors (blups) for -> spleen on the chromosome of your choice. -> -> > -> > ## Solution to Challenge 2 -> > -> > -> {: .solution} -{: .challenge} diff --git a/_episodes/12-snp-assoc.md b/_episodes/12-snp-assoc.md deleted file mode 100644 index 1717cc7..0000000 --- a/_episodes/12-snp-assoc.md +++ /dev/null @@ -1,359 +0,0 @@ ---- -title: "SNP association mapping" -teaching: 30 -exercises: 30 -questions: -- "How do I identify SNPs in a QTL?" -objectives: -- Perform a basic QTL analysis. -- Identify QTL with a genome scan. -- Find SNPs within a QTL. -- Convert founder genotypes to a strain distribution pattern (SDP). -- Infer SNP genotypes for Diversity Outbred mice. -- Perform SNP association analysis -keypoints: -- "SNP association analysis with DO mice requires SNPs in the QTL regions." -- "." -source: Rmd ---- - - - -For multi-parent crosses, it can be useful to collapse the genotype or allele probabilities according to the founder genotypes of the various -SNPs in the region of a QTL. - -### QTL analysis in Diversity Outbred mice - -To illustrate this sort of SNP association analysis, we'll consider some Diversity Outbred mouse data. The Diversity Outcross (DO) mice are an advanced intercross population derived from the same eight founder strains as the Collaborative Cross (CC). See -[Svenson et al. (2012)](https://www.ncbi.nlm.nih.gov/pubmed/22345611) -and [Gatti et al. (2014)](https://www.ncbi.nlm.nih.gov/pubmed/25237114). - -We'll consider a subset of the data from -[Recla et al. (2014)](https://www.ncbi.nlm.nih.gov/pubmed/24700285), available as part of the -[qtl2data github repository](https://github.com/rqtl/qtl2data). (The -full data are in -[`DO_Recla`](https://github.com/rqtl/qtl2data/tree/master/DO_Recla); the directory -[`DOex`](https://github.com/rqtl/qtl2data/tree/master/DOex) contains a reduced set, with just three chromosomes, one phenotype (`OF_immobile_pct`, percent immobile in the open field test), and a -reduced set of markers. - -You can download the data from a single zip file, as follows: - - -~~~ -DOex <- read_cross2(file = "https://raw.githubusercontent.com/rqtl/qtl2data/master/DOex/DOex.zip") -~~~ -{: .r} - -Let's quickly whip through a basic analysis. - -We first calculate genotype probabilities and convert them to allele probabilities. We'll just use marker locations and not insert any pseudomarkers. - - -~~~ -pr <- calc_genoprob(DOex, error_prob=0.002) -apr <- genoprob_to_alleleprob(pr) -~~~ -{: .r} - -We calculate kinship matrices (using the "LOCO" method, though with the caveat that here we are only considering genotypes on three chromosomes). - - -~~~ -k <- calc_kinship(apr, "loco") -~~~ -{: .r} - -We create a numeric covariate for sex; be sure to include the individual IDs as names. - - -~~~ -sex <- (DOex$covar$Sex == "male")*1 -names(sex) <- rownames(DOex$covar) -~~~ -{: .r} - -Note that you can create the vector and assign the names in one step using the basic R function setNames(). - - -~~~ -sex <- setNames( (DOex$covar$Sex == "male")*1, rownames(DOex$covar) ) -~~~ -{: .r} - -We perform a genome scan with a linear mixed model (adjusting for a residual polygenic effect), with sex as an additive covariate. - - -~~~ -out <- scan1(apr, DOex$pheno, k, sex) -~~~ -{: .r} - -Here's a plot of the results. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot(out, DOex$gmap) -~~~ -{: .r} - -plot of chunk plot_DOex_scan - -There's a strong peak on chromosome 2. Let's look at the QTL effects. We estimate them with `scan1coef()`. We need to subset the allele probabilities and the list of kinship matrices. - - -~~~ -coef_c2 <- scan1coef(apr[,"2"], DOex$pheno, k[["2"]], sex) -~~~ -{: .r} - -For the DO, with 8 QTL alleles, we can use the function `plot_coefCC` in the [R/qtl2plot](https://github.com/rqtl/qtl2plot) package, which plots the 8 allele effects in the "official" Collaborative Cross (CC) -colors. (Well, actually _slightly_ modified colors, because I think the official colors are kind of ugly.) The strong locus seems to be mostly -due to the NZO allele. Note that `CCcolors` is a vector of colors included in the qtl2plot package; there's also a `CCorigcolors` object -with the _official_ colors. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot_coefCC(coef_c2, DOex$gmap["2"], bgcolor="gray95", legend="bottomleft") -~~~ -{: .r} - -plot of chunk plot_DOex_effects - -If you provide `plot_coefCC()` with the genome scan output, it will display the LOD curve below the coefficient estimates. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot_coefCC(coef_c2, DOex$gmap["2"], scan1_output=out, bgcolor="gray95", legend="bottomleft") -~~~ -{: .r} - -plot of chunk plot_DOex_lod_curve - -### Connecting to SNP and gene databases -To perform SNP association analysis in the region of a QTL, we’ll need to grab data on all of the SNPs in the region, including their genotypes in the eight founder strains for the CC and DO populations. As a related task, we’ll also want to identify the genes in the region. - -We want R/qtl2 to permit different ways of storing this information. For the CC and DO populations, we’ve prepared SQLite database files for the variants in the CC founders and for the MGI mouse gene annotations. But others might wish to use a different kind of database, or may wish to query an online database. - -We provide a template for how to use R/qtl2 to connect to SNP and gene databases, with the functions `create_variant_query_func()` and `create_gene_query_func()`. Each returns a function for querying variant and gene databases, respectively. The query functions that are returned take just three arguments (chr, start, end end), and themselves return a data frame of variants on the one hand and genes on the other. - -During [setup](https://smcclatchy.github.io/mapping/setup/) you would have downloaded the SQLite databases from Figshare and placed these in your `data` directory: - -+ [SQLite database of variants in Collaborative Cross founder mouse strains (v3)](https://figshare.com/articles/SQLite_database_of_variants_in_Collaborative_Cross_founder_mouse_strains/5280229/3): SNP, indel, and structural variants in the Collaborative Cross founders (3.87 GB) -+ [SQLite database with mouse gene annotations from Mouse Genome Informatics (v6)](https://figshare.com/articles/dataset/SQLite_database_with_mouse_gene_annotations_from_Mouse_Genome_Informatics_MGI_at_The_Jackson_Laboratory/5280238): full set of mouse gene annotations from build 38 mm10 (529.89 MB) -+ [SQLite database with MGI mouse gene annotations from Mouse Genome Informatics (v7)](https://figshare.com/articles/dataset/SQLite_database_with_MGI_mouse_gene_annotations_from_Mouse_Genome_Informatics_MGI_at_The_Jackson_Laboratory/5286019): like the previous, but including only non-duplicate gene records sourced from MGI (11.16 MB) - -To create a function for querying the CC variants, call `create_variant_query_func()` with the path to the `cc_variants.sqlite` file: - - -~~~ -query_variants <- create_variant_query_func("../data/cc_variants.sqlite") -~~~ -{: .r} - -To grab the variants in the interval 97-98 Mbp on chromosome 2, you’d then do the following: - - -~~~ -variants_2_97.5 <- query_variants(2, 97, 98) -~~~ -{: .r} - -Similarly, to create a function for querying the MGI mouse gene annotations, you call `create_gene_query_func()` with the path to the `mouse_genes_mgi.sqlite` file: - - -~~~ -query_genes <- create_gene_query_func("../data/mouse_genes_mgi.sqlite") -~~~ -{: .r} - -To grab the genes overlapping the interval 97-98 Mbp on chromosome 2, you’d then do the following: - - -~~~ -genes_2_97.5 <- query_genes(2, 97, 98) -~~~ -{: .r} - -The way we’ve set this up is a bit complicated, but it allows greatest flexibility on the part of the user. And for our own work, we like to have a local SQLite database, for rapid queries of SNPs and genes. - -### SNP associations - -Okay, now finally we get to the SNP associations. We have a large peak on chromosome 2, and we want to look at individual SNPs in the region of the locus. - -Well, actually, we first need to find the location of the inferred QTL. The peak LOD score on chromosome 2 occurs at 52.4 cM. But to find nearby SNPs, we really want to know the Mbp position. The calculations were only performed at the marker positions, and so we can use `max()`, giving both the `scan1()` output and the physical map, and then pull out the position from the results. - - -~~~ -peak_Mbp <- max(out, DOex$pmap)$pos -~~~ -{: .r} - -The marker is at 97.5 Mbp. We’ll focus on a 2 Mbp interval centered at 97.5 Mbp. - -We can pull out the variants in the 2 Mbp interval centered at 97.5 on chr 2 using the query function we defined above: - - -~~~ -variants <- query_variants(2, peak_Mbp - 1, peak_Mbp + 1) -~~~ -{: .r} - -There are 27355 variants in the interval, including 27108 SNPs, 127 indels, and 120 structural variants. We’re treating all of them as biallelic markers (all but the major allele in the eight founder strains binned into a single allele). In the following, we’re going to just say “SNP” rather than “variant”, even though the variants include indels and structural variants. - -After identifying the variants in the interval of interest, we use our genotype probabilities and the founder SNP genotypes to infer the SNP genotypes for the DO mice. That is, at each SNP, we want to collapse the eight founder allele probabilities to two SNP allele probabilities, using the SNP genotypes of the founders. - -We do this assuming that the genotype probabilities were calculated sufficiently densely that they can be assumed to be constant in intervals. With this assumption, we then: - -- Find the interval for each SNP. -- Reduce the SNPs to a “distinct” set: if two SNPs have the same strain distribution pattern (SDP; the pattern of alleles in the eight founders) and are in the same interval, by our assumption their allele probabilities will be the same. -- Take the average of the allele probabilities at the two endpoints of each interval. -- Collapse the 8 allele probabilities to two according to each observed SDP in the interval. -- We further create a look-up table relating the full set of SNPs to the reduced set (one of each observed SDP in each interval). - -All of these steps are combined into a single function `scan1snps()`, which takes the genotype probabilities, a physical map of those locations, the phenotype, the kinship matrix, covariates, the query function for grabbing the SNPs in a given interval, and the chromosome, start, and end positions for the interval. (If `insert_pseudomarkers()` was used to insert pseudomarkers, you will need to use `interp_map()` to get interpolated Mbp positions for the pseudomarkers.) - - -~~~ -out_snps <- scan1snps(pr, DOex$pmap, DOex$pheno, k[["2"]], sex, query_func=query_variants, - chr=2, start=peak_Mbp-1, end=peak_Mbp+1, keep_all_snps=TRUE) -~~~ -{: .r} - -The output is a list with two components: `lod` is a matrix of LOD scores (with a single column, since we’re using just one phenotype), and `snpinfo` is a data frame with SNP information. With the argument `keep_all_snps=TRUE`, the `snpinfo` data frame contains information about all of the variants in the region with an index column indicating the equivalence classes. - -The function `plot_snpasso()` can be used to plot the results, with points at each of the SNPs. The default is to plot *all* SNPs. In this case, there are 27737 variants in the region, but only 150 distinct ones. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot_snpasso(out_snps$lod, out_snps$snpinfo) -~~~ -{: .r} - -plot of chunk plot_snp_asso - -We can actually just type plot() rather than plot_snpasso(), because with the snpinfo table in place of a genetic map, plot() detects that a SNP association plot should be created. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot(out_snps$lod, out_snps$snpinfo) -~~~ -{: .r} - -plot of chunk plot_snp_asso_wplot - -We can use our `query_genes()` function to identify the genes in the region, and `plot_genes()` to plot their locations. But also `plot_snpasso()` can take the gene locations with the argument genes and then display them below the SNP association results. Here, we are also highlighting the top SNPs in the SNP association plot using the `drop_hilit` argument. SNPs with LOD score within `drop_hilit` of the maximum are shown in pink. - - -~~~ -genes <- query_genes(2, peak_Mbp - 1, peak_Mbp + 1) -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot(out_snps$lod, out_snps$snpinfo, drop_hilit=1.5, genes=genes) -~~~ -{: .r} - -plot of chunk id_and_plot_genes - -To get a table of the SNPs with the largest LOD scores, use the function `top_snps()`. This will show all SNPs with LOD score within some amount (the default is 1.5) of the maximum SNP LOD score. We’re going to display just a subset of the columns. - - -~~~ -top <- top_snps(out_snps$lod, out_snps$snpinfo) -print(top[,c(1, 8:15, 20)], row.names=FALSE) -~~~ -{: .r} - - - -~~~ - snp_id A_J C57BL_6J 129S1_SvImJ NOD_ShiLtJ NZO_HlLtJ CAST_EiJ - rs33297153 1 1 2 1 2 3 - rs226751615 1 1 1 1 2 2 - rs230653109 1 1 1 1 2 2 - rs250407660 1 1 1 1 2 2 - rs248216267 1 1 1 1 2 2 - rs237564299 1 1 1 1 2 2 - rs255011323 1 1 1 1 2 2 - rs212448521 1 1 1 1 2 2 - rs27379206 1 1 1 1 2 2 - rs27379194 1 1 1 1 2 2 - rs244484646 1 1 1 1 2 2 - rs215855380 1 1 1 1 2 2 - rs236942088 1 1 1 1 2 2 - rs212414861 1 1 1 1 2 2 - rs229578122 1 1 1 1 2 2 - rs254318131 1 1 1 1 2 2 - rs217679969 1 1 1 1 2 2 - rs238404461 1 1 1 1 2 2 - rs262749925 1 1 1 1 2 2 - rs231282689 1 1 1 1 2 2 - rs260286709 1 1 1 1 2 2 - rs27396282 1 1 1 1 2 2 - rs263841533 1 1 1 1 2 2 - rs231205920 1 1 1 1 2 2 - rs242885221 1 1 1 1 2 2 - rs586746690 1 1 2 1 2 2 - rs49002164 1 1 2 1 2 2 - SV_2_96945406_96945414 1 1 1 1 2 3 - rs244595995 1 1 2 1 2 2 - SV_2_96993116_96993118 1 1 1 1 2 2 - PWK_PhJ WSB_EiJ lod - 1 1 8.213182 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 7.949721 - 1 1 8.631497 - 1 1 8.631497 - 1 1 8.280942 - 1 1 8.631497 - 1 1 8.280942 -~~~ -{: .output} - -The top SNPs all have NZO and CAST with a common allele, different from the other 6 founders. The next-best SNPs have NZO, CAST, and 129 with a common allele, and then there’s a group of SNPs where NZO has a unique allele. - -The `scan1snps()` function can also be used to perform a genome-wide SNP association scan, by providing a variant query function but leaving chr, start, and end unspecified. In this case it’s maybe best to use keep_all_snps=FALSE (the default) and only save the index SNPs. - - -~~~ -out_gwas <- scan1snps(pr, DOex$pmap, DOex$pheno, k, sex, query_func=query_variants, cores=0) -~~~ -{: .r} - -We can make a Manhattan plot of the results as follows. We use `altcol` to define a color for alternate chromosomes and `gap=0` to have no gap between chromosomes. - - -~~~ -par(mar=c(4.1, 4.1, 0.6, 0.6)) -plot(out_gwas$lod, out_gwas$snpinfo, altcol="green4", gap=0) -~~~ -{: .r} - -plot of chunk plot_gwas_scan - -Note that while there are LOD scores on the X chromosome that are as large as those on chromosome 2, we’re allowing a genotype × sex interaction on the X chromosome and so the test has 3 degrees of freedom (versus 2 degrees of freedom on the autosomes) and so the LOD scores will naturally be larger. If we’d used the allele probabilities (apr above, calculated from `genoprob_to_alleleprob()`) rather than the genotype probabilities, we would be performing a test with 1 degree of freedom on both the X chromosome and the autosomes. diff --git a/_episodes/13-qtl-in-do.md b/_episodes/13-qtl-in-do.md deleted file mode 100644 index 3931a80..0000000 --- a/_episodes/13-qtl-in-do.md +++ /dev/null @@ -1,538 +0,0 @@ ---- -title: "QTL analysis in Diversity Outbred Mice" -teaching: 30 -exercises: 30 -questions: -- "How do I bring together each step in the workflow?" -- "How is the workflow implemented in an actual study?" -objectives: -- Work through an actual QTL mapping workflow from a published study. -keypoints: -- "." -- "." -source: Rmd ---- - - - - - -This tutorial will take you through the process of mapping a QTL and searching for candidate genes for DNA damage from benzene exposure. The adverse health effects of benzene, including leukemia and aplastic anemia, were first studied in occupational settings in which workers were exposed to high benzene concentrations. Environmental sources of benzene exposure typically come from the petrochemical industry, however, a person’s total exposure can be increased from cigarettes, consumer products, gas stations, and gasoline powered engines or tools stored at home. - -Exposure thresholds for toxicants are often determined using animal models that have limited genetic diversity, including standard inbred and outbred rats and mice. These animal models fail to capture the influence of genetic diversity on toxicity response, an important component of human responses to chemicals such as benzene. The Diversity Outbred (DO) mice reflect a level of genetic diversity similar to that of humans. - -The data comes from a toxicology study in which Diversity Outbred (DO) mice were exposed to benzene via inhalation for 6 hours a day, 5 days a week for 4 weeks [(French, J. E., et al. (2015) Environ Health Perspect 123(3): 237-245)](http://ehp.niehs.nih.gov/1408202/). The study was conducted in two equally sized cohort of 300 male mice each, for a total of 600 mice. They were then sacrificed and reticulocytes (red blood cell precursors) were isolated from bone marrow. The number of micro-nucleated reticulocytes, a measure of DNA damage, was then measured in each mouse. The goal is to map gene(s) that influence the level of DNA damage in the bone marrow. -![](../fig/benzene_study_design.png) - - -### Load and explore the data - -Make sure that you are in your scripts directory. If you're not sure where you are working right now, you can check your working directory with `getwd()`. If you are not in your scripts directory, run `setwd("scripts")` in the Console or Session -> Set Working Directory -> Choose Directory in the RStudio menu to set your working directory to the scripts directory. - -Once you are in your scripts directory, create a new R script with File -> New File -> R script, or use the +document icon at upper left. - -The data for this tutorial has been saved as an R binary file that contains several data objects. Load it in now by running the following command in your new script. - - -~~~ -load("../data/qtl2_demo.Rdata") -sessionInfo() -~~~ -{: .r} - -The call to `sessionInfo` provides information about the R version running on your machine and the R packages that are installed. This information can help you to troubleshoot. If you can't load the data, try downloading the data again by copying and pasting the following into your browser: ftp://ftp.jax.org/dgatti/qtl2_workshop/qtl2_demo.Rdata - -We loaded in three data objects. Check the Environment tab to see what was loaded. You should see a phenotypes object called `pheno` with 598 observations (in rows) of 12 variables (in columns), an object called `map` (physical map), and an object called `probs` (genotype probabilities). - -#### Phenotypes -`pheno` is a data frame containing the phenotype data. Click on the triangle to the left of `pheno` in the Environment pane to view its contents. Run `names(pheno)` to list the variables. - -**NOTE:** the sample IDs must be in the rownames of `pheno`. For more information about data file format, see the [Setup](../setup.md) instructions. - -`pheno` contains the sample ID, sex, the study cohort, the concentration of benzene and the proportion of bone marrow reticulocytes that were micro-nucleated `(prop.bm.MN.RET)`. Note that the sample IDs are also stored in the rownames of `pheno`. In order to save time for this tutorial, we will only map with 598 samples from the 100 ppm dosing group. - -> ## Challenge 1 Look at the Data -> Determine the dimensions of `pheno`. -> 1). How many rows and columns does it have? -> 2). What are the names of the variables it contains? -> 3). What does the distribution of the `prop.bm.MN.RET` column look like? Is it normally distributed? -> -> > ## Solution to Challenge 1 -> > `dim(pheno)` -> > 1). `dim(pheno)[1]` gives the number of rows and `dim(pheno)[2]` the number of columns. -You can also use the structure command `str(pheno)`, which will tell you that this is a data frame with 598 observations of 12 variables. It will also list the 12 variable names for you. -> > 2). Use `colnames(pheno)` or `names(pheno)`, or click the triangle to the left of `pheno` -> > in the Environment tab. -> > 3). Use `hist(pheno$prop.bm.MN.RET)` to plot the distribution of the data. The data has a long right -> > tail and is not normally distributed. -> {: .solution} -{: .challenge} - -Many statistical models, including the QTL mapping model in qtl2, expect that the incoming data will be normally distributed. You may use transformations such as log or square root to make your data more normally distributed. We will be mapping the proportion of micronucleated reticulocytes in bone marrow (`prop.bm.MN.RET`) and, since the data does not contain zeros or negative numbers, we will log transform the data. Here is a histogram of the untransformed data. - - -~~~ -hist(pheno$prop.bm.MN.RET, main = "Proportion of micronucleated reticulocytes") -~~~ -{: .r} - -plot of chunk hist_untransformed - -Apply the `log()` function to this data. - - -~~~ -pheno$log.MN.RET <- log(pheno$prop.bm.MN.RET) -~~~ -{: .r} - -Now, let's make a histogram of the log-transformed data. - - -~~~ -hist(pheno$log.MN.RET, main = "Proportion of micronucleated reticulocytes (log-transformed)") -~~~ -{: .r} - -plot of chunk hist_log_transform - -This looks much better! - -Some researchers are concerned about the reproducibility of DO studies. The argument is that each DO mouse is unique and therefore can never be reproduced. But this misses the point of using the DO. While mice are the sampling unit, in QTL mapping we are sampling the founder alleles at each locus. An average of 1/8th of the alleles should come from each founder at any given locus. Also, DO mice are a *population* of mice, not a single strain. While it is true that results in an individual DO mouse may not be reproducible, results at the population level should be reproducible. This is similar to the human population in that results from one individual may not represent all humans, but results at the population level should be reproducible. - -The benzene inhalation study was conducted in two separate cohorts (termed *Study* in the `pheno` object). Below, we plot the proportion of micronucleated reticulocytes in bone marrow versus the benzene concentration for each study cohort. As you can see, while individual mice have varying micronucleated reticulocytes, there is a dose-dependent increase in micronucleated reticulocytes in both cohorts. This is an example of how results in the DO reproduce at the population level. - -![](../fig/bm_mnret_by_dose_cohort.png) - - -#### The Marker Map - -The marker map for each chromosome is stored in the `map` object. Each list element is a numeric vector with each marker position in megabases (Mb). This is used to plot the LOD scores calculated at each marker during QTL mapping. - -The markers are from a genotyping array called the Mouse Universal Genotyping Array (MUGA) and contains 7,856 SNP markers. Marker locations for the MUGA and other mouse arrays are available from [The Jackson Laboratory's FTP site](ftp://ftp.jax.org/MUGA). - -Look at the structure of `map` in the Environment tab by clicking the triangle to the left or by running `str(map)` in the Console. - -> ## Challenge 2 Data Dimensions II -> 1). Determine the length of `map`. -> 2). How many markers are on chromosome 1? -> -> > ## Solution to Challenge 2 -> > 1). `length(map)` -> > 2). `length(map[[1]])` -> {: .solution} -{: .challenge} - -#### Genotype (allele) probabilities -Each element of `probs` is a 3 dimensional array containing the founder allele dosages for each sample at each marker on one chromosome. These probabilities have been pre-calculated for you, so you can skip the step for [calculating genotype probabilities](https://smcclatchy.github.io/mapping/03-calc-genoprob/) and the optional step for calculating allele probabilities. - -Next, we look at the dimensions of `probs` for chromosome 1: - - -~~~ -dim(probs[[1]]) -~~~ -{: .r} - - - -~~~ -[1] 590 8 537 -~~~ -{: .output} - -`probs` is a three dimensional array containing the proportion of each founder haplotype at each marker for each DO sample. The 590 samples are in the first dimension, the 8 founders in the second and the 537 markers along chromosome 1 are in the third dimension. -Let's return to the `probs` object. Look at the contents for the first 500 markers of one sample. - -**NOTE:** the sample IDs must be in the rownames of `probs`. - - -~~~ -plot_genoprob(probs, map, ind = 1, chr = 1) -~~~ -{: .r} - -plot of chunk geno_plot - -In the plot above, the founder contributions, which range between 0 and 1, are colored from white (= 0) to black (= 1.0). A value of ~0.5 is grey. The markers are on the X-axis and the eight founders (denoted by the letters A through H) on the Y-axis. Starting at the left, we see that this sample has genotype BB because the row for B is black, indicating values of 1.0. Moving along the genome to the right, the genotype becomes CE where rows C and E are gray, followed by CD, FH, AG, GH, etc. The values at each marker sum to 1.0. - - -### [Calculating A Kinship Matrix](https://smcclatchy.github.io/mapping/04-calc-kinship/) -Next, we need to create a matrix that accounts for the kinship relationships between the mice. We do this by looking at the correlation between the founder haplotypes for each sample at each SNP. For each chromosome, we create a kinship matrix using all markers *except* the ones on the current chromosome using the loco (leave-one-chromosome-out) method. Simulations suggest that mapping using this approach increases the power to detect QTL. - - -~~~ -K <- calc_kinship(probs = probs, type = "loco") -~~~ -{: .r} - -Kinship values between pairs of samples range between 0 (no relationship) and 1.0 (completely identical). Let's look at the kinship matrix. - - -~~~ -n_samples <- 50 -heatmap(K[[1]][1:n_samples, 1:n_samples]) -~~~ -{: .r} - -plot of chunk kinship_probs - -The figure above shows kinship between all pairs of samples. Light yellow indicates low kinship and dark red indicates higher kinship. Orange values indicate varying levels of kinship between 0 and 1. The dark red diagonal of the matrix indicates that each sample is identical to itself. The orange blocks along the diagonal may indicate close relatives (i.e. siblings or cousins). - -#### Covariates -Next, we need to create additive covariates that will be used in the mapping model. We will use study cohort as a covariate in the mapping model. If we were mapping with *all* mice, we would also add benzene concentration to the model. This study contained only male mice, but in most cases, you would include sex as an additive covariate as well. - - -~~~ -addcovar <- model.matrix(~Study, data = pheno)[,-1] -~~~ -{: .r} - -The code above copies the `rownames(pheno)` to `rownames(addcovar)` as a side-effect. - -**NOTE:** the sample IDs must be in the rownames of `pheno`, `addcovar`, `genoprobs` and `K`. `qtl2` uses the sample IDs to align the samples between objects. For more information about data file format, see [Karl Broman's vignette on input file format](http://kbroman.org/qtl2/assets/vignettes/input_files.html). - -### [Performing a genome scan](https://smcclatchy.github.io/mapping/06-perform-genome-scan/) - -Before we run the mapping function, let's look at the mapping model. At each marker on the genotyping array, we will fit a model that regresses the phenotype (micronucleated reticulocytes) on covariates and the founder allele proportions. - -![](../fig/equation1.png) - - where: - - - -Note that this model will give us an estimate of the effect of each founder allele at each marker. There are eight founder strains that contributed to the DO, so we will get eight founder allele effects. - -There are almost 600 samples in this data set and it may take several minutes to map one trait. In order to save some time, we will map using only the samples in the 100 ppm concentration group. - - -~~~ -c100 <- which(pheno$Conc == 100) -~~~ -{: .r} - -In order to map the proportion of bone marrow reticulocytes that were micro-nucleated, you will use the [scan1](https://github.com/rqtl/qtl2/blob/master/R/plot_scan1.R) function. To see the arguments for [scan1](https://github.com/rqtl/qtl2/blob/master/R/plot_scan1.R), you can type `help(scan1)`. First, let's map the *untransformed* phenotype. (Recall that we log-transformed it above). - - -~~~ -index <- which(colnames(pheno) == "prop.bm.MN.RET") -qtl <- scan1(genoprobs = probs, pheno = pheno[c100,index, drop = FALSE], kinship = K, addcovar = addcovar) -~~~ -{: .r} - -Next, we plot the genome scan. - - -~~~ -plot_scan1(x = qtl, map = map, main = "Proportion of Micro-nucleated Bone Marrow Reticulocytes") -~~~ -{: .r} - -plot of chunk qtl_plot - -> ## Challenge 3 How does a log-tranformation change the QTL plot? -> 1). Perform a genome scan on the column called `log.MN.RET`. (Hint: set `index` to the column index in `pheno`.) -> 2). How does the LOD score for the peak on Chr 10 change? -> > ## Solution to Challenge 3 -> > 1). `index <- which(colnames(pheno) == "prop.bm.MN.RET")` -> > `qtl <- scan1(genoprobs = probs, pheno = pheno[c100,index, drop = FALSE], kinship = K, addcovar = addcovar)` -> > `plot_scan1(x = qtl, map = map, main = "Log-Transformed BM micronucleated reticulocytes")` -> > 2). The LOD increases from ~16 to ~26. -> {: .solution} -{: .challenge} - - -This challenge shows the importance of looking at your data and transforming it to meet the expectations of the mapping model. In this case, a log transformation improved the model fit and increased the LOD score. We will continue the rest of this lesson using the log-transformed data. Set your `index` variable equal to the column index of `log.MN.RET`. Redo the genome scan with the log-transformed reticulocytes. - - -~~~ -index <- which(colnames(pheno) == "log.MN.RET") -qtl <- scan1(genoprobs = probs, pheno = pheno[c100, index, drop = FALSE], kinship = K, addcovar = addcovar) -~~~ -{: .r} - -### [Performing a permutation test](https://smcclatchy.github.io/mapping/10-perform-perm-test/) - -There is clearly a large peak on Chr 10. Next, we must assess its statistical significance. This is most commonly done via [permutation](http://www.genetics.org/content/178/1/609.long). We advise running at least 1,000 permutations to obtain significance thresholds. In the interest of time, we perform 100 permutations here. - - -~~~ -perms <- scan1perm(genoprobs = probs, pheno = pheno[c100,index, drop = FALSE], addcovar = addcovar, n_perm = 100) -~~~ -{: .r} - -The `perms` object contains the maximum LOD score from each genome scan of permuted data. - -> ## Challenge 4 What is significant? -> 1). Create a histogram of the LOD scores `perms`. Hint: use the `hist()` function. -> 2). Estimate the value of the LOD score at the 95th percentile. -> 3). Then find the value of the LOD score at the 95th percentile using the `summary()` function. -> > ## Solution to Challenge 4 -> > 1). `hist(x = perms)` or `hist(x = perms, breaks = 15)` -> > 2). By counting number of occurrences of each LOD value (frequencies), we can approximate the 95th percentile at ~7.5. -> > 3). `summary(perms)` -> {: .solution} -{: .challenge} - -We can now add thresholds to the previous QTL plot. We use a significance threshold of p < 0.05. To do this, we select the 95th percentile of the permutation LOD distribution. - - -~~~ -plot(x = qtl, map = map, main = "Proportion of Micro-nucleated Bone Marrow Reticulocytes") -thr = summary(perms) -abline(h = thr, col = "red", lwd = 2) -~~~ -{: .r} - -plot of chunk qtl_plot_thr - -The peak on Chr 10 is well above the red significance line. - -### [Finding LOD peaks](https://smcclatchy.github.io/mapping/07-find-lod-peaks/) -We can then plot the QTL scan. Note that you must provide the marker map, which we loaded earlier in the MUGA SNP data. - - -We can find all of the peaks above the significance threshold using the [find_peaks](https://github.com/rqtl/qtl2/blob/master/R/find_peaks.R) function. - - -~~~ -find_peaks(scan1_output = qtl, map = map, threshold = thr) -~~~ -{: .r} - - - -~~~ - lodindex lodcolumn chr pos lod -1 1 log.MN.RET 10 33.64838 26.22947 -~~~ -{: .output} - -> ## Challenge 5 Find all peaks -> Find all peaks for this scan whether or not they meet the 95% significance threshold. -> > ## Solution to Challenge 5 -> > `find_peaks(scan1_output = qtl, map = map)` -> > Notice that some peaks are missing because they don't meet the default threshold value of 3. See `help(find_peaks)` for more information about this function. -> {: .solution} -{: .challenge} - -The support interval is determined using the [Bayesian Credible Interval](http://www.ncbi.nlm.nih.gov/pubmed/11560912) and represents the region most likely to contain the causative polymorphism(s). We can obtain this interval by adding a `prob` argument to [find_peaks](https://github.com/rqtl/qtl2/blob/master/R/find_peaks.R). We pass in a value of `0.95` to request a support interval that contains the causal SNP 95% of the time. - - -~~~ -find_peaks(scan1_output = qtl, map = map, threshold = thr, prob = 0.95) -~~~ -{: .r} - - - -~~~ - lodindex lodcolumn chr pos lod ci_lo ci_hi -1 1 log.MN.RET 10 33.64838 26.22947 31.8682 34.17711 -~~~ -{: .output} - -From the output above, you can see that the support interval is 5.5 Mb wide (30.16649 to 35.49352 Mb). The location of the maximum LOD score is at 34.17711 Mb. - -### [Estimated QTL effects](https://smcclatchy.github.io/mapping/11-est-qtl-effects/) - -We will now zoom in on Chr 10 and look at the contribution of each of the eight founder alleles to the proportion of bone marrow reticulocytes that were micro-nucleated. Remember, the mapping model above estimates the effect of each of the eight DO founders. We can plot these effects (also called 'coefficients') across Chr 10 using [scan1coef](https://github.com/rqtl/qtl2/blob/master/R/scan1coef.R). - - -~~~ -chr <- 10 -coef10 <- scan1blup(genoprobs = probs[,chr], pheno = pheno[c100,index, drop = FALSE], kinship <- K[[chr]], addcovar = addcovar) -~~~ -{: .r} - -This produces an object containing estimates of each of the eight DO founder allele effect. These are the βj values in the mapping equation above. - - -~~~ -plot_coefCC(x = coef10, map = map, scan1_output = qtl, main = "Proportion of Micro-nucleated Bone Marrow Reticulocytes") -~~~ -{: .r} - -plot of chunk coef_plot - -The top panel shows the eight founder allele effects (or model coefficients) along Chr 10. The founder allele effects are centerd at zero and the units are the same as the phenotype. You can see that DO mice containing the CAST/EiJ allele near 34 Mb have lower levels of micro-nucleated reticulocytes. This means that the CAST allele is associated with less DNA damage and has a protective effect. The bottom panel shows the LOD score, with the support interval for the peak shaded blue. - -### [SNP Association Mapping](https://smcclatchy.github.io/mapping/12-snp-assoc/) - -At this point, we have a 6 Mb wide support interval that contains a polymorphism(s) that influences benzene-induced DNA damage. Next, we will impute the DO founder sequences onto the DO genomes. The [Sanger Mouse Genomes Project](http://www.sanger.ac.uk/resources/mouse/genomes/) has sequenced the eight DO founders and provides SNP, insertion-deletion (indel), and structural variant files for the strains (see [Baud et.al., Nat. Gen., 2013](http://www.nature.com/ng/journal/v45/n7/full/ng.2644.html)). We can impute these SNPs onto the DO genomes and then perform association mapping. The process involves several steps and I have provided a function to encapsulate the steps. To access the Sanger SNPs, we use a SQLlite database provided by [Karl Broman](https://github.com/kbroman). You should have downloaded this during [Setup](../setup.md). It is available from the [JAX FTP site](ftp://ftp.jax.org/dgatti/CC_SNP_DB/cc_variants.sqlite), but the file is 3 GB, so it may take too long to download right now. - -![](../fig/DO.impute.founders.sm.png) - -Association mapping involves imputing the founder SNPs onto each DO genome and fitting the mapping model at each SNP. At each marker, we fit the following model: - -![](../fig/equation2.png) - - - where: - - - -We can call [scan1snps](https://github.com/rqtl/qtl2/blob/master/R/scan1snps.R) to perform association mapping in the QTL interval on Chr 10. We first create variables for the chromosome and support interval where we are mapping. We then create a function to get the SNPs from the founder SNP database The path to the SNP database (`snpdb_file` argument) points to the data directory on your computer. Note that it is important to use the `keep_all_snps = TRUE` in order to return all SNPs. - - -~~~ -chr <- 10 -start <- 30 -end <- 36 -query_func <- create_variant_query_func("../data/cc_variants.sqlite") -assoc <- scan1snps(genoprobs = probs[,chr], map = map, pheno = pheno[c100,index,drop = FALSE], kinship = K, addcovar = addcovar, query_func = query_func, chr = chr, start = start, end = end, keep_all_snps = TRUE) -~~~ -{: .r} - -The `assoc` object is a list containing two objects: the LOD scores for each unique SNP and a `snpinfo` object that maps the LOD scores to each SNP. To plot the association mapping, we need to provide both objects to the [plot_snpasso](https://github.com/rqtl/qtl2/blob/master/R/plot_snpasso.R) function. - - -~~~ -plot_snpasso(scan1output = assoc$lod, snpinfo = assoc$snpinfo, main = "Proportion of Micro-nucleated Bone Marrow Reticulocytes") -~~~ -{: .r} - -plot of chunk assoc_fig - -This plot shows the LOD score for each SNP in the QTL interval. The SNPs occur in "shelves" because all of the SNPs in a haplotype block have the same founder strain pattern. The SNPs with the highest LOD scores are the ones for which CAST/EiJ contributes the alternate allele. - -We can add a plot containing the genes in the QTL interval using the `plot_genes` function. We get the genes from another SQLlite database created by [Karl Broman](https://github.com/kbroman) called `mouse_genes.sqlite`. You should have downloaded this from the [JAX FTP Site](ftp://ftp.jax.org/dgatti/CC_SNP_DB/mouse_genes.sqlite) during [Setup](../setup.md). - -First, we must query the database for the genes in the interval. The path of the first argument points to the data directory on your computer. - - -~~~ -query_genes <- create_gene_query_func(dbfile = "../data/mouse_genes.sqlite", filter = "source='MGI'") -genes <- query_genes(chr, start, end) -head(genes) -~~~ -{: .r} - - - -~~~ - chr source type start stop score strand phase ID -1 10 MGI pseudogene 30.01095 30.01730 NA + NA MGI:MGI:2685078 -2 10 MGI pseudogene 30.08426 30.08534 NA - NA MGI:MGI:3647013 -3 10 MGI pseudogene 30.17971 30.18022 NA + NA MGI:MGI:3781001 -4 10 MGI gene 30.19457 30.20060 NA - NA MGI:MGI:1913561 -5 10 MGI pseudogene 30.37292 30.37556 NA + NA MGI:MGI:3643405 -6 10 MGI gene 30.45052 30.45170 NA + NA MGI:MGI:5623507 - Name Parent Dbxref -1 Gm232 NCBI_Gene:212813,ENSEMBL:ENSMUSG00000111554 -2 Gm8767 NCBI_Gene:667696,ENSEMBL:ENSMUSG00000111001 -3 Gm2829 NCBI_Gene:100040542,ENSEMBL:ENSMUSG00000110776 -4 Cenpw NCBI_Gene:66311,ENSEMBL:ENSMUSG00000075266 -5 Gm4780 NCBI_Gene:212815,ENSEMBL:ENSMUSG00000111047 -6 Gm40622 NCBI_Gene:105245128 - mgiName bioType Alias -1 predicted gene 232 pseudogene -2 predicted gene 8767 pseudogene -3 predicted gene 2829 pseudogene -4 centromere protein W protein coding gene -5 predicted gene 4780 pseudogene -6 predicted gene%2c 40622 lncRNA gene -~~~ -{: .output} - -The `genes` object contains annotation information for each gene in the interval. - -Next, we will create a plot with two panels: one containing the association mapping LOD scores and one containing the genes in the QTL interval. We do this by passing in the `genes` argument to [plot_snpasso](https://github.com/rqtl/qtl2/blob/master/R/plot_snpasso.R). We can also adjust the proportion of the plot allocated for SNPs (on the top) and genes (on th bottom) using the 'top_panel_prop' argument. - - -~~~ -plot_snpasso(assoc$lod, assoc$snpinfo, main = "Proportion of Micro-nucleated Bone Marrow Reticulocytes", genes = genes, top_panel_prop = 0.5) -~~~ -{: .r} - -plot of chunk plot_assoc2 - -### Searching for Candidate Genes - -One strategy for finding genes related to a phenotype is to search for genes with expression QTL (eQTL) in the same location. Ideally, we would have liver and bone marrow gene expression data in the DO mice from this experiment. Unfortunately, we did not collect this data. However, we have liver gene expression for a separate set of untreated DO mice [Liver eQTL Viewer](https://churchilllab.jax.org/qtlviewer/svenson/DOHFD). We searched for genes in the QTL interval that had an eQTL in the same location. Then, we looked at the pattern of founder effects to see if CAST stood out. We found two genes that met these criteria. - -![](../fig/French.et.al.Figure3.png) - -As you can see, both Sult3a1 and Gm4794 have eQTL in the same location on Chr 10 and mice with CAST allele (in green) express these genes more highly. Sult3a1 is a sulfotransferase that may be involved in adding a sulphate group to phenol, one of the metabolites of benzene. Go to the Ensembl web page for [Gm4794](http://www.ensembl.org/Mus_musculus/Gene/Summary?db=core;g=ENSMUSG00000090298;r=10:33766424-33786704). Note that Gm4794 has a new name: Sult3a2. In the menu on the left, click on the "Gene Tree (image)" link. - -![](../fig/EnsEMBL_Sult3a1_Gm4794_paralog.png) - -As you can see, Sult3a2 is a paralog of Sult3a1 and hence both genes are sulfotransferases. These genes encode enzymes that attach a sulfate group to other compounds. - -We also looked at a public gene expression database in which liver, spleen and kidney gene expression were measured in 26 inbred strains, including the eight DO founders. You can search for Sult3a1 and Gm4794 in this [strain survey data](http://cgd.jax.org/gem/strainsurvey26/v1). We did this and plotted the spleen and liver expression values. We did not have bone marrow expression data from this experiment. We also plotted the expression of all of the genes in the QTL support interval that were measured on the array (data not shown). Sult3a1 and its paralog Gm4794 were the only genes with a different expression pattern in CAST. Neither gene was expressed in the spleen. - -![](../fig/French.et.al.Sup.Figure2.png) - -Next, go to the [Sanger Mouse Genomes](http://www.sanger.ac.uk/sanger/Mouse_SnpViewer/rel-1505) website and enter Sult3a1 into the Gene box. Scroll down and check only the DO/CC founders (129S1/SvImJ, A/J, CAST/EiJ, NOD/ShiLtJ, NZO/HlLtJ & WSB/EiJ) and then scroll up and press `Search`. This will show you SNPs in Sult3a1. Select the `Structural Variants` tab and note the copy number gain in CAST from 33,764,194 to 33,876,194 bp. Click on the G to see the location, copy this position (10:33764194-33876194) and go to the [Ensembl website](http://useast.ensembl.org/Mus_musculus/Info/Index). Enter the position into the search box and press `Go`. You will see a figure similar to the one below. - -![](../fig/EnsEMBL.Sult3a1.png) - -Note that both Gm4794 (aka Sult3a2) and part of Sult3a1 are in the copy number gain region. - -In order to visualize the size of the copy number gain, we queried the [Sanger Mouse Genomes alignment files](ftp://ftp-mouse.sanger.ac.uk/current_bams/) for the eight founders. We piled up the reads at each base (which is beyond the scope of this tutorial) and made the figure below. - -![](../fig/French.et.al.Sup.Figure3.png) - -As you can see, there appears to be a duplicatation in the CAST founders that covers four genes: Clvs2, Gm15939, Gm4794 and Sult3a1. Clvs2 is expressed in neurons and Gm15939 is a predicted gene that may not produce a transcript. - -Hence, we have three pieces of evidence that narrows our candidate gene list to Sult3a1 and Gm4794: - -1. Both genes have a liver eQTL in the same location as the micronucleated reticulocytes QTL. -2. Among genes in the micronucleated reticulocytes QTL interval, only Sult3a1 and Gm4794 have differential expression of the CAST allele in the liver. -3. There is a copy number gain of these two genes in CAST. - -Sulfation is a prominent detoxification mechanism for benzene as well. The diagram below shows the metabolism pathway for benzene [(Monks, T. J., et al. (2010). Chem Biol Interact 184(1-2): 201-206.)](http://europepmc.org/articles/PMC4414400) Hydroquinone, phenol and catechol are all sulfated and excreted from the body. - -![](../fig/Monks_ChemBiolInter_2010_Fig1.jpg) - -This analysis has led us to the following hypothesis. Inhaled benzene is absorbed by the lungs into the bloodstream and transported to the liver. There, it is metabolized, and some metabolites are transported to the bone marrow. One class of genes that is involved in toxicant metabolism are sulfotransferases. [Sult3a1](http://www.informatics.jax.org/marker/MGI:1931469) is a phase II enzyme that conjugates compounds (such as phenol, which is a metabolite of benzene) with a sulfate group before transport into the bile. It is possible that a high level of Sult3a1 expression could remove benzene by-products and be protective. Our hypothesis is that the copy number gain in the CAST allele increases liver gene expression of Sult3a1 and Gm4794. High liver expression of these genes allows mice containing the CAST allele to rapidly conjugate harmful benzene metabolites and excrete them from the body before they can reach the bone marrow and cause DNA damage. Further experimental validation is required, but this is a plausible hypothesis. - -![](../fig/benzene_hypothesis.png) - -We hope that this tutorial has shown you how the DO can be used to map QTL and use the founder effects and bioinformatics resources to narrow down the candidate gene list. Here, we made used of external gene expression databases and the founder sequence data to build a case for a pair of genes. - - -> ## Challenge 6 Map another trait. -> 1). Make a histogram of the column `pre.prop.MN.RET` in `pheno`. Does it look like it should be log transformed? If so, add a column to `pheno` that contains the log of `pre.prop.MN.RET`. -> 2). Perform a genome scan *using all samples* on the column called `pre.prop.MN.RET`. Since you're using all samples, the scan will take longer. (Hint: set `index` to the column index in `pheno`.) -> 3). Which chromosome has the higest peak? Use `find_peaks` to get the location and support interval for the highest peak. -> 4). Calculate and plot the BLUPs for the chromosome with the highest peak. (This may take a few minutes.) -> 5). Perform association mapping in the support interval for the QTL peak, plot the results and plot the genes beneath the association mapping plot. -> > ## Solution to Challenge 6 -> > 1). `hist(pheno$pre.prop.MN.RET)` -> > `pheno$log.pre.MN.RET <- log(pheno$pre.prop.MN.RET)` -> > 2). `index <- which(colnames(pheno) == "log.pre.MN.RET")` -> > `addcovar <- model.matrix(~Study + Conc, data = pheno)[,-1]` -> > `qtl_pre <- scan1(genoprobs = probs, pheno = pheno[,index, drop = FALSE], kinship = K, addcovar = addcovar)` -> > `plot_scan1(x = qtl_pre, map = map, main = "Log-Transformed Pre-dose micronucleated reticulocytes")` -> > 3). `find_peaks(qtl_pre, map, threshold = 10, prob = 0.95)` -> > 4). `chr <- 4` -> > `coef4 <- scan1blup(genoprobs = probs[,chr], pheno = pheno[,index, drop = FALSE], kinship = K[[chr]], addcovar = addcovar)` -> > `plot_coefCC(x = coef4, map = map, scan1_output = qtl, main = "Log-Transformed Pre-dose micronucleated reticulocytes")` -> > 5). `chr <- 4` -> > `start <- 132.5` -> > `end <- 136` -> > `assoc4 = scan1snps(genoprobs = probs[,chr], map = map, pheno = pheno[,index,drop = FALSE], kinship = K, addcovar = addcovar, query_func = query_func, chr = chr, start = start, end = end, keep_all_snps = TRUE)` -> > `genes <- query_genes(chr, start, end)` -> > `plot_snpasso(assoc4$lod, assoc4$snpinfo, main = "Log-Transformed Pre-dose micronucleated reticulocytes", genes = genes)` -> {: .solution} -{: .challenge} - - diff --git a/_layouts/base.html b/_layouts/base.html index 365d2e5..34a4aae 100644 --- a/_layouts/base.html +++ b/_layouts/base.html @@ -13,11 +13,11 @@ {% if site.carpentry == "swc" %} - + {% elsif site.carpentry == "dc" %} - + {% elsif site.carpentry == "lc" %} - + {% endif %} diff --git a/_layouts/workshop.html b/_layouts/workshop.html index b873e4c..5b89ceb 100644 --- a/_layouts/workshop.html +++ b/_layouts/workshop.html @@ -29,11 +29,11 @@ {% if site.carpentry == "swc" %} - + {% elsif site.carpentry == "dc" %} - + {% elsif site.carpentry == "lc" %} - + {% endif %} diff --git a/favicon-dc.ico b/assets/img/favicon-dc.ico similarity index 100% rename from favicon-dc.ico rename to assets/img/favicon-dc.ico diff --git a/favicon-lc.ico b/assets/img/favicon-lc.ico similarity index 100% rename from favicon-lc.ico rename to assets/img/favicon-lc.ico diff --git a/favicon-swc.ico b/assets/img/favicon-swc.ico similarity index 100% rename from favicon-swc.ico rename to assets/img/favicon-swc.ico diff --git a/bin/generate_md_episodes.R b/bin/generate_md_episodes.R index f2a40ba..f90c347 100644 --- a/bin/generate_md_episodes.R +++ b/bin/generate_md_episodes.R @@ -22,8 +22,15 @@ generate_md_episodes <- function() { install.packages(missing_pkgs) } - ## find all the Rmd files, and generate the paths for their respective outputs + ## find all the Rmd files src_rmd <- list.files(pattern = "??-*.Rmd$", path = "_episodes_rmd", full.names = TRUE) + + ## Create _episodes directory if it does not exist + if (!dir.exists("_episodes")) { + dir.create("_episodes") + } + + ## generate the paths for their respective outputs dest_md <- file.path("_episodes", gsub("Rmd$", "md", basename(src_rmd))) ## knit the Rmd into markdown @@ -33,4 +40,4 @@ generate_md_episodes <- function() { } -generate_md_episodes() +generate_md_episodes() \ No newline at end of file diff --git a/bin/lesson_initialize.py b/bin/lesson_initialize.py index fc7baf7..a78535e 100755 --- a/bin/lesson_initialize.py +++ b/bin/lesson_initialize.py @@ -380,7 +380,7 @@ ('reference.md', ROOT_REFERENCE_MD), ('setup.md', ROOT_SETUP_MD), ('aio.md', ROOT_AIO_MD), - ('_episodes/01-introduction.md', EPISODES_INTRODUCTION_MD), + ('Introduction.md', EPISODES_INTRODUCTION_MD), ('_extras/about.md', EXTRAS_ABOUT_MD), ('_extras/discuss.md', EXTRAS_DISCUSS_MD), ('_extras/figures.md', EXTRAS_FIGURES_MD), diff --git a/fig/rmd-03-plot_genoprob-1.png b/fig/rmd-03-plot_genoprob-1.png deleted file mode 100644 index c15290e..0000000 Binary files a/fig/rmd-03-plot_genoprob-1.png and /dev/null differ diff --git a/fig/rmd-06-hist_perm-1.png b/fig/rmd-06-hist_perm-1.png deleted file mode 100644 index b8a852b..0000000 Binary files a/fig/rmd-06-hist_perm-1.png and /dev/null differ diff --git a/fig/rmd-06-plot_lod-1.png b/fig/rmd-06-plot_lod-1.png deleted file mode 100644 index bb32ba7..0000000 Binary files a/fig/rmd-06-plot_lod-1.png and /dev/null differ diff --git a/fig/rmd-06-regression_plot-1.png b/fig/rmd-06-regression_plot-1.png deleted file mode 100644 index d1e96d8..0000000 Binary files a/fig/rmd-06-regression_plot-1.png and /dev/null differ diff --git a/fig/rmd-08-plot_kinship-1.png b/fig/rmd-08-plot_kinship-1.png deleted file mode 100644 index a465e6f..0000000 Binary files a/fig/rmd-08-plot_kinship-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_DOex_effects-1.png b/fig/rmd-11-plot_DOex_effects-1.png deleted file mode 100644 index 27ab11b..0000000 Binary files a/fig/rmd-11-plot_DOex_effects-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_DOex_scan-1.png b/fig/rmd-11-plot_DOex_scan-1.png deleted file mode 100644 index 580865b..0000000 Binary files a/fig/rmd-11-plot_DOex_scan-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_effects_liver_c2-1.png b/fig/rmd-11-plot_effects_liver_c2-1.png deleted file mode 100644 index f7839a6..0000000 Binary files a/fig/rmd-11-plot_effects_liver_c2-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_effects_liver_simple-1.png b/fig/rmd-11-plot_effects_liver_simple-1.png deleted file mode 100644 index db6f7dd..0000000 Binary files a/fig/rmd-11-plot_effects_liver_simple-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_effects_pg_liver_c2-1.png b/fig/rmd-11-plot_effects_pg_liver_c2-1.png deleted file mode 100644 index b1e7859..0000000 Binary files a/fig/rmd-11-plot_effects_pg_liver_c2-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_pheno_geno-1.png b/fig/rmd-11-plot_pheno_geno-1.png deleted file mode 100644 index 3047a05..0000000 Binary files a/fig/rmd-11-plot_pheno_geno-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_pheno_geno_se-1.png b/fig/rmd-11-plot_pheno_geno_se-1.png deleted file mode 100644 index 7d36042..0000000 Binary files a/fig/rmd-11-plot_pheno_geno_se-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_snp_asso-1.png b/fig/rmd-11-plot_snp_asso-1.png deleted file mode 100644 index ede1a72..0000000 Binary files a/fig/rmd-11-plot_snp_asso-1.png and /dev/null differ diff --git a/fig/rmd-11-plot_snp_asso_hilit-1.png b/fig/rmd-11-plot_snp_asso_hilit-1.png deleted file mode 100644 index 4cca296..0000000 Binary files a/fig/rmd-11-plot_snp_asso_hilit-1.png and /dev/null differ diff --git a/fig/rmd-11-plotblup-1.png b/fig/rmd-11-plotblup-1.png deleted file mode 100644 index e3fb697..0000000 Binary files a/fig/rmd-11-plotblup-1.png and /dev/null differ diff --git a/fig/rmd-12-geno_plot-1.png b/fig/rmd-12-geno_plot-1.png deleted file mode 100644 index b973e91..0000000 Binary files a/fig/rmd-12-geno_plot-1.png and /dev/null differ diff --git a/fig/rmd-12-id_and_plot_genes-1.png b/fig/rmd-12-id_and_plot_genes-1.png deleted file mode 100644 index a5fe704..0000000 Binary files a/fig/rmd-12-id_and_plot_genes-1.png and /dev/null differ diff --git a/fig/rmd-12-kinship_probs-1.png b/fig/rmd-12-kinship_probs-1.png deleted file mode 100644 index 006dc01..0000000 Binary files a/fig/rmd-12-kinship_probs-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_DOex_effects-1.png b/fig/rmd-12-plot_DOex_effects-1.png deleted file mode 100644 index a560f7c..0000000 Binary files a/fig/rmd-12-plot_DOex_effects-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_DOex_lod_curve-1.png b/fig/rmd-12-plot_DOex_lod_curve-1.png deleted file mode 100644 index 29ed22f..0000000 Binary files a/fig/rmd-12-plot_DOex_lod_curve-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_DOex_scan-1.png b/fig/rmd-12-plot_DOex_scan-1.png deleted file mode 100644 index 1b59126..0000000 Binary files a/fig/rmd-12-plot_DOex_scan-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_gwas_scan-1.png b/fig/rmd-12-plot_gwas_scan-1.png deleted file mode 100644 index 53d0b5f..0000000 Binary files a/fig/rmd-12-plot_gwas_scan-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_snp_asso-1.png b/fig/rmd-12-plot_snp_asso-1.png deleted file mode 100644 index 32b12b1..0000000 Binary files a/fig/rmd-12-plot_snp_asso-1.png and /dev/null differ diff --git a/fig/rmd-12-plot_snp_asso_wplot-1.png b/fig/rmd-12-plot_snp_asso_wplot-1.png deleted file mode 100644 index 32b12b1..0000000 Binary files a/fig/rmd-12-plot_snp_asso_wplot-1.png and /dev/null differ diff --git a/fig/rmd-13-assoc_fig-1.png b/fig/rmd-13-assoc_fig-1.png deleted file mode 100644 index e83bb04..0000000 Binary files a/fig/rmd-13-assoc_fig-1.png and /dev/null differ diff --git a/fig/rmd-13-coef_plot-1.png b/fig/rmd-13-coef_plot-1.png deleted file mode 100644 index 413ab80..0000000 Binary files a/fig/rmd-13-coef_plot-1.png and /dev/null differ diff --git a/fig/rmd-13-geno_plot-1.png b/fig/rmd-13-geno_plot-1.png deleted file mode 100644 index 63beddc..0000000 Binary files a/fig/rmd-13-geno_plot-1.png and /dev/null differ diff --git a/fig/rmd-13-hist_log_transform-1.png b/fig/rmd-13-hist_log_transform-1.png deleted file mode 100644 index 97d92aa..0000000 Binary files a/fig/rmd-13-hist_log_transform-1.png and /dev/null differ diff --git a/fig/rmd-13-hist_untransformed-1.png b/fig/rmd-13-hist_untransformed-1.png deleted file mode 100644 index 888519b..0000000 Binary files a/fig/rmd-13-hist_untransformed-1.png and /dev/null differ diff --git a/fig/rmd-13-kinship_probs-1.png b/fig/rmd-13-kinship_probs-1.png deleted file mode 100644 index f9c1ef5..0000000 Binary files a/fig/rmd-13-kinship_probs-1.png and /dev/null differ diff --git a/fig/rmd-13-plot_assoc2-1.png b/fig/rmd-13-plot_assoc2-1.png deleted file mode 100644 index 3c67e1a..0000000 Binary files a/fig/rmd-13-plot_assoc2-1.png and /dev/null differ diff --git a/fig/rmd-13-qtl_plot-1.png b/fig/rmd-13-qtl_plot-1.png deleted file mode 100644 index d1620a7..0000000 Binary files a/fig/rmd-13-qtl_plot-1.png and /dev/null differ diff --git a/fig/rmd-13-qtl_plot_thr-1.png b/fig/rmd-13-qtl_plot_thr-1.png deleted file mode 100644 index a503d57..0000000 Binary files a/fig/rmd-13-qtl_plot_thr-1.png and /dev/null differ diff --git a/fig/rmd-15-assoc_fig-1.png b/fig/rmd-15-assoc_fig-1.png deleted file mode 100644 index 863179e..0000000 Binary files a/fig/rmd-15-assoc_fig-1.png and /dev/null differ diff --git a/fig/rmd-15-coef_plot-1.png b/fig/rmd-15-coef_plot-1.png deleted file mode 100644 index 24c9b38..0000000 Binary files a/fig/rmd-15-coef_plot-1.png and /dev/null differ diff --git a/fig/rmd-15-geno_plot-1.png b/fig/rmd-15-geno_plot-1.png deleted file mode 100644 index b973e91..0000000 Binary files a/fig/rmd-15-geno_plot-1.png and /dev/null differ diff --git a/fig/rmd-15-kinship_probs-1.png b/fig/rmd-15-kinship_probs-1.png deleted file mode 100644 index 006dc01..0000000 Binary files a/fig/rmd-15-kinship_probs-1.png and /dev/null differ diff --git a/fig/rmd-15-plot_assoc2-1.png b/fig/rmd-15-plot_assoc2-1.png deleted file mode 100644 index 4f87ecc..0000000 Binary files a/fig/rmd-15-plot_assoc2-1.png and /dev/null differ diff --git a/fig/rmd-15-qtl_plot-1.png b/fig/rmd-15-qtl_plot-1.png deleted file mode 100644 index 2e934de..0000000 Binary files a/fig/rmd-15-qtl_plot-1.png and /dev/null differ diff --git a/fig/rmd-15-qtl_plot_thr-1.png b/fig/rmd-15-qtl_plot_thr-1.png deleted file mode 100644 index 277e4cf..0000000 Binary files a/fig/rmd-15-qtl_plot_thr-1.png and /dev/null differ diff --git a/fig/rmd-16-assoc_fig-1.png b/fig/rmd-16-assoc_fig-1.png deleted file mode 100644 index 863179e..0000000 Binary files a/fig/rmd-16-assoc_fig-1.png and /dev/null differ diff --git a/fig/rmd-16-coef_plot-1.png b/fig/rmd-16-coef_plot-1.png deleted file mode 100644 index 24c9b38..0000000 Binary files a/fig/rmd-16-coef_plot-1.png and /dev/null differ diff --git a/fig/rmd-16-geno_plot-1.png b/fig/rmd-16-geno_plot-1.png deleted file mode 100644 index b973e91..0000000 Binary files a/fig/rmd-16-geno_plot-1.png and /dev/null differ diff --git a/fig/rmd-16-kinship_probs-1.png b/fig/rmd-16-kinship_probs-1.png deleted file mode 100644 index 006dc01..0000000 Binary files a/fig/rmd-16-kinship_probs-1.png and /dev/null differ diff --git a/fig/rmd-16-plot_assoc2-1.png b/fig/rmd-16-plot_assoc2-1.png deleted file mode 100644 index 4f87ecc..0000000 Binary files a/fig/rmd-16-plot_assoc2-1.png and /dev/null differ diff --git a/fig/rmd-16-qtl_plot-1.png b/fig/rmd-16-qtl_plot-1.png deleted file mode 100644 index 2e934de..0000000 Binary files a/fig/rmd-16-qtl_plot-1.png and /dev/null differ diff --git a/fig/rmd-16-qtl_plot_thr-1.png b/fig/rmd-16-qtl_plot_thr-1.png deleted file mode 100644 index 277e4cf..0000000 Binary files a/fig/rmd-16-qtl_plot_thr-1.png and /dev/null differ