From 779f8a914ca0d863644179f70e470590b1dad65c Mon Sep 17 00:00:00 2001 From: Wenji Ma % Yufeng Date: Thu, 14 Aug 2014 16:33:06 -0400 Subject: [PATCH 1/3] star pipe --- PCGC/PCGC_code/cufflinks.sh | 17 ++++++ PCGC/PCGC_code/handleStar.sh | 18 ++++++ PCGC/PCGC_code/run_GTEx_star.sh | 15 +++++ PCGC/PCGC_code/star.sh | 82 ++++++++++++++++++++++++++++ PCGC/PCGC_code/star_RNA-SeQC_pipe.sh | 62 +++++++++++++++++++++ 5 files changed, 194 insertions(+) create mode 100644 PCGC/PCGC_code/cufflinks.sh create mode 100644 PCGC/PCGC_code/handleStar.sh create mode 100644 PCGC/PCGC_code/run_GTEx_star.sh create mode 100644 PCGC/PCGC_code/star.sh create mode 100644 PCGC/PCGC_code/star_RNA-SeQC_pipe.sh diff --git a/PCGC/PCGC_code/cufflinks.sh b/PCGC/PCGC_code/cufflinks.sh new file mode 100644 index 0000000..91085f5 --- /dev/null +++ b/PCGC/PCGC_code/cufflinks.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#$ -cwd +bam=$1 +gtf=$2 +dir=$3 +out="ensembleCufflinksout" + +if [ ! -d $out ]; then mkdir -p $out; fi +if [ ! -d $dir ]; then mkdir -p $dir; fi + +bam_base=`basename $bam .bam` +individualOut=$bam_base"_cufflinks" +cmd="/ifs/data/c2b2/ngs_lab/ngs/usr/bin/cufflinks -o $out/$individualOut --compatible-hits-norm --GTF $gtf $bam" +echo -e "do cufflinks with ref genes: \n $cmd" +$cmd +cp ./$out/$individualOut/genes.fpkm_tracking $dir/${bam_base}.genes.fpkm_tracking + diff --git a/PCGC/PCGC_code/handleStar.sh b/PCGC/PCGC_code/handleStar.sh new file mode 100644 index 0000000..5b4570d --- /dev/null +++ b/PCGC/PCGC_code/handleStar.sh @@ -0,0 +1,18 @@ +#!/bin/bash -l +#$-cwd + +r1=$1 +r2=$2 +code=/ifs/home/c2b2/ys_lab/wm2313/code/ +log=starlogs + +if [ ! -d $log ] ; then mkdir -p $log; fi + +r1_base=`basename $r1 .gz` +r2_base=`basename $r2 .gz` + +echo $r1_base + +# echo qsub -l mem=26G,time=10:: -m e -pe smp 2-4 -o iit_logs/$genome.$r1_base.o -e iit_logs/$genome.$r1_base.e -N do.$r1_base $code/gsnap.sh $genome $r1 $r2 +## qsub -l mem=11G,time=16:: -P shen-chung -pe smp 4 -m e -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 +qsub -l mem=6G,time=14:: -P ys_lab -pe smp 6 -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 diff --git a/PCGC/PCGC_code/run_GTEx_star.sh b/PCGC/PCGC_code/run_GTEx_star.sh new file mode 100644 index 0000000..33e8a42 --- /dev/null +++ b/PCGC/PCGC_code/run_GTEx_star.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#$ -cwd + +dir=$1 ## the dir of fastq files +## /ifs/home/c2b2/ys_lab/wm2313/gtex/Heart_reads/fastq/ +for r1 in $dir/*_1.fastq.gz; do + r1_base=`basename $r1 _1.fastq.gz` + r2=$dir/${r1_base}_2.fastq.gz + + if [[ -e $r2 ]]; then + echo $r1 + echo $r2 + sh handleStar.sh $r1 $r2; + fi +done diff --git a/PCGC/PCGC_code/star.sh b/PCGC/PCGC_code/star.sh new file mode 100644 index 0000000..eb6c548 --- /dev/null +++ b/PCGC/PCGC_code/star.sh @@ -0,0 +1,82 @@ +#!/bin/bash -l +#$ -cwd + +R1=$1 +R2=$2 + +## check the length of the read to choose star configuration +read=`zcat -d $R1| head | tail -n +10` +readLength=`echo ${#read}` +echo "fastq read length: "$readLength + +out="StarBamOption" +if [ ! -d $out ] ; then mkdir -p $out; fi + +R1_base=`basename $R1` +cd $out + +if [ -d ${R1_base} ] ; then rm -r ${R1_base}; fi + +mkdir -p ${R1_base} +cd ${R1_base} + +echo "star align begin..." +date + +if [[ $readLength -gt 60 ]]; + then + genomeDB="~/scratch/softwareMakeup/genome100" + seedStart=50 + else + genomeDB="/ifs/home/c2b2/ys_lab/wm2313/data/softwares/STAR_2.3.0e/genome" + seedStart=30 +fi + +/ifs/home/c2b2/ys_lab/wm2313/data/softwares/STAR_2.3.0e/STAR --genomeDir $genomeDB --sjdbGTFfile /ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/chr_genes.gtf --readFilesCommand zcat --seedSearchStartLmax $seedStart --runThreadN 4 --outSAMunmapped Within --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --readFilesIn $R1 $R2 --outStd SAM --outSAMmode Full | ~/data/softwares/samtools-0.1.19/samtools view -bS -> ${R1_base}.bam + +## --outSAMstrandField intronMotif is used to add XS strand attribute for alignments that contain splice junctions. +## --outFilterIntronMotifs RemoveNoncanonical filter out alignments that contain non-canonical junctions + +echo "star alignment finished: " +date + +bam=${R1_base}.bam +sortbam=${R1_base}.hg19UCSC.star.sort.bam +~/data/softwares/samtools-0.1.19/samtools sort $bam ${R1_base}.hg19UCSC.star.sort +echo "samtools sort finished: " +date + +if [[ -e $sortbam ]]; +then +rm -f $bam +fi + +~/data/softwares/samtools-0.1.19/samtools index $sortbam +echo "samtools index finished: " +date + +if [[ -e ${R1_base}".hg19UCSC.star.sort.bam.bai" ]]; + + then + echo ${R1_base} done! >> ../starbam.checklist + + else + echo echo ${R1_base} failed! >> ../starbam.checklist +fi + + + +logs="analysis_logs" +if [ ! -d $logs ] ; then mkdir -p $logs; fi +currentDir=`pwd` +parentDir=`dirname $currentDir` +## gtf=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/chr_genes.gtf +gtf=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/gencode.v19.annotation.gtf + +## peform alignment QC +qsub -l mem=5G,time=5:: -pe smp 6 -o $logs/${R1_base}.RNAseqc.o -e $logs/${R1_base}.RNAseqc.e -N RNASEQC.${R1_base} ~/code/star_RNA-SeQC_pipe.sh $sortbam $gtf + +## compute FPKM with cufflinks +mkdir $parentDir/fpkms ## where to put the fpkm files(gene level) +qsub -l mem=1G,time=16:: -pe smp 6 -m e -o $logs/${R1_base}.cufflink.o -e $logs/${R1_base}.cufflink.e -N cufflinks.${R1_base} ~/code/cufflinks.sh $sortbam $gtf $parentDir/fpkms + diff --git a/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh new file mode 100644 index 0000000..912adda --- /dev/null +++ b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh @@ -0,0 +1,62 @@ +#!/bin/bash -l +#$ -cwd + +bam=$1 +gtf=$2 + +gsnap_dir=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP +reference=$gsnap_dir/hg19.fa +bam_base=`basename $bam .bam` +sample=${bam_base} +reheaderbam=${bam_base}".reheader.bam" +echo "sample: "$sample + +if [[ ! -e $reheaderbam.bai ]]; then + +echo reheader the bam files for GATK... +~/data/softwares/jdk1.7.0_60/bin/java -jar -Xmx3g /ifs/home/c2b2/ys_lab/wm2313/data/softwares/picard/AddOrReplaceReadGroups.jar I=$bam O=$reheaderbam LB=XX PL=XX PU=XX SM=XX +echo index the new bam file.. +~/data/softwares/samtools-0.1.19/samtools index $reheaderbam + +fi + + +echo perform RNA-SeQC.. +~/data/softwares/jdk1.7.0_60/bin/java -jar -Xmx10g /ifs/home/c2b2/ys_lab/wm2313/data/softwares/RNA-SeQC_v1.1.7.jar -o ./"RNA-SeQC" -gatkFlags "-DBQ 1" -r $reference -t $gtf -s "$sample|$reheaderbam|NA" + + + +## get the summary of the mapping quality +cd RNA-SeQC + +summary=../../rnaseqc.info +if [[ ! -e $summary ]]; then + echo -e "sample\tReadsNO\tmappingRate\tintraRate\texonicRate\tintronicRate\tintergenicRate\treadsLength" > $summary +fi + +ID=`echo ${bam_base} | cut -d '.' -f1-3` + +if [[ -e metrics.tsv ]]; then + + data=`tail -n +2 metrics.tsv` + intraRate=`echo $data | cut -d " " -f5,5` + exonicRate=`echo $data | cut -d " " -f7,7` + + mappingRate=`echo $data | cut -d " " -f8,8` + + readsLength=`echo $data | cut -d " " -f12,12` + mappedNO=`echo $data | cut -d " " -f17,17` + + intergenicRate=`echo $data | cut -d " " -f18,18` + intronicRate=`echo $data | cut -d " " -f28,28` + readsNO=`echo "$mappedNO/$mappingRate" | bc` + + echo -e "$ID\t$readsNO\t$mappingRate\t$intraRate\t$exonicRate\t$intronicRate\t$intergenicRate\t$readsLength" >> $summary + + cd .. + rm -f $reheaderbam + rm -f $reheaderbam.bai + +else + echo -e "$ID\tfailure on RNA-SeQC" >> $summary +fi From 4e472a64377d54f3b7cfd077cfb7df03909bbb72 Mon Sep 17 00:00:00 2001 From: Wenji Ma % Yufeng Date: Thu, 14 Aug 2014 17:30:56 -0400 Subject: [PATCH 2/3] star pipeline for RNA-seq data analysis --- PCGC/PCGC_code/cufflinks.sh | 2 +- PCGC/PCGC_code/handleStar.sh | 1 - PCGC/PCGC_code/star.sh | 2 +- PCGC/PCGC_code/star_RNA-SeQC_pipe.sh | 5 ++--- 4 files changed, 4 insertions(+), 6 deletions(-) diff --git a/PCGC/PCGC_code/cufflinks.sh b/PCGC/PCGC_code/cufflinks.sh index 91085f5..5d4878e 100644 --- a/PCGC/PCGC_code/cufflinks.sh +++ b/PCGC/PCGC_code/cufflinks.sh @@ -3,7 +3,7 @@ bam=$1 gtf=$2 dir=$3 -out="ensembleCufflinksout" +out="Cufflinksout" if [ ! -d $out ]; then mkdir -p $out; fi if [ ! -d $dir ]; then mkdir -p $dir; fi diff --git a/PCGC/PCGC_code/handleStar.sh b/PCGC/PCGC_code/handleStar.sh index 5b4570d..6efb892 100644 --- a/PCGC/PCGC_code/handleStar.sh +++ b/PCGC/PCGC_code/handleStar.sh @@ -13,6 +13,5 @@ r2_base=`basename $r2 .gz` echo $r1_base -# echo qsub -l mem=26G,time=10:: -m e -pe smp 2-4 -o iit_logs/$genome.$r1_base.o -e iit_logs/$genome.$r1_base.e -N do.$r1_base $code/gsnap.sh $genome $r1 $r2 ## qsub -l mem=11G,time=16:: -P shen-chung -pe smp 4 -m e -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 qsub -l mem=6G,time=14:: -P ys_lab -pe smp 6 -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 diff --git a/PCGC/PCGC_code/star.sh b/PCGC/PCGC_code/star.sh index eb6c548..56096c4 100644 --- a/PCGC/PCGC_code/star.sh +++ b/PCGC/PCGC_code/star.sh @@ -9,7 +9,7 @@ read=`zcat -d $R1| head | tail -n +10` readLength=`echo ${#read}` echo "fastq read length: "$readLength -out="StarBamOption" +out="StarBam" if [ ! -d $out ] ; then mkdir -p $out; fi R1_base=`basename $R1` diff --git a/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh index 912adda..bc75a55 100644 --- a/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh +++ b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh @@ -9,7 +9,6 @@ reference=$gsnap_dir/hg19.fa bam_base=`basename $bam .bam` sample=${bam_base} reheaderbam=${bam_base}".reheader.bam" -echo "sample: "$sample if [[ ! -e $reheaderbam.bai ]]; then @@ -26,7 +25,7 @@ echo perform RNA-SeQC.. -## get the summary of the mapping quality +## get a summary of the mapping quality cd RNA-SeQC summary=../../rnaseqc.info @@ -58,5 +57,5 @@ if [[ -e metrics.tsv ]]; then rm -f $reheaderbam.bai else - echo -e "$ID\tfailure on RNA-SeQC" >> $summary + echo -e "$ID\tfails on RNA-SeQC" >> $summary fi From 050553af136254cc385c973bbf5df99ee2ebb390 Mon Sep 17 00:00:00 2001 From: Wenji Ma % Yufeng Date: Thu, 14 Aug 2014 17:34:45 -0400 Subject: [PATCH 3/3] star pipeline for RNA-seq data analysis --- PCGC/PCGC_code/star.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/PCGC/PCGC_code/star.sh b/PCGC/PCGC_code/star.sh index 56096c4..cec5782 100644 --- a/PCGC/PCGC_code/star.sh +++ b/PCGC/PCGC_code/star.sh @@ -4,7 +4,7 @@ R1=$1 R2=$2 -## check the length of the read to choose star configuration +## check the length of reads to choose star configuration read=`zcat -d $R1| head | tail -n +10` readLength=`echo ${#read}` echo "fastq read length: "$readLength