diff --git a/PCGC/PCGC_code/cufflinks.sh b/PCGC/PCGC_code/cufflinks.sh new file mode 100644 index 0000000..5d4878e --- /dev/null +++ b/PCGC/PCGC_code/cufflinks.sh @@ -0,0 +1,17 @@ +#!/bin/bash +#$ -cwd +bam=$1 +gtf=$2 +dir=$3 +out="Cufflinksout" + +if [ ! -d $out ]; then mkdir -p $out; fi +if [ ! -d $dir ]; then mkdir -p $dir; fi + +bam_base=`basename $bam .bam` +individualOut=$bam_base"_cufflinks" +cmd="/ifs/data/c2b2/ngs_lab/ngs/usr/bin/cufflinks -o $out/$individualOut --compatible-hits-norm --GTF $gtf $bam" +echo -e "do cufflinks with ref genes: \n $cmd" +$cmd +cp ./$out/$individualOut/genes.fpkm_tracking $dir/${bam_base}.genes.fpkm_tracking + diff --git a/PCGC/PCGC_code/handleStar.sh b/PCGC/PCGC_code/handleStar.sh new file mode 100644 index 0000000..6efb892 --- /dev/null +++ b/PCGC/PCGC_code/handleStar.sh @@ -0,0 +1,17 @@ +#!/bin/bash -l +#$-cwd + +r1=$1 +r2=$2 +code=/ifs/home/c2b2/ys_lab/wm2313/code/ +log=starlogs + +if [ ! -d $log ] ; then mkdir -p $log; fi + +r1_base=`basename $r1 .gz` +r2_base=`basename $r2 .gz` + +echo $r1_base + +## qsub -l mem=11G,time=16:: -P shen-chung -pe smp 4 -m e -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 +qsub -l mem=6G,time=14:: -P ys_lab -pe smp 6 -o $log/star.$r1_base.o -e $log/star.$r1_base.e -N star.$r1_base $code/star.sh $r1 $r2 diff --git a/PCGC/PCGC_code/run_GTEx_star.sh b/PCGC/PCGC_code/run_GTEx_star.sh new file mode 100644 index 0000000..33e8a42 --- /dev/null +++ b/PCGC/PCGC_code/run_GTEx_star.sh @@ -0,0 +1,15 @@ +#!/bin/bash +#$ -cwd + +dir=$1 ## the dir of fastq files +## /ifs/home/c2b2/ys_lab/wm2313/gtex/Heart_reads/fastq/ +for r1 in $dir/*_1.fastq.gz; do + r1_base=`basename $r1 _1.fastq.gz` + r2=$dir/${r1_base}_2.fastq.gz + + if [[ -e $r2 ]]; then + echo $r1 + echo $r2 + sh handleStar.sh $r1 $r2; + fi +done diff --git a/PCGC/PCGC_code/star.sh b/PCGC/PCGC_code/star.sh new file mode 100644 index 0000000..cec5782 --- /dev/null +++ b/PCGC/PCGC_code/star.sh @@ -0,0 +1,82 @@ +#!/bin/bash -l +#$ -cwd + +R1=$1 +R2=$2 + +## check the length of reads to choose star configuration +read=`zcat -d $R1| head | tail -n +10` +readLength=`echo ${#read}` +echo "fastq read length: "$readLength + +out="StarBam" +if [ ! -d $out ] ; then mkdir -p $out; fi + +R1_base=`basename $R1` +cd $out + +if [ -d ${R1_base} ] ; then rm -r ${R1_base}; fi + +mkdir -p ${R1_base} +cd ${R1_base} + +echo "star align begin..." +date + +if [[ $readLength -gt 60 ]]; + then + genomeDB="~/scratch/softwareMakeup/genome100" + seedStart=50 + else + genomeDB="/ifs/home/c2b2/ys_lab/wm2313/data/softwares/STAR_2.3.0e/genome" + seedStart=30 +fi + +/ifs/home/c2b2/ys_lab/wm2313/data/softwares/STAR_2.3.0e/STAR --genomeDir $genomeDB --sjdbGTFfile /ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/chr_genes.gtf --readFilesCommand zcat --seedSearchStartLmax $seedStart --runThreadN 4 --outSAMunmapped Within --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --readFilesIn $R1 $R2 --outStd SAM --outSAMmode Full | ~/data/softwares/samtools-0.1.19/samtools view -bS -> ${R1_base}.bam + +## --outSAMstrandField intronMotif is used to add XS strand attribute for alignments that contain splice junctions. +## --outFilterIntronMotifs RemoveNoncanonical filter out alignments that contain non-canonical junctions + +echo "star alignment finished: " +date + +bam=${R1_base}.bam +sortbam=${R1_base}.hg19UCSC.star.sort.bam +~/data/softwares/samtools-0.1.19/samtools sort $bam ${R1_base}.hg19UCSC.star.sort +echo "samtools sort finished: " +date + +if [[ -e $sortbam ]]; +then +rm -f $bam +fi + +~/data/softwares/samtools-0.1.19/samtools index $sortbam +echo "samtools index finished: " +date + +if [[ -e ${R1_base}".hg19UCSC.star.sort.bam.bai" ]]; + + then + echo ${R1_base} done! >> ../starbam.checklist + + else + echo echo ${R1_base} failed! >> ../starbam.checklist +fi + + + +logs="analysis_logs" +if [ ! -d $logs ] ; then mkdir -p $logs; fi +currentDir=`pwd` +parentDir=`dirname $currentDir` +## gtf=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/chr_genes.gtf +gtf=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP/gencode.v19.annotation.gtf + +## peform alignment QC +qsub -l mem=5G,time=5:: -pe smp 6 -o $logs/${R1_base}.RNAseqc.o -e $logs/${R1_base}.RNAseqc.e -N RNASEQC.${R1_base} ~/code/star_RNA-SeQC_pipe.sh $sortbam $gtf + +## compute FPKM with cufflinks +mkdir $parentDir/fpkms ## where to put the fpkm files(gene level) +qsub -l mem=1G,time=16:: -pe smp 6 -m e -o $logs/${R1_base}.cufflink.o -e $logs/${R1_base}.cufflink.e -N cufflinks.${R1_base} ~/code/cufflinks.sh $sortbam $gtf $parentDir/fpkms + diff --git a/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh new file mode 100644 index 0000000..bc75a55 --- /dev/null +++ b/PCGC/PCGC_code/star_RNA-SeQC_pipe.sh @@ -0,0 +1,61 @@ +#!/bin/bash -l +#$ -cwd + +bam=$1 +gtf=$2 + +gsnap_dir=/ifs/home/c2b2/ys_lab/wm2313/data/GSNAP +reference=$gsnap_dir/hg19.fa +bam_base=`basename $bam .bam` +sample=${bam_base} +reheaderbam=${bam_base}".reheader.bam" + +if [[ ! -e $reheaderbam.bai ]]; then + +echo reheader the bam files for GATK... +~/data/softwares/jdk1.7.0_60/bin/java -jar -Xmx3g /ifs/home/c2b2/ys_lab/wm2313/data/softwares/picard/AddOrReplaceReadGroups.jar I=$bam O=$reheaderbam LB=XX PL=XX PU=XX SM=XX +echo index the new bam file.. +~/data/softwares/samtools-0.1.19/samtools index $reheaderbam + +fi + + +echo perform RNA-SeQC.. +~/data/softwares/jdk1.7.0_60/bin/java -jar -Xmx10g /ifs/home/c2b2/ys_lab/wm2313/data/softwares/RNA-SeQC_v1.1.7.jar -o ./"RNA-SeQC" -gatkFlags "-DBQ 1" -r $reference -t $gtf -s "$sample|$reheaderbam|NA" + + + +## get a summary of the mapping quality +cd RNA-SeQC + +summary=../../rnaseqc.info +if [[ ! -e $summary ]]; then + echo -e "sample\tReadsNO\tmappingRate\tintraRate\texonicRate\tintronicRate\tintergenicRate\treadsLength" > $summary +fi + +ID=`echo ${bam_base} | cut -d '.' -f1-3` + +if [[ -e metrics.tsv ]]; then + + data=`tail -n +2 metrics.tsv` + intraRate=`echo $data | cut -d " " -f5,5` + exonicRate=`echo $data | cut -d " " -f7,7` + + mappingRate=`echo $data | cut -d " " -f8,8` + + readsLength=`echo $data | cut -d " " -f12,12` + mappedNO=`echo $data | cut -d " " -f17,17` + + intergenicRate=`echo $data | cut -d " " -f18,18` + intronicRate=`echo $data | cut -d " " -f28,28` + readsNO=`echo "$mappedNO/$mappingRate" | bc` + + echo -e "$ID\t$readsNO\t$mappingRate\t$intraRate\t$exonicRate\t$intronicRate\t$intergenicRate\t$readsLength" >> $summary + + cd .. + rm -f $reheaderbam + rm -f $reheaderbam.bai + +else + echo -e "$ID\tfails on RNA-SeQC" >> $summary +fi