#!/bin/bash

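# yeast.GATK.alignment.pipeline
#
# Aligns one paired-end sample with BWA, then filters, sorts, de-duplicates,
# realigns and base-recalibrates it with samtools, Picard and GATK.
#
# Usage: <this script> fastq_dir sample fastq_extension_1 fastq_extension_2 \
#                      genome_name genome_source min_score user
#
# Created by Ben Taylor on 29/11/2013.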
# Tool locations, given relative to the directory the pipeline is launched from.
export bwa_dir=bwa
export samtools_dir=samtools-0.1.19
export java7=jre1.7.0_25/bin/java
export gatk=GenomeAnalysisTK-2.7-2-g6bda569
export picard_dir=picard-tools-1.96
# Extra Perl library directories, appended to whatever PERL5LIB already holds.
export PERL5LIB=$PERL5LIB:perl5/:perl5/lib:perl5/lib/perl5:perl5/libgd:perl5/libgd/bin:perl5/libgd/include:perl_modules:perl_modules/lib64:perl_modules/lib:perl_modules/lib/perl5/:perl_modules/lib/perl5/site_perl:
# The PATH entry is derived from samtools_dir so that a bare `samtools` lookup
# resolves to the same build the pipeline invokes explicitly (previously PATH
# named samtools-0.1.16 while samtools_dir named samtools-0.1.19).
export PATH=$PATH:$samtools_dir:perl5/libgd:perl5/libgd/bin:perl5/libgd/include
#        
# Positional arguments, in call order. All of them are exported below.
fastq_dir=$1           # directory holding the .gz FASTQ files; also receives the final BAM
sample=$2              # file-name prefix of the FASTQ files; also used in the read group
fastq_extension_1=$3   # suffix appended to the sample name for the first FASTQ (before .gz)
fastq_extension_2=$4   # suffix appended to the sample name for the second FASTQ (before .gz)
genome_name=$5         # file-name prefix of the reference files
genome_source=$6       # directory the reference files are copied from
min_score=$7           # value passed to `samtools view -q`
user=$8                # names the scratch directory /tmp/<user>
export fastq_dir sample fastq_extension_1 fastq_extension_2 \
       genome_name genome_source min_score user
#
############################
echo COPY GENOME TO TMP FOLDER
    # Both path components are mandatory: with either one empty, the rm -rf
    # below would target /tmp/ itself or an entire /tmp/<user> directory.
    : "${user:?user (argument 8) must be given}"
    : "${sample:?sample (argument 2) must be given}"
    export temp_dir="/tmp/$user/$sample"
    # Always start from an empty per-sample scratch directory.
    rm -rf -- "$temp_dir"
    # -p creates /tmp/<user> when missing and stays quiet when it already exists.
    mkdir -p -- "$temp_dir" || { echo "ERROR: cannot create $temp_dir" >&2; exit 1; }
#
    # Copy every file whose name starts with the genome name into the scratch
    # directory. The glob is deliberately left outside the quotes.
    rsync -av "$genome_source/$genome_name"* "$temp_dir" \
        || { echo "ERROR: could not copy genome files from $genome_source" >&2; exit 1; }
    export genome="$temp_dir/$genome_name"
    # Build the sequence dictionary only when the copy did not already bring
    # one along, so an existing .dict is not treated as a failure.
    if [[ ! -e "$genome.dict" ]]; then
        "$java7" -Xmx4g -jar "$picard_dir/CreateSequenceDictionary.jar" R="$genome.fa" O="$genome.dict" \
            || { echo "ERROR: CreateSequenceDictionary failed for $genome.fa" >&2; exit 1; }
    fi

# Report the two FASTQ inputs and the reference prefix used for this run.
echo "SAMPLE: $fastq_dir/${sample}${fastq_extension_1}.gz"
echo "SAMPLE: $fastq_dir/${sample}${fastq_extension_2}.gz"
echo "GENOME: $temp_dir/$genome_name"
echo ---------------------
#
echo BWA ALIGN
    # Align both FASTQ files against the index prefix $genome and write SAM to
    # the scratch directory. All options are placed before the positional
    # arguments so parsing does not depend on getopt reordering them.
    # -R supplies the read-group header line built from the sample name.
    "$bwa_dir/bwa" mem -t 1 -M \
                -R "@RG\tID:$sample\tPL:illumina\tPU:illumina\tLB:$sample\tSM:$sample\tCN:BGI" \
                "$genome" \
                "$fastq_dir/${sample}${fastq_extension_1}.gz" "$fastq_dir/${sample}${fastq_extension_2}.gz" \
                > "$temp_dir/$sample.sam" \
        || { echo "ERROR: bwa mem failed for $sample" >&2; exit 1; }
#
echo SAMTOOLS SAM to BAM
            # -f 1   keeps paired reads
            # -F 12  removes reads that are unmapped or whose mate is unmapped
            # -q     drops alignments with mapping quality below $min_score
            # -o is given before the input file so it is parsed as an option
            # regardless of whether getopt reorders arguments.
            "$samtools_dir/samtools" view -f 1 -F 12 -q "$min_score" -b -@ 1 -T "$genome.fa" \
                -o "$temp_dir/$sample.q$min_score.bam" "$temp_dir/$sample.sam" \
                || { echo "ERROR: samtools view failed for $sample" >&2; exit 1; }
#
echo SORT
    # The second positional argument is an output prefix; the next stage reads
    # the result as <prefix>.bam.
    "$samtools_dir/samtools" sort -@ 1 -m 4G "$temp_dir/$sample.q$min_score.bam" "$temp_dir/$sample.q$min_score.srt" \
        || { echo "ERROR: samtools sort failed for $sample" >&2; exit 1; }
#
echo DEDUP
    # Remove duplicate reads from the sorted BAM (REMOVE_DUPLICATES=true) and
    # write the duplication metrics next to the output.
    "$java7" -Xmx4g -jar "$picard_dir/MarkDuplicates.jar" \
        I="$temp_dir/$sample.q$min_score.srt.bam" \
        O="$temp_dir/$sample.q$min_score.dedup.bam" \
        M="$temp_dir/$sample.q$min_score.dedup_metrics.txt" \
        AS=true \
        VERBOSITY=ERROR \
        VALIDATION_STRINGENCY=LENIENT \
        REMOVE_DUPLICATES=true \
        || { echo "ERROR: MarkDuplicates failed for $sample" >&2; exit 1; }
#
echo INDEX
    # Index the de-duplicated BAM that the GATK steps read next.
    "$samtools_dir/samtools" index "$temp_dir/$sample.q$min_score.dedup.bam" \
        || { echo "ERROR: samtools index failed for $sample" >&2; exit 1; }
#
echo GATK REALIGN
# Step 1: list the intervals that need local realignment.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                        -T RealignerTargetCreator \
                        -R "$genome.fa" \
                        --logging_level ERROR \
                        -fixMisencodedQuals \
                        -I "$temp_dir/$sample.q$min_score.dedup.bam" \
                        -o "$temp_dir/$sample.q$min_score.dedup.dedup.forIR.intervals" \
    || { echo "ERROR: RealignerTargetCreator failed for $sample" >&2; exit 1; }
#  NOTE - REMOVE -fixMisencodedQuals if using MiSeq data
echo GATK INDEL REALIGN
# Step 2: realign the de-duplicated BAM over those intervals.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                        -T IndelRealigner \
                        -R "$genome.fa" \
                        -fixMisencodedQuals \
                        --logging_level ERROR \
                        -I "$temp_dir/$sample.q$min_score.dedup.bam" \
                        -targetIntervals "$temp_dir/$sample.q$min_score.dedup.dedup.forIR.intervals" \
                        -o "$temp_dir/$sample.q$min_score.dedup.realign.bam" \
    || { echo "ERROR: IndelRealigner failed for $sample" >&2; exit 1; }
#  NOTE - REMOVE -fixMisencodedQuals if using MiSeq data
# Call variants on the realigned BAM; after filtering, the SNPs serve as the
# known-sites set for base recalibration.
echo GATK HaplotypeCaller
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                                 -T HaplotypeCaller \
                                 --genotyping_mode DISCOVERY \
                                 --logging_level ERROR \
                                 -stand_call_conf 30 \
                                 -stand_emit_conf 30 \
                                 --min_mapping_quality_score 10 \
                                 -contamination 0.0 \
                                 -nct 1 \
                                 -R "$genome.fa" \
                                 -I "$temp_dir/$sample.q$min_score.dedup.realign.bam" \
                                 -o "$temp_dir/$sample.q$min_score.dedup.realign.vcf" \
    || { echo "ERROR: HaplotypeCaller failed for $sample" >&2; exit 1; }
# Keep only the SNP records from the raw call set.
echo GATK SelectVariants
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                                    -T SelectVariants \
                                    --logging_level ERROR \
                                    -R "$genome.fa" \
                                    -V "$temp_dir/$sample.q$min_score.dedup.realign.vcf" \
                                    -selectType SNP \
                                    -o "$temp_dir/$sample.q$min_score.dedup.realign.SNP.vcf" \
    || { echo "ERROR: SelectVariants (SNP) failed for $sample" >&2; exit 1; }

echo GATK VariantAnnotator  
# Add the listed annotations to the SNP call set, computed from the realigned
# BAM. The following filtering step tests ABHet, HRun and HaplotypeScore.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                                     -T VariantAnnotator \
                                     -R "$genome.fa" \
                                     --logging_level ERROR \
                                     --annotation AlleleBalance \
                                     --annotation ClippingRankSumTest \
                                     --annotation BaseCounts \
                                     --annotation GCContent \
                                     --annotation HardyWeinberg \
                                     --annotation HomopolymerRun \
                                     --annotation NBaseCount \
                                     --annotation VariantType \
                                     --annotation AlleleBalanceBySample \
                                     --annotation DepthPerAlleleBySample \
                                     --annotation HaplotypeScore \
                                     --variant "$temp_dir/$sample.q$min_score.dedup.realign.SNP.vcf" \
                                     -I "$temp_dir/$sample.q$min_score.dedup.realign.bam" \
                                     -o "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.vcf" \
    || { echo "ERROR: VariantAnnotator failed for $sample" >&2; exit 1; }
echo GATK VariantFiltration
# Tag every SNP that matches one of the expressions below with the paired
# filter name.
# NOTE(review): the first filter is named DP10 but its expression tests
# DP < 6 - confirm which threshold is intended.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                                     -R "$genome.fa" \
                                     --logging_level ERROR \
                                     -T VariantFiltration \
                                                    --filterExpression ' DP < 6  ' \
                                                    --filterName 'DP10' \
                                                    --filterExpression ' ABHet < 0.75 ' \
                                                    --filterName 'ABHetLO' \
                                                    --filterExpression ' HRun > 10 ' \
                                                    --filterName 'HRun' \
                                                    --filterExpression ' QD < 2.0  ' \
                                                    --filterName 'QD' \
                                                    --filterExpression ' FS > 60.0 ' \
                                                    --filterName 'FS' \
                                                    --filterExpression ' MQ < 40.0 ' \
                                                    --filterName 'MQ' \
                                                    --filterExpression ' MQRankSum < -12.5 ' \
                                                    --filterName 'MQRS' \
                                                    --filterExpression ' ReadPosRankSum < -8.0 ' \
                                                    --filterName 'RPRS' \
                                                    --filterExpression 'HaplotypeScore > 13.0' \
                                                    --filterName "HScore" \
                                     --variant "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.vcf" \
                                     -o "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.f1.vcf" \
    || { echo "ERROR: VariantFiltration failed for $sample" >&2; exit 1; }
echo GATK SelectVariants
# Keep only the records that none of the filters above tagged.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                                     -R "$genome.fa" \
                                     --logging_level ERROR \
                                     -T SelectVariants \
                                     -select 'vc.isNotFiltered()' \
                                     --variant "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.f1.vcf" \
                                     -o "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.f2.vcf" \
    || { echo "ERROR: SelectVariants (unfiltered) failed for $sample" >&2; exit 1; }
echo run Base recalibrate
# Build the recalibration table from the realigned BAM, using the filtered
# SNP set from the previous step as the known sites.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                        -T BaseRecalibrator \
                        --logging_level ERROR \
                        -R "$genome.fa" \
                        -knownSites "$temp_dir/$sample.q$min_score.dedup.realign.SNP.anno.f2.vcf" \
                        -I "$temp_dir/$sample.q$min_score.dedup.realign.bam" \
                        -o "$temp_dir/$sample.q$min_score.dedup.realign.forRC.table" \
    || { echo "ERROR: BaseRecalibrator failed for $sample" >&2; exit 1; }
echo GATK PrintReads
# Apply the recalibration table and write the final BAM of the pipeline.
"$java7" -Xmx4g -jar "$gatk/GenomeAnalysisTK.jar" \
                        -T PrintReads \
                        --logging_level ERROR \
                        -R "$genome.fa" \
                        -I "$temp_dir/$sample.q$min_score.dedup.realign.bam" \
                        -BQSR "$temp_dir/$sample.q$min_score.dedup.realign.forRC.table" \
                        -o "$temp_dir/$sample.q$min_score.dedup.RR.bam" \
    || { echo "ERROR: PrintReads failed for $sample" >&2; exit 1; }

echo COPY BAM FILES TO OUTPUT FOLDER
    # An empty temp_dir would turn the rm -rf below into a no-argument or
    # wrong-target delete, so stop here instead.
    : "${temp_dir:?temp_dir is not set}"
    # The glob ends in .ba* rather than .bam* so that an index written as
    # <name>.bai beside the BAM is moved too, as well as <name>.bam and
    # <name>.bam.bai.
    # NOTE(review): assumes GATK names its index <name>.bai - confirm.
    if mv -- "$temp_dir/$sample.q$min_score.dedup.RR".ba* "$fastq_dir"; then
echo REMOVE GENOME TMP FOLDER
        rm -rf -- "$temp_dir"
    else
        # Keep the scratch directory: it holds the only copy of the result.
        echo "ERROR: could not move results to $fastq_dir; files left in $temp_dir" >&2
        exit 1
    fi
#
echo END END END        # END END END

