#!/bin/bash
#
# yeast_filtering.sh
# 
#
# Created by Ben Taylor on 11/10/2013.
#
# REQUIRES 
        # bam-readcount     https://github.com/genome/bam-readcount
        # samtools
        # vcftools
        
# ---- configuration ----------------------------------------------------------
# Calls survive filter 1 only when their somatic score is strictly greater
# than this value.
export somatic_score_filter=50
# Bundled vcftools binaries and Perl library path. Both are relative, so the
# script has to be started from the directory that contains them.
export vcftools_dir=vcftools_0.1.11/bin
export PERL5LIB=vcftools_0.1.11/perl
#
# ---- arguments --------------------------------------------------------------
#   $1 fastq_dir   directory holding the VCF and the BAM files; the final
#                  <name>.filtered.txt is written here as well
#   $2 sample_vcf  file name (inside fastq_dir) of the VCF to filter
#   $3 r_bam       file name (inside fastq_dir) of the reference BAM
#   $4 genome      reference sequence handed to bam-readcount -f
#   $5 user        name of the per-user scratch directory under /tmp
#                  (optional: defaults to the current login name)
if [[ $# -lt 4 || -z $1 || -z $2 || -z $3 || -z $4 ]]; then
    echo "usage: ${0##*/} fastq_dir sample_vcf reference_bam genome [user]" >&2
    exit 2
fi
export fastq_dir=${1}
export sample_vcf=${2}
export r_bam=${3}
export genome=${4}
user=${5:-}
if [[ -z $user ]]; then
    # An empty user would turn the scratch path into /tmp//<name>.
    user=$(id -un) || user=unknown
fi
export user
#
# Base name of the run: the VCF file name with every literal ".vcf" removed.
# (A regex dot here would also eat e.g. the "yvcf" in "myvcf_run.vcf".)
nom=${sample_vcf//.vcf/}
export nom
if [[ -z $nom ]]; then
    echo "error: cannot derive a run name from '$sample_vcf'" >&2
    exit 2
fi
#
# Sample BAM name: everything before "<any char>somatic-sniper" plus ".bam".
# The BAM is indexed up front; a failure is reported but not fatal because an
# index may already be present.
bam_file_index=${sample_vcf%%?somatic-sniper*}.bam
export bam_file_index
samtools-0.1.19/samtools index "$fastq_dir/$bam_file_index" \
    || echo "warning: samtools index failed for $fastq_dir/$bam_file_index" >&2
#
echo "USER:::: $user"
    # Scratch directory for all intermediate files; -p makes re-runs and a
    # pre-existing /tmp/<user> harmless.
    export temp_dir=/tmp/$user/$nom
    if ! mkdir -p -- "$temp_dir"; then
        echo "error: cannot create temp directory $temp_dir" >&2
        exit 1
    fi
#
echo "temp directory: $temp_dir"
#
    # The reference BAM is read in place (it is not copied to the scratch dir).
    export ref_bam=$fastq_dir/$r_bam
#
    # Work on a private copy of the VCF; nothing downstream is meaningful
    # without it, so stop if the copy fails.
    if ! rsync -av "$fastq_dir/$sample_vcf" "$temp_dir"; then
        echo "error: cannot copy $fastq_dir/$sample_vcf to $temp_dir" >&2
        exit 1
    fi
    export sample=$temp_dir/$sample_vcf
#
echo "SAMPLE: $sample"
echo "REFERENCE BAM: $ref_bam"
echo "USER: $user"
#
# FILTER 1. somatic score > 50 (threshold comes from $somatic_score_filter)
#
# filter_vcf_by_somatic_score VCF WORK_DIR MIN_SCORE
#   Splits VCF into header and data lines and keeps the data lines whose
#   13th colon-separated sub-field of (whitespace-separated) column 11 is
#   strictly greater than MIN_SCORE.
#   Only lines that START with '#' count as header, so a '#' inside a data
#   line (for instance in the INFO column) no longer removes that call.
#   Files written into WORK_DIR:
#     head.tmp     header lines
#     in.tmp       all data lines
#     numbers.tmp  1-based line numbers (within in.tmp) of the passing lines
#     out1.tmp     the passing data lines
#     out1.f1.vcf  head.tmp followed by out1.tmp
filter_vcf_by_somatic_score() {
    local vcf=$1 work_dir=$2 min_score=$3

    grep '^#' "$vcf" > "$work_dir/head.tmp"
    grep -v '^#' "$vcf" > "$work_dir/in.tmp"

    # Line numbers of the records whose score exceeds the threshold.
    awk '{ print $11 }' "$work_dir/in.tmp" \
        | awk -F ':' -v ssf="$min_score" '$13 > ssf { print NR }' \
        > "$work_dir/numbers.tmp"

    # Keep exactly those line numbers.
    awk 'FNR == NR { keep[$1]; next } (FNR in keep)' \
        "$work_dir/numbers.tmp" "$work_dir/in.tmp" > "$work_dir/out1.tmp"

    cat "$work_dir/head.tmp" "$work_dir/out1.tmp" > "$work_dir/out1.f1.vcf"
}
#
filter_vcf_by_somatic_score "$sample" "$temp_dir" "$somatic_score_filter"
#
# Genotype table of the filtered VCF (consumed by the re-genotyping step).
"$vcftools_dir/vcf-to-tab" < "$temp_dir/out1.f1.vcf" > "$temp_dir/out1.f1.tab"
#
# From here on in.tmp holds only the data lines that passed filter 1.
grep -v '^#' "$temp_dir/out1.f1.vcf" > "$temp_dir/in.tmp"
#
#
echo START RE-GENOTYPING
### Re-genotype the normal and the sample so that a base can be reassigned
### when its allele frequency is below 0.3 (the reassignment itself happens
### in the following steps; this step only extracts the inputs).
#
# base_counts_for_column N
#   For every line of in.tmp, print colon-separated sub-field 5 of
#   whitespace-separated column N with its commas turned into tabs
#   (the A,C,G,T counts, per the original author's note).
base_counts_for_column() {
    awk -v sample_col="$1" '{ split($sample_col, parts, ":"); print parts[5] }' \
        "$temp_dir/in.tmp" \
        | tr ',' '\t'
}
#
# ordered_alleles_for_column N
#   Take column N of the genotype table (lines containing '#' are skipped),
#   split "X/Y" into two tab-separated alleles and print them in ascending
#   order.
#   Author's note on the ordering: it is meant to stop the same two alleles
#   from landing on different slots in normal and sample; it does not cover
#   every case, because the pairs being compared are not always the same two
#   bases (e.g. CT versus AC).
ordered_alleles_for_column() {
    grep -v '#' "$temp_dir/out1.f1.tab" \
        | cut -f"$1" \
        | tr '/' '\t' \
        | awk -F $'\t' '
            BEGIN { OFS = FS }
            $2 < $1 { print $2, $1; next }
            { print }'
}
#
base_counts_for_column 10 > "$temp_dir/bcount.norm.tmp"
base_counts_for_column 11 > "$temp_dir/bcount.tum.tmp"
#
ordered_alleles_for_column 4 > "$temp_dir/bases.norm.tmp"   # normA1 normA2
ordered_alleles_for_column 5 > "$temp_dir/bases.tum.tmp"    # tumA1 tumA2
#
#
# Go through the normal and the sample calls and change any allele with
# freq < 0.3 to the other allele.
#
# append_allele_stats
#   stdin : tab-separated lines "A C G T allele1 allele2" (four base counts
#           followed by the two called alleles)
#   stdout: the input line plus four columns
#           allele1_freq, allele1_count, allele2_freq, allele2_count
#   The four values are recomputed for every line. An allele that is not one
#   of A/C/G/T gets frequency 0 and count 0 instead of silently inheriting the
#   values of the previous line, and a line whose four counts sum to 0 yields
#   frequency 0 instead of aborting awk with a division by zero.
append_allele_stats() {
    awk -F $'\t' '
        BEGIN { OFS = FS; col["A"] = 1; col["C"] = 2; col["G"] = 3; col["T"] = 4 }
        {
            depth = $1 + $2 + $3 + $4
            freq1 = 0; count1 = 0; freq2 = 0; count2 = 0
            if ($5 in col) {
                count1 = $(col[$5])
                if (depth > 0) freq1 = count1 / depth
            }
            if ($6 in col) {
                count2 = $(col[$6])
                if (depth > 0) freq2 = count2 / depth
            }
            print $0, freq1, count1, freq2, count2
        }'
}
#
# collapse_minor_alleles
#   stdin : output of append_allele_stats
#   stdout: same columns plus a trailing "|" separator column.
#   An allele whose frequency is below 0.3 is overwritten (base, frequency and
#   count) with the other allele. Allele 1 is checked first, so when both are
#   below 0.3 both slots end up holding allele 2.
collapse_minor_alleles() {
    awk -F $'\t' '
        BEGIN { OFS = FS }
        {
            if ($7 < 0.3) { $7 = $9; $5 = $6; $8 = $10 }
            if ($9 < 0.3) { $9 = $7; $6 = $5; $10 = $8 }
            print $0, "|"
        }'
}
#
# Same two steps for the normal ("norm") and the sample ("tum"):
#   bcount.X.tmp + bases.X.tmp -> X.tmp -> X2.tmp
for tissue in norm tum; do
    paste "$temp_dir/bcount.$tissue.tmp" "$temp_dir/bases.$tissue.tmp" \
        | append_allele_stats > "$temp_dir/$tissue.tmp"
    collapse_minor_alleles < "$temp_dir/$tissue.tmp" > "$temp_dir/${tissue}2.tmp"
done
#
# Reassign genotyping: put the re-genotyped normal and sample side by side and
# describe the change between them.
#
# Columns after the paste (tab-separated):
#    1-4  normal A,C,G,T counts     5,6  normal allele 1/2
#    7,8  normal allele 1 freq/count    9,10  normal allele 2 freq/count   11 "|"
#   12-15 sample A,C,G,T counts    16,17 sample allele 1/2
#   18,19 sample allele 1 freq/count   20,21  sample allele 2 freq/count   22 "|"
# Two columns are appended: "x/y" (x = 1 when allele 1 differs, y = 1 when
# allele 2 differs) and a label: "unmut", "<normal>><sample>" or "threeAlleles".
    paste "$temp_dir/norm2.tmp" "$temp_dir/tum2.tmp" \
        | awk -F $'\t' '
            # Exchange the two sample alleles together with their freq and count.
            function swap_sample_alleles(   held) {
                held = $16; $16 = $17; $17 = held
                held = $18; $18 = $20; $20 = held
                held = $19; $19 = $21; $21 = held
            }
            BEGIN { OFS = FS }
            {
                # Neither slot matches, but one sample allele equals the normal
                # allele in the opposite slot: flip the sample so they line up.
                if ($5 != $16 && $6 != $17 && ($5 == $17 || $6 == $16))
                    swap_sample_alleles()

                first_differs  = ($5 == $16) ? 0 : 1
                second_differs = ($6 == $17) ? 0 : 1

                if (!first_differs && !second_differs)
                    change = "unmut"
                else if (first_differs && !second_differs)
                    change = $5 ">" $16
                else if (!first_differs && second_differs)
                    change = $6 ">" $17
                else if ($5 == $6 && $16 == $17)
                    change = $5 ">" $16          # homozygous in both, e.g. AA > TT
                else
                    change = "threeAlleles"      # nothing lines up: usually three alleles present

                print $0, first_differs "/" second_differs, change
            }' > "$temp_dir/both.tmp"
    ##### threeAlleles is a small but persistent problem ##################
    # to inspect: grep threeAlleles "$temp_dir/both.tmp"
#
    # Attach the genotype columns to the filtered VCF data lines.
    paste "$temp_dir/in.tmp" "$temp_dir/both.tmp" > "$temp_dir/data.tmp"
#
echo START FILTERING
# FILTER
#
# Columns of data.tmp used below:
#   19 / 21  normal allele 1 / 2 count      30 / 32  sample allele 1 / 2 count
#   16       normal allele 1                27       sample allele 1
#   34       genotype flag (0/1, 1/0, 1/1)  35       change label
#
    # 1. drop every line that contains "unmut"
        grep -v 'unmut' "$temp_dir/data.tmp" > "$temp_dir/data2.tmp"
#
    # 2. heterozygous changes (0/1 or 1/0): drop the line when the count of the
    #    changed allele is below 4 in either the sample or the normal
        awk -F $'\t' '
            BEGIN { OFS = FS }
            $34 == "0/1" && ($32 < 4 || $21 < 4) { next }
            $34 == "1/0" && ($30 < 4 || $19 < 4) { next }
            { print }' "$temp_dir/data2.tmp" > "$temp_dir/data3.tmp"
#
    # 3. homozygous 1/1 changes: drop the line when the allele 1 count is
    #    below 4 in either the sample or the normal
        awk -F $'\t' '
            BEGIN { OFS = FS }
            $34 == "1/1" && ($30 < 4 || $19 < 4) { next }
            { print }' "$temp_dir/data3.tmp" > "$temp_dir/data4.tmp"
#
    # 4. threeAlleles lines: drop them when any of the four allele counts is
    #    below 4; relabel the survivors as
    #    <normal allele 1>><sample allele 1>
        awk -F $'\t' '
            BEGIN { OFS = FS }
            $35 == "threeAlleles" {
                if ($32 < 4 || $30 < 4 || $21 < 4 || $19 < 4) next
                $35 = $16 ">" $27
            }
            { print }' "$temp_dir/data4.tmp" > "$temp_dir/data5.tmp"
#
echo START READCOUNT
    # Read counts are collected so that calls with a 3' read-end bias can be
    # removed by the final filter.
#
# pick_base_stats KEYS_FILE READCOUNT_FILE
#   KEYS_FILE      tab-separated "chr pos base", one line per variant
#   READCOUNT_FILE bam-readcount output (tab-separated; columns 6-9 are read
#                  as the A, C, G and T statistics)
#   stdout: one line per KEYS_FILE line, in KEYS_FILE order:
#             "|" count avg_basequality avg_pos_as_fraction
#                 avg_clipped_length avg_distance_to_effective_3p_end
#           or a lone "|" when the base is not A/C/G/T or the site is absent
#           from READCOUNT_FILE.
#   Rows are matched on chr+pos rather than on line number, so the result does
#   not depend on the order of the readcount output or on sites it left out.
pick_base_stats() {
    local keys_file=$1 readcount_file=$2
    awk -F $'\t' '
        BEGIN { OFS = FS }
        FILENAME == ARGV[1] {               # first operand: readcount output
            stats[$1, $2, "A"] = $6
            stats[$1, $2, "C"] = $7
            stats[$1, $2, "G"] = $8
            stats[$1, $2, "T"] = $9
            next
        }
        {                                   # second operand: chr pos base
            print "|", ((($1, $2, $3) in stats) ? stats[$1, $2, $3] : "unk")
        }' "$readcount_file" "$keys_file" \
        | tr ':' '\t' \
        | cut -f1,3,5,9,14,15
}
#
        # generate regions list: "chr pos pos" for every remaining call.
        # Note: with -k1,1n every non-numeric chromosome name sorts as 0, so
        # the list can end up in a different order than data5.tmp. That is
        # harmless here because the statistics are matched by chr+pos below.
                awk -F $'\t' 'BEGIN { OFS = FS } { print $1, $2, $2 }' "$temp_dir/data5.tmp" \
                    | sort -k1,1n -k2,2g \
                    | awk '{ print $1, $2, $3 }' > "$temp_dir/readcount.regions"
#
        # run bam-readcount
                export bam_count_dir=bam-readcount_build/bin
                export b=15     # passed to bam-readcount -b
                export w=1      # passed to bam-readcount -w
                # Sample BAM: run name with the first "<any char>somatic-sniper" removed.
                # NOTE(review): the BAM that gets indexed during setup is derived with a
                # slightly different pattern (everything after ".somatic-sniper" is
                # dropped there) - confirm both always name the same file.
                bam_file=${nom/?somatic-sniper/}.bam
                export bam_file
                # Explicit names for the two readcount tables. They used to be built
                # from $output, which this script never assigns.
                ref_readcount=$temp_dir/REF.readcount
                sample_readcount=$temp_dir/sample.readcount
#
echo "READCOUNT ON: $ref_bam"
                "$bam_count_dir/bam-readcount" -b "$b" -w "$w" -f "$genome" \
                    -l "$temp_dir/readcount.regions" \
                    "$ref_bam" \
                    > "$ref_readcount"
#
echo "READCOUNT ON: $fastq_dir/$bam_file"
                "$bam_count_dir/bam-readcount" -b "$b" -w "$w" -f "$genome" \
                    -l "$temp_dir/readcount.regions" \
                    "$fastq_dir/$bam_file" \
                    > "$sample_readcount"
#
                                # READCOUNT OUTPUT
                                # chr	position	reference_base	depth	then one colon-separated field per base:
                                #   base -> the base that all reads following in this field contain at the reported position, i.e. C
                                #   count -> the number of reads containing the base
                                #   avg_mapping_quality -> the mean mapping quality of reads containing the base
                                #   avg_basequality -> the mean base quality for these reads
                                #   avg_se_mapping_quality -> mean single ended mapping quality
                                #   num_plus_strand -> number of reads on the plus/forward strand
                                #   num_minus_strand -> number of reads on the minus/reverse strand
                                #   avg_pos_as_fraction -> average position on the read as a fraction (calculated with respect to the length after clipping)
                                #   avg_num_mismatches_as_fraction -> average number of mismatches on these reads per base
                                #   avg_sum_mismatch_qualities -> average sum of the base qualities of mismatches in the reads
                                #   num_q2_containing_reads -> number of reads with q2 runs at the 3' end
                                #   avg_distance_to_q2_start_in_q2_reads -> average distance of position (as fraction of unclipped read length) to the start of the q2 run
                                #   avg_clipped_length -> average clipped read length of reads
                                #   avg_distance_to_effective_3p_end -> average distance to the 3' end of the read (as fraction of unclipped read length)
#
        # 1. get ref and alt base from the "X>Y" change label (column 35)
        cut -f35 "$temp_dir/data5.tmp" | tr ">" "\t" | cut -f1 > "$temp_dir/ref.tmp"
        cut -f35 "$temp_dir/data5.tmp" | tr ">" "\t" | cut -f2 > "$temp_dir/alt.tmp"
#
        # 2. chr + pos of every call, used as the lookup key
        cut -f1,2 "$temp_dir/data5.tmp" > "$temp_dir/sites.tmp"
#
        ## readcount columns of the ref base in the reference BAM
        paste "$temp_dir/sites.tmp" "$temp_dir/ref.tmp" > "$temp_dir/ref.keys.tmp"
        pick_base_stats "$temp_dir/ref.keys.tmp" "$ref_readcount" > "$temp_dir/ref.counts.tmp"
#
        ## readcount columns of the alt base in the sample BAM
        paste "$temp_dir/sites.tmp" "$temp_dir/alt.tmp" > "$temp_dir/alt.keys.tmp"
        pick_base_stats "$temp_dir/alt.keys.tmp" "$sample_readcount" > "$temp_dir/sample.counts.tmp"
#
            # combine and filter
            # Columns appended to data5.tmp (which has 35 columns):
            #   36 "|"  37-41 reference: count, avg base quality, avg_pos_as_fraction,
            #                 read length, avg_distance_to_effective_3p_end
            #   42 "|"  43-47 sample:    same five values
            # A call is dropped as soon as the reference OR the sample value fails
            # one of the checks below (slightly watered down from the varscan filters).
            paste "$temp_dir/data5.tmp" "$temp_dir/ref.counts.tmp" "$temp_dir/sample.counts.tmp" \
                | awk -F $'\t' '
                    BEGIN { OFS = FS }
                    $39 < 0.1 || $45 < 0.1 { next }   # avg_pos_as_fraction below 0.1
                    $41 < 0.1 || $47 < 0.1 { next }   # avg_distance_to_effective_3p_end below 0.1
                    $38 < 30  || $44 < 30  { next }   # average base quality below 30
                                                      # NOTE(review): this check has also been described as "< 20" - confirm the intended threshold
                    $40 < 50  || $46 < 50  { next }   # average read length below 50 bp
                    { print }' > "$temp_dir/data6.tmp"
#
            # Final table: one tab-separated header line followed by the surviving calls.
            header_cols=(
                chr pos filter ref_ss alt_ss .A .B .C info norm_info sample_info
                norm_A norm_C norm_G norm_T
                norm_allele_1 norm_allele_2
                norm_allele_1_freq norm_allele_1_count
                norm_allele_2_freq norm_allele_2_count
                .D
                sample_A sample_C sample_G sample_T
                sample_allele_1 sample_allele_2
                sample_allele_1_freq sample_allele_1_count
                sample_allele_2_freq sample_allele_2_count
                .E genotype ft .
                ref_count ref_ave_base_qual ref_avg_pos_as_fraction
                ref_read_length ref_avg_distance_to_effective_3p_end
                .F
                sample_count sample_ave_base_qual sample_avg_pos_as_fraction
                sample_read_length sample_avg_distance_to_effective_3p_end
            )
            {
                ( IFS=$'\t'; printf '%s\n' "${header_cols[*]}" )
                cat "$temp_dir/data6.tmp"
            } > "$fastq_dir/$nom.filtered.txt"
#   
# Line counts of the intermediate tables, as a quick record of how many calls
# each filtering stage left.
wc -l "$temp_dir/in.tmp"
wc -l "$temp_dir"/data*
#
# remove_temp_dir
#   Deletes $temp_dir recursively. Refuses (status 1, message on stderr) when
#   $temp_dir or $nom is empty: with an empty run name the path degenerates to
#   /tmp/<user>/ or even /tmp//, and removing that would take unrelated files
#   with it.
remove_temp_dir() {
    if [[ -z ${temp_dir:-} || -z ${nom:-} ]]; then
        echo "WARNING: temp directory '${temp_dir:-}' looks unsafe to delete; leaving it in place" >&2
        return 1
    fi
    rm -rf -- "$temp_dir"
}
echo REMOVING TEMP DIRECTORY
    remove_temp_dir
#
# Everything from the first '#' on is a shell comment, so this prints only
# "END END END".
echo END END END ##### FILTERING FINISHED
