#!/bin/zsh
# COV-ID processing pipeline.
# The shebang has to be the very first line of the file to take effect.
# Strict mode (helps with debugging) is currently disabled; refer to
# http://redsymbol.net/articles/unofficial-bash-strict-mode/ for details
#set -eo pipefail
# Only newline and tab act as field separators for read / command substitution
IFS=$'\n\t'
#record start time (the active timer is zstart in the initialization section;
#the line below is an inactive leftover, kept commented out)
#Extract 5' umi scstart=$(date +%s)
# Print the pipeline banner followed by the list of processing steps.
# Every entry is handed to echo unchanged, one entry per output line.
for banner_line in \
	"\n\n\n\n\nCOV-ID processing pipeline" \
	"updated 09Apr2021 / Robert Warneford-Thomson" \
	"script performs following steps:" \
	"1. download fastqs" \
	"2. Remove low quality reads from fastq" \
	"3. Removing adaptor sequences with cutadapt" \
	"4. Demultiplexing lamp barcodes" \
	"5. Aligns reads against COV-ID index with bowtie2" \
	"6. Remove alignments with >1 mismatches" \
	"7. Output alignment metrics" \
	"8. Extract numbers of reads mapping to each target" \
	"_______________________________" \
	""
do
	echo "$banner_line"
done
# Initialize run settings: overall start timestamp, directories and parallelism.
zstart="$(date +%s)"    # seconds since the epoch; used for the total runtime at the end

# Directories
wd="/home/rwt/analyses/2020/BR08.covID/CV35/Exp7_reseq"    # working directory for all outputs
HOME="/home/rwt"
FASTQdir="/home/bonasiolab/raw_data/FASTQ/2021"            # searched for the raw fastq files
datadir="/home/rwt/Data/COV-ID/CV35/Exp7_reseq"            # renamed fastq copies go to $datadir/fastq

# Tools / parallelism
readFilesCommand="zcat"    # this is for gzipped FASTQs
filenum="12"               # job count given to parallel via -j
subthreads="2"             # per-job thread count given to cutadapt -j and bowtie2 -p

# Work from the working directory and make sure the metrics folder exists
cd "$wd"
mkdir -p "$wd/metrics"

#Raw fastq filenames (as named in the bonasiolab fastq folder), each followed by
#the desired descriptive filename; one space-separated pair per line
printf '%s\n' \
	"210427.HGF2MBGXJ.b0811.R1.gz CV35.Exp7_reseq.paperCOVIDb1.R1.fastq.gz" \
	"210427.HGF2MBGXJ.b0912.R1.gz CV35.Exp7_reseq.paperCOVIDb2.R1.fastq.gz" \
	> filenames

#Make barcode file
# Writes $wd/barcodes/barcodes.fasta with one record per barcode:
# a ">iNN" header (NN = two-digit position in the list below) and the
# sequence prefixed with "^".
mkdir -p $wd/barcodes/logs
lamp_barcodes=(
	CCTGT GTTAC GCATC TGCGA ATCAT GCTTG CACTG AGTCA
	CTAGA TAACG CTAGT AGCCT AAATG AGCCC ATATC CCAAG
	CGAGT CGATG CGCGG CGGAT GCGCC GGCGA GGTGT GTCAA
	GTCGC GTGAT TAAAC TACTA TAGAG TCTAG TGAAT TTATA
)
bc_index=0
for bc_seq in "${lamp_barcodes[@]}"; do
	bc_index=$((bc_index + 1))
	printf '>i%02d\n^%s\n' "$bc_index" "$bc_seq"
done > $wd/barcodes/barcodes.fasta

echo "1. download fastqs"
echo "..."
mkdir -p "$datadir/fastq"
# Each line of 'filenames' is "<raw fastq name> <descriptive name>".
# The raw file is looked up under $FASTQdir and copied to $datadir/fastq
# under its descriptive name.
while IFS=' ' read -r raw_name new_name extra_fields; do
	[ -n "$raw_name" ] || continue
	# Search once per file and reuse the result for both the message and the copy
	raw_path=$(find "$FASTQdir" -name "$raw_name")
	dest_path="$datadir/fastq/$new_name"
	case "$raw_path" in
		"")
			echo "ERROR: no file named $raw_name found under $FASTQdir" >&2
			continue
			;;
		*$'\n'*)
			echo "ERROR: more than one file named $raw_name found under $FASTQdir" >&2
			continue
			;;
	esac
	echo "\nDownloading fastq file" "$raw_path" "renaming as:" "$dest_path"
	cp "$raw_path" "$dest_path"
done < filenames
# file.names holds the descriptive names with the .R1.fastq.gz suffix removed
# (dots escaped and anchored so only the literal suffix is stripped)
cut -f2 -d " " filenames | awk 'sub(/\.R1\.fastq\.gz$/, "")' > "$wd/file.names"
rm "$wd/filenames"
echo "\n Finished downloading fastq files"

# Filter out bad quality reads
# pstart marks the beginning of the filter/trim/demultiplex timing window
pstart=$(date +%s)
echo "\n 2. Remove low quality reads from fastq"
echo "..."
mkdir -p $wd/fastq_filter/logs
# Command template for GNU parallel; {} is replaced by each name read from file.names.
# Output of fastq_quality_filter (stdout and stderr) goes to a per-sample log.
qfilter_cmd="zcat $datadir/fastq/{}.R1.fastq.gz"
qfilter_cmd="$qfilter_cmd | fastq_quality_filter -Q33 -v -q 20 -p 90 -z"
qfilter_cmd="$qfilter_cmd -o $wd/fastq_filter/{}.R1.Q20.fastq.gz"
qfilter_cmd="$qfilter_cmd &> $wd/fastq_filter/logs/{}.fastq_filter.log"
parallel -j$filenum "$qfilter_cmd" < $wd/file.names
echo "Finished"

# extract fastq_filter metrics
# Builds one column file per metric (header line first, then one value per
# sample in file.names order) and pastes them into fastq.filter.metrics.
cd $wd/fastq_filter/logs
printf '%s\n' "Sample" > SampleNames

{
	echo "InputReads"
	while read filename; do
		awk '/Input/{print $2}' $wd/fastq_filter/logs/$filename.fastq_filter.log
	done < $wd/file.names
} > TotReads

{
	echo "QualityFilteredReads"
	while read filename; do
		awk '/Output/{print $2}' $wd/fastq_filter/logs/$filename.fastq_filter.log
	done < $wd/file.names
} > OutReads

# write metrics to file
{
	echo "Sample"
	cat $wd/file.names
} > Samples
paste Samples TotReads OutReads > $wd/metrics/fastq.filter.metrics

conda activate python3
#Remove Adapter sequences from read 1
#3' adapter CATCTCCGAGCCCAC... if present remove from 3' end of R1
#keep reads with at least 35 bp in read1 (5 bp sample barcode + ~25 FIP primer + 5 mappable reads = minlength 35 bp),
echo "\n 3. Removing adaptor sequences with cutadapt"
echo "..."

mkdir -p $wd/Adapter.trim/metrics
cd $wd
# Command template for GNU parallel; {} is replaced by each name from file.names.
# The cutadapt report (stdout) is saved per sample, relative to $wd.
trim_cmd="cutadapt -j $subthreads -m 35 -a CATCTCCGAGCCCAC -e 0.1 -n 5 --quality-cutoff 6"
trim_cmd="$trim_cmd -o $wd/Adapter.trim/{}.R1.trim.fastq.gz $wd/fastq_filter/{}.R1.Q20.fastq.gz"
trim_cmd="$trim_cmd > Adapter.trim/metrics/{}.adapter.trim.metrics"
parallel -j$filenum "$trim_cmd" < file.names

 # extract adapter removal metrics
# Column files start with a header line; one value per sample is appended below.
cd $wd/Adapter.trim/metrics
printf '%s\n' "Sample" > SampleNames
printf '%s\n' "InputAdapterRemovalReads" > TotReads
printf '%s\n' "ReadsAdapters" > AdapterReads
printf '%s\n' "AdapterReadsTrimmed" > TooShortReads
printf '%s\n' "ReadsWritten" > ReadsWritten

while read filename; do
	# per-sample cutadapt report, looked up in the current directory
	i=$filename.adapter.trim.metrics
	awk '/Total reads processed/{print $4}' $i >> TotReads
	awk '/Reads with adapters/{print $4}' $i >> AdapterReads
	awk '/Reads that were too short/{print $6}' $i >> TooShortReads
	awk '/Reads written/{print $5}' $i >> ReadsWritten
done < $wd/file.names

# write metrics to file: quality-filter columns first, then the trimming columns
paste $wd/metrics/fastq.filter.metrics TotReads AdapterReads TooShortReads ReadsWritten \
	> $wd/metrics/Adapter.trim.metrics
echo "Finished"

#Demultiplex LAMP barcodes with cutadapt
echo "\n 4. Demultiplexing lamp barcodes"
# Command template for GNU parallel; {} is replaced by each name from file.names,
# while {name} is left in place (single-quoted) for cutadapt.
demux_cmd="cutadapt -e 0 --no-indels -g file:$wd/barcodes/barcodes.fasta"
demux_cmd="$demux_cmd -o '$wd/barcodes/{}_{name}.fastq.gz'"
demux_cmd="$demux_cmd $wd/Adapter.trim/{}.R1.trim.fastq.gz &> $wd/barcodes/logs/{}.cutadapt.demux.log"
parallel "$demux_cmd" < $wd/file.names

# Elapsed time since pstart, reported as hours:minutes:seconds
pend=$(date +%s)
runtime=$((pend - pstart))
elapsed=$(printf '%dh:%dm:%ds' $((runtime / 3600)) $((runtime % 3600 / 60)) $((runtime % 60)))

echo "$elapsed" "runtime to Filter reads, trim adapters + demultiplex barcodes\n"

 # extract Demultiplexing metrics
cd $wd/barcodes/logs
# SampleNames is pasted into the table below, so it needs one row per input
# file under its header; with only the header the Sample column would be
# empty for every data row.
{
	echo "Sample"
	cat "$wd/file.names"
} > SampleNames
echo "Input_reads" > TotReads
echo "ReadsWithBarcodes" > BarcodeReads

while read -r filename; do
	# per-file cutadapt demultiplexing log in the current directory
	demux_log="${filename}.cutadapt.demux.log"
	awk '/Total reads processed/{print $4}' "$demux_log" >> TotReads
	awk '/Reads with adapters/{print $4}' "$demux_log" >> BarcodeReads
done < "$wd/file.names"

# write metrics to file: trimming table first, then the demultiplexing columns
paste "$wd/metrics/Adapter.trim.metrics" SampleNames TotReads BarcodeReads \
	> "$wd/metrics/Adapter.demux.metrics"
echo "\nFinished extracting Barcodes"

#Make sample name file / strip last 2 extensions
# Lists the files in $wd/barcodes matching *i[0-9]*fastq.gz. The {= ... =}
# expression given to parallel applies s:\.[^.]+$:: twice, i.e. it removes the
# last two dot-separated suffixes from each listed name. The resulting names
# are passed through sort -n and written to $wd/sample.names.
cd $wd/barcodes
ls *i[0-9]*fastq.gz | parallel "echo '{= s:\.[^.]+$::;s:\.[^.]+$::; =}'" | sort -n > $wd/sample.names


# Map reads to index with bowtie2
echo "\n 5. Aligns reads against cov-ID index with bowtie2"
echo "..."
# pstart is reset here so the runtime printed below covers the alignment step only
pstart=$(date +%s)
mkdir -p $wd/bowtie/logs
mkdir -p $wd/bowtie/unmapped

#change to index directory
# (-x COV_ID_index below is given without a path, so it is presumably resolved
# relative to this directory)
cd /home/rwt/genomes/Bowtie2/SARS_COV2
# For every name in sample.names the command string handed to parallel:
#   1. appends an "aligning sample <name>" line to the per-sample bowtie log
#   2. runs bowtie2 on $wd/barcodes/<name>.fastq.gz, writing a SAM file and
#      appending bowtie2's output to the same log
#   3. converts the SAM file to BAM with samtools view and removes the SAM file
#   4. appends the text of the commands themselves to the log
# $subthreads and $wd are expanded by this shell before parallel runs, including
# inside the single-quoted echo text (the single quotes sit inside double quotes).
# NOTE(review): the inner double quotes around "aligning sample {}" close and
# reopen the outer double-quoted string rather than nesting inside it.
cat $wd/sample.names | parallel -j$filenum "echo "aligning sample {}" >> $wd/bowtie/logs/{}.bowtie.log;
bowtie2 -x COV_ID_index -p $subthreads -N 0 --rdg 5,3 --rfg 5,3 --mp 6,2 --no-unal \
-U $wd/barcodes/{}.fastq.gz -S $wd/bowtie/{}.aligned.out.sam \
--un-gz $wd/bowtie/unmapped/{}.unmapped.sam.gz &>> $wd/bowtie/logs/{}.bowtie.log;
samtools view -S -b $wd/bowtie/{}.aligned.out.sam > $wd/bowtie/{}.aligned.out.bam; 
rm $wd/bowtie/{}.aligned.out.sam ;
echo 'bowtie2 -x COV_ID_index -p $subthreads -N 0 --rdg 5,3 --rfg 5,3 --mp 6,2 --no-unal \
-U $wd/barcodes/{}.fastq.gz -S $wd/bowtie/{}.aligned.out.sam \
--un-gz $wd/bowtie/unmapped/{}.unmapped.sam.gz &>> $wd/bowtie/logs/{}.bowtie.log;
samtools view -S -b $wd/bowtie/{}.aligned.out.sam > $wd/bowtie/{}.aligned.out.bam; 
rm $wd/bowtie/{}.aligned.out.sam' >> $wd/bowtie/logs/{}.bowtie.log"
# Leaves the conda environment activated before the adapter-trimming step
conda deactivate

# Report the elapsed alignment time as hours:minutes:seconds
pend=$(date +%s)
runtime=$(($pend-$pstart))
echo "Finished alignment"
echo $(printf '%dh:%dm:%ds\n' $(($runtime/3600)) $(($runtime%3600/60)) $(($runtime%60))) "runtime to align reads\n"

# Remove reads that have >1 mismatch with bamtools
echo "\n 6. Remove alignments with >1 mismatches"
echo "..."
mkdir -p $wd/bowtie/filter
mkdir -p $wd/bowtie/sorted
cd $wd/bowtie

# For each aligned BAM: keep records whose XM tag is <=1, then sort the result
for i in *bam; do
	stem=${i%.aligned.out.bam}
	bamtools filter -tag "XM:<=1" -in $i -out filter/$stem.filter.bam
	samtools sort filter/$stem.filter.bam > sorted/$stem.sort.bam
done

# Index every sorted BAM
cd $wd/bowtie/sorted
for i in *bam; do
	bamtools index -in $i
done
echo "Finished"

#output mapping metrics
echo "\n 7. Output alignment metrics"
# pstart is reset here; the next runtime report is measured from this point
pstart=$(date +%s)
echo "..."
mkdir -p $wd/bowtie/metrics
#extract mapping statistics from logs: work inside the log directory, with
#per-barcode tables under separate/ and per-fastq sums under merged/
cd $wd/bowtie/logs
mkdir -p $wd/bowtie/logs/separate $wd/bowtie/logs/merged


# Extract mapping statistics for each sample, collapse data from all barcodes into single log for each fastq

# Sum every column from the second to the last of a tab-separated metrics table.
#   $1 - table whose first line is a header and whose first column is the sample name
# Prints the sums on a single tab-separated line, always in column order
# (awk's "for (i in array)" order is unspecified, so an explicit index loop is used).
# Prints nothing when the table has no data rows.
sum_metric_columns() {
	tail -n +2 "$1" |
	awk -F"\t" '
		{
			for (i = 2; i <= NF; i++) total[i] += $i
			if (NF > ncol) ncol = NF
		}
		END {
			for (i = 2; i <= ncol; i++) printf "%s%.0f", (i > 2 ? "\t" : ""), total[i]
			if (ncol >= 2) printf "\n"
		}'
}

# Runs in $wd/bowtie/logs. For every input fastq name:
#   - writes one row per barcode sample to separate/<name>.mapping.metrics
#   - writes the column sums, prefixed by the fastq name, to merged/<name>.mapping.merged
while read -r filename; do
	echo 'Sample' > Samples; echo 'total' > totReads
	echo 'MappedUnique' > MappedUnique
	echo "Unmapped" > unMapped; echo "Multimapping" > Multimapping
	echo "Mismatch<1" > FilteredMismatches

	#pull out all detected barcodes for each file (fixed-string match on "<name>_")
	grep -F -- "${filename}_" "$wd/sample.names" > samples.temp

	#for each barcode, extract mapping metrics
	while read -r sample; do
		i="${sample}.bowtie.log"
		tottemp=$(awk '/reads/{print $1}' "$i")
		if [ -z "$tottemp" ]; then
			# No read count in the log: report it and record 0 so the columns stay aligned
			echo "WARNING: no read count found in $i, recording 0" >&2
			tottemp=0
		fi
		echo "$tottemp" >> totReads

		# for files with zero reads parse logs differently
		if [ "$tottemp" -eq 0 ]; then
			echo 0 >> unMapped
			echo 0 >> MappedUnique
			echo 0 >> Multimapping
			# record the actual sample name (not a placeholder string)
			echo "$sample" >> Samples
			echo 0 >> FilteredMismatches
			continue
		fi
		awk '/aligned 0 times/{print $1}' "$i" >> unMapped
		awk '/aligned exactly 1 time/{print $1}' "$i" >> MappedUnique
		awk '/aligned >1 times/{print $1}' "$i" >> Multimapping
		echo "$sample" >> Samples

		#<=1 mismatch reads: record count of the filtered, sorted BAM
		samtools view -c "$wd/bowtie/sorted/${sample}.sort.bam" >> FilteredMismatches
	done < samples.temp

	# write sample metrics to table
	paste Samples totReads unMapped MappedUnique Multimapping FilteredMismatches \
		> "$wd/bowtie/logs/separate/${filename}.mapping.metrics"
	rm Samples totReads unMapped MappedUnique Multimapping FilteredMismatches samples.temp

	#merge separate barcode metrics into single log file; sum columns into single row
	printf '%s\t%s\n' "$filename" \
		"$(sum_metric_columns "$wd/bowtie/logs/separate/${filename}.mapping.metrics")" \
		> "$wd/bowtie/logs/merged/${filename}.mapping.merged"
done < "$wd/file.names"

# Combine summed metrics into single table (maintain order of file.names using xargs)
# The header line comes first, followed by each <name>.mapping.merged file in
# file.names order; everything is written straight to merged.mapping.txt.
cd $wd/bowtie/logs/merged
{
	echo "Sample\tMappingInput\tUnMapped\tUniqueMapping\tMultimapping\tMismatchFiltered"
	xargs -Ihello echo hello.mapping.merged < $wd/file.names | xargs cat
} > $wd/metrics/merged.mapping.txt
echo "\n Finished outputting mapping metrics"

# Combine all metrics into single table
cd $wd/metrics
# Compare the first column of both tables; cmp prints a message only when they differ
awk '{print $1}' merged.mapping.txt > temp.a
awk '{print $1}' Adapter.demux.metrics > temp.b
order_diff=$(cmp temp.a temp.b)
if [ -z "$order_diff" ]
then
	echo "Merging logs"
	# selected columns of the read-processing table, then of the mapping table
	awk -F"\t" -v OFS="\t" '{print $1,$2,$3,$6,$10}' Adapter.demux.metrics > temp.a
	awk -F"\t" -v OFS="\t" '{print $2,$3,$4,$5,$6}' merged.mapping.txt > temp.b
	paste temp.a temp.b > $wd/metrics/QC.final.metrics
	rm temp.a temp.b
else
	echo "Log files are in different order, cannot merge"
fi

#obtain reads aligning to each target using Yeo lab python script
echo "\n 8. Extract numbers of reads mapping to each target"
echo "..."
# Command template for GNU parallel; {} is replaced by each name from sample.names.
# The ~ is left unexpanded here and resolved by the shell that runs each job.
count_cmd="samtools view $wd/bowtie/sorted/{}.sort.bam"
count_cmd="$count_cmd | ~/opt/eclip-0.3.99/gscripts/gscripts/general/count_aligned_from_sam.py"
count_cmd="$count_cmd > $wd/bowtie/metrics/{}.mapping.summary.txt"
parallel -j$filenum "$count_cmd" < $wd/sample.names

#condense summary data into table
cd $wd/bowtie/metrics

# Append the read count of one target to an accumulator file.
#   $1 - target pattern to look for in the summary file
#   $2 - per-sample mapping summary file
#   $3 - accumulator file to append to
# Appends the second field of every line matching the pattern, or a single
# "0" when no line matches.
append_target_reads() {
	if grep -q -- "$1" "$2"; then
		awk -v target="$1" '$0 ~ target {print $2}' "$2" >> "$3"
	else
		echo "0" >> "$3"
	fi
}

# The accumulators are only ever appended to, so start from a clean slate:
# leftovers from an interrupted run would otherwise shift every row.
rm -f Samples STATHRawReads SARS_COV2Reads SARS_COV2SpikeReads InfluenzaReads

while read -r file; do
	# all summary files whose name starts with this fastq name
	ls "$file"* > temp.files
	while read -r i; do
		append_target_reads "STATH" "$i" STATHRawReads
		append_target_reads "NC_045512" "$i" SARS_COV2Reads
		append_target_reads "SARS_SPIKE" "$i" SARS_COV2SpikeReads
		append_target_reads "Influenza" "$i" InfluenzaReads
		# sample name = summary file name without its .mapping.summary.txt suffix
		printf '%s\n' "${i%.mapping.summary.txt}" >> Samples
	done < temp.files
done < "$wd/file.names"

echo "Samples\tSTATHRawReads\tSARS_COV2Reads\tSARS_COV2SpikeReads\tInfluenzareads" > $wd/metrics/summary.reads.txt
paste Samples STATHRawReads SARS_COV2Reads SARS_COV2SpikeReads InfluenzaReads >> $wd/metrics/summary.reads.txt
rm -f temp.files
rm Samples *Reads
echo "Finished"

# Report the time spent since pstart was last reset (start of step 7), i.e. the
# metrics and summary-table steps.
pend=$(date +%s)
runtime=$((pend - pstart))
# This point follows the metrics tables, not the alignment step, so say so
echo "Finished generating metrics"
echo "$(printf '%dh:%dm:%ds' $((runtime / 3600)) $((runtime % 3600 / 60)) $((runtime % 60)))" "runtime to generate metrics + alignment tables\n"


# output bigwig files
# mkdir -p $wd/bw
# cd $wd/bowtie/sorted
# for i in *bam; do 
# 	source $wd/bam2_bw.sh $i $wd/bw
# 	; done 

# Return to the working directory and report the total runtime since zstart
cd $wd
zend=$(date +%s)
runtime=$((zend - zstart))
total_elapsed=$(printf '%dh:%dm:%ds' $((runtime / 3600)) $((runtime % 3600 / 60)) $((runtime % 60)))
echo "\n Script complete!"
echo "$total_elapsed" " runtime"

