#!/bin/bash
#
# Sort the reads of every aligned FASTA file in a directory into wild-type
# (WT), HDR, indel and SNP classes, and append the read count of each class
# to Counts.txt in that directory.
#
# Usage: <script> [DIRECTORY]      (DIRECTORY defaults to $DIRECT below)
#
# Input format assumed by every step below:
#   * each record is exactly two lines: a header line followed by a single
#     sequence line;
#   * the header carries the read count as its second "-"-separated field,
#     e.g. ">read7-42" stands for 42 reads.
#
# For each <name>.fasta these files are rewritten next to it:
#   <name>_wt.fasta     records matching the WT reference
#   <name>_edits.fasta  every record that is not WT
#   <name>_hdr.fasta    edited records matching the HDR reference
#   <name>_NOhdr.fasta  edited records that are not HDR
#   <name>_indel.fasta  non-HDR records whose gap count differs from the
#                       rounded average gap count of the WT reads
#   <name>_snp.fasta    the remaining non-HDR records
#
# NOTE(review): the derived files also end in ".fasta", so running the script
# a second time on the same directory will pick them up as inputs.

#Specify Directory

DIRECT="./Unique_Align/"

# Wild-type reference; gaps (-) are tolerated at any single position in it.
WT_0="TGTTTGGCTCGCCAGGATACCACGCCTGGAAATCAAACGCCGTGCCGTCAATCCACTCAAACACGGTGGGGTCTTCGGCAGAGCGAGCCCCACCAAGCCA"

# HDR insert; one base may be missing or replaced by gaps.
HDR_0="TTTATTTAATTAAATAAA"

# Print the sum of the read counts found in the headers of file $1.
# Every line containing ">" is treated as a header. Prints 0 if there is none.
sum_read_counts() {
	awk '
		/>/ { split($0, part, "-"); total += part[2] }
		END { printf "%.0f\n", total }
	' "$1"
}

# Print the sum over all records of file $1 of
# (number of "-" in the sequence line) * (read count of its header).
weighted_gap_sum() {
	awk '
		NR % 2 == 1 { split($0, part, "-"); reads = part[2] + 0; next }
		{ total += gsub(/-/, "-") * reads }
		END { printf "%.0f\n", total }
	' "$1"
}

# Move every record of file $2 whose sequence line matches the extended
# regular expression $1 to the end of file $3; the other records stay in $2
# in their original order.
move_matching_records() {
	local pattern=$1 src=$2 dest=$3
	local keep

	# Most patterns match nothing; skip the rewrite of $src in that case.
	grep -qE -- "$pattern" "$src" || return 0

	keep=$(mktemp "${src}.XXXXXX") || return 1
	if awk -v pat="$pattern" -v dest="$dest" '
		NR % 2 == 1 { header = $0; next }
		$0 ~ pat { print header >> dest; print $0 >> dest; next }
		{ print header; print $0 }
		END { if (NR % 2 == 1) print header }
	' "$src" > "$keep" && cat -- "$keep" > "$src"; then
		rm -f -- "$keep"
	else
		rm -f -- "$keep"
		return 1
	fi
}

# Append each record of file $1 to file $3 when its sequence line holds a
# number of "-" different from $2, and to file $4 otherwise.
split_by_gap_count() {
	local src=$1 expected=$2 out_indel=$3 out_snp=$4

	awk -v expected="$expected" -v indel="$out_indel" -v snp="$out_snp" '
		NR % 2 == 1 { header = $0; next }
		{
			dest = (gsub(/-/, "-") != expected) ? indel : snp
			print header >> dest
			print $0 >> dest
		}
	' "$src"
}

# Classify the records of FASTA file $1, write the derived files next to it
# and append one "NAME TOTAL WT EDIT HDR INDEL SNP" row to file $2.
# Prints the rounded average gap count of the WT reads on stdout.
process_fasta() {
	local in_file=$1 count_file=$2
	local root=${in_file%.fasta}
	local out_edits="${root}_edits.fasta"
	local out_wt="${root}_wt.fasta"
	local out_hdr="${root}_hdr.fasta"
	local out_nohdr="${root}_NOhdr.fasta"
	local out_indel="${root}_indel.fasta"
	local out_snp="${root}_snp.fasta"
	local i pattern
	local tot_sum wt_sum edits_sum hdr_sum indel_sum snp_sum
	local wt_gap_wsum wt_gap_avg

	# Start the append-only outputs empty so that leftovers of an earlier
	# run are not counted again.
	: > "$out_wt" || return 1
	: > "$out_hdr" || return 1
	: > "$out_indel" || return 1
	: > "$out_snp" || return 1

	#This module finds WT sequences in MSA; a run of breaks (-) is allowed
	#after any one prefix of the reference
	cp -- "$in_file" "$out_edits" || return 1
	for ((i = 1; i <= ${#WT_0}; i++)); do
		pattern="${WT_0:0:i}"'-*'"${WT_0:i}"
		move_matching_records "$pattern" "$out_edits" "$out_wt" || return 1
	done

	#This module separates HDR edits from all other edits and allows for
	#1 bp deletion: the base before the break point is optional ("X*")
	cp -- "$out_edits" "$out_nohdr" || return 1
	for ((i = 1; i <= ${#HDR_0}; i++)); do
		pattern="${HDR_0:0:i}"'*-*'"${HDR_0:i}"
		move_matching_records "$pattern" "$out_nohdr" "$out_hdr" || return 1
	done

	#Start Quantifying
	tot_sum=$(sum_read_counts "$in_file") || return 1
	wt_sum=$(sum_read_counts "$out_wt") || return 1
	edits_sum=$(sum_read_counts "$out_edits") || return 1
	hdr_sum=$(sum_read_counts "$out_hdr") || return 1

	# Average number of gaps per WT read, rounded to the nearest integer.
	# The gaps are counted in the WT file itself, so that each gap count is
	# weighted by the read count of the same record.
	wt_gap_wsum=$(weighted_gap_sum "$out_wt") || return 1
	if ((wt_sum > 0)); then
		wt_gap_avg=$(((wt_gap_wsum + wt_sum / 2) / wt_sum))
	else
		# No WT reads: there is no baseline, treat any gap as an indel.
		wt_gap_avg=0
	fi
	echo "$wt_gap_avg"

	#Make files for other edits
	split_by_gap_count "$out_nohdr" "$wt_gap_avg" "$out_indel" "$out_snp" \
		|| return 1
	indel_sum=$(sum_read_counts "$out_indel") || return 1
	snp_sum=$(sum_read_counts "$out_snp") || return 1

	echo "$in_file" "$tot_sum" "$wt_sum" "$edits_sum" "$hdr_sum" \
		"$indel_sum" "$snp_sum" >> "$count_file"
}

# Process every *.fasta file of directory $1 (default: $DIRECT).
# Returns non-zero if the directory is missing or any file failed.
main() {
	local dir=${1:-$DIRECT}
	local count_file in_file
	local status=0

	dir=${dir%/}/
	if [[ ! -d $dir ]]; then
		printf 'Error: directory not found: %s\n' "$dir" >&2
		return 1
	fi

	count_file="${dir}Counts.txt"
	echo NAME TOTAL WT EDIT HDR INDEL SNP >> "$count_file" || return 1

	for in_file in "$dir"*.fasta; do
		# An unmatched glob stays literal; skip it instead of processing it.
		[[ -f $in_file ]] || continue

		echo 'Working on...'"$in_file"
		if ! process_fasta "$in_file" "$count_file"; then
			printf 'Error: failed to process %s\n' "$in_file" >&2
			status=1
		fi
	done

	return "$status"
}

main "$@"