#!/bin/sh

# Example script that determines the size distribution of a segment between two constant sequence motifs. Sequences that do not contain both motifs are excluded from the analysis. Sequences with the same segment length are combined into a single file.

# Output is also generated for collapsed files. The length distribution is stored in the file with the suffix "length_dist.txt". 

# Note that unconstrained length distributions from a four-line-per-record (FASTQ) file can simply be generated by using "cat <file> | awk '{if(NR%4==2) print length($1)}' | sort -n | uniq -c > read_length.txt". For a two-line-per-record FASTA file, use NR%2==0 instead.

# Constant motifs flanking the variable segment, the longest segment length
# that is scanned, and the directory that receives all output.
motif_5p="GGCGCGAAATTAATACGACT"
motif_3p="CACGCTGCCCGTATGTATAT"
max_len=100
out_dir="sizedist_output"

# main <sequence_file>
#
# For every segment length i from 0 to $max_len:
#   * writes each line of <sequence_file> that starts with the 5' motif,
#     followed by exactly i characters and the 3' motif, together with the
#     line preceding it, to  $out_dir/L<i>_<name>
#   * runs fastx_collapser on that file, writing to
#     $out_dir/collapsed/coll_L<i>_<name>
#   * appends "<i> <number of lines in L<i>_<name> divided by 2>" to
#     $out_dir/<name>_length_dist.txt
# where <name> is the file name of <sequence_file> without its directory part.
#
# Any existing $out_dir is removed first, but only after the argument has
# been validated, so a usage error never destroys previous results.
#
# Written for POSIX sh (no bash-only loop syntax); variables are therefore
# global. GNU grep is required for --no-group-separator.
#
# Returns: 0 on success, 2 on a usage error, 1 if the output directory cannot
#          be created or grep fails.
main() {
	if [ "$#" -ne 1 ] || [ ! -f "$1" ] || [ ! -r "$1" ]; then
		printf 'Usage: %s <sequence_file>\n' "${0##*/}" >&2
		return 2
	fi

	infile=$1
	# Only the file name is used for output names, so inputs given with a
	# directory component do not produce paths into missing sub-directories.
	name=${infile##*/}
	dist_file="$out_dir/${name}_length_dist.txt"

	rm -rf -- "$out_dir"
	mkdir -p -- "$out_dir/collapsed" || return 1

	i=0
	while [ "$i" -le "$max_len" ]; do
		echo "extracting $i/$max_len"
		hits="$out_dir/L${i}_${name}"

		# -B1 keeps the line before each match (the record header).
		# grep exits 1 when nothing matches, which is a normal result here;
		# only a status above 1 signals a real error.
		status=0
		grep -E -B1 --no-group-separator \
			-e "^${motif_5p}.{${i}}${motif_3p}" -- "$infile" > "$hits" || status=$?
		if [ "$status" -gt 1 ]; then
			printf 'error: grep failed on %s\n' "$infile" >&2
			return 1
		fi

		# The exit status is deliberately not checked (as in the original
		# script), so a failure for one length does not stop the scan.
		fastx_collapser -i "$hits" -o "$out_dir/collapsed/coll_L${i}_${name}" -v

		# Two output lines (header + sequence) are written per matching read.
		printf '%s %s\n' "$i" "$(awk 'END { print NR / 2 }' "$hits")" >> "$dist_file"

		i=$((i + 1))
	done
}

main "$@"
