#!/usr/bin/perl

## Written by Matthew J. Maurer, 2013
## Version 0.11

use strict;
use warnings;
use Bio::SeqIO;
use Getopt::Std;
#####################################################################
####################################################################################################
our($opt_q, $opt_u, $opt_d, $opt_c, $opt_f, $opt_h);			
getopts('q:u:d:c:f:h');
####################################################################################################
############################################################################################################################
sub helpmsg{
print "The following option is mandatory:
 -q  Query sequence
The following options can be set by the user:
 -u  Distance upstream of the beginning of the query sequence: Default = 0
 -d  Distance downstream from the end of the query sequence: Default = 0
 -c  Coordinate system; 0 or 1 for zero-based or one-based genome coordinate system: Default = 0
 -f  Output format; bed format or list format: Default = bed
     Note: bed format forces zero-based coordinate system
"
}

################################################################################
################################################################################
sub findSeqMatches{
	my($chr, $DNA, $querySeq, $upstreamDistance, $downstreamDistance, $genCoordSys, $outFormat)=@_;

	my $queryRevCompSeq = Bio::Seq->new(-seq => $querySeq)->revcom->seq;

	if ($outFormat eq "bed"){
		$genCoordSys = 0;
	}
	my $lenDNA = length($DNA);
	my $pos = $genCoordSys - 1;
	
	while ($DNA =~ /(?=($querySeq|$queryRevCompSeq))/g) {					
	    my $pos = pos($DNA)+$genCoordSys;
		my $extractedSeq = '';
		my $strand;
		my $begin;
		my $end;
		if ($1 eq $querySeq){
			$strand = "+";
			$begin = $pos-$upstreamDistance;
			$end = $pos+length($querySeq)-1+$downstreamDistance;
			if($begin < $genCoordSys){$begin = $genCoordSys}
			if($end > $lenDNA){$end = $lenDNA}			

			## Extract sequence from $begin to $end
#			unless ($begin > $end){
#				$extractedSeq = substr($DNA, $begin, ($end-$begin+1));			
#			}

			## Extract 20nt sequence upstream of PAM site
			unless ($begin > $end){
				unless ($pos == 0){
					if($pos-21 < $genCoordSys){
						$extractedSeq = substr($DNA, $genCoordSys, $pos-1);
					} else {
						$extractedSeq = substr($DNA, $pos-21, 20);
					}
				}
			}
			
		} elsif ($1 eq $queryRevCompSeq){
			$strand = "-";
			$begin = $pos-$downstreamDistance;
			$end = $pos+length($querySeq)-1+$upstreamDistance;
			if($begin < $genCoordSys){$begin = $genCoordSys}
			if($end > $lenDNA){$end = $lenDNA}			

			## Extract sequence from $begin to $end
#			unless ($begin > $end){
#				$extractedSeq = substr($DNA, $begin, ($end-$begin+1));
#				$extractedSeq = Bio::Seq->new(-seq => $extractedSeq)->revcom->seq;
#			}

			## Extract 20nt sequence upstream of PAM site
			unless ($begin > $end){
				unless ($pos+3 >= $lenDNA){
					if($pos+23 > $lenDNA){
						$extractedSeq = substr($DNA, $pos+3, $lenDNA-$pos-3);
					} else {
						$extractedSeq = substr($DNA, $pos+3, 20);
					}
					$extractedSeq = Bio::Seq->new(-seq => $extractedSeq)->revcom->seq;
				}
			}
			
		}	
		last if $pos < $genCoordSys;
		unless ($begin > $end) {
			if ($outFormat eq "bed"){
				print $chr."\t".$begin."\t".$end."\t".$strand."\t".$extractedSeq."\n";
			} else {
				print $chr.":".$begin."-".$end."\n";
			}
		}
	}
}
################################################################################
if($opt_h){&helpmsg(); die}		#If -h, display help message.

unless ($opt_q){ #If arguments are undefined, then
	print "WARNING!  Query sequence must be specified with -q\n$!";	#Print my own error message,
	&helpmsg();							#display the help message,
	die}								#and end the program.

unless ($opt_u){$opt_u = 0}	# if no -u specified, set = 0
unless ($opt_d){$opt_d = 0}	# if no -d specified, set = 0
unless ($opt_c){$opt_c = 0}	# if no -c specified, set = 0
unless ($opt_f){$opt_f = "bed"}	# if no -f specified, set = bed

##############################
my $stream = Bio::SeqIO->newFh(-fh => \*ARGV ,
                      		   -format => 'fasta');

while ( my $seq = <$stream> ) {
	findSeqMatches($seq->id, $seq->seq, $opt_q, $opt_u, $opt_d, $opt_c, $opt_f);
}
