#########################################################################################################################
# filterSeqsbySignal: this script filters (separates) sequences according to the signal (CPE data) that they contain.
# It takes as input the signal name, the file with the results from the analysis of motifsearchTreeDecomposed.pl (by Sylvain Foissac),
# the file with the list of sequences, and an output sequence file name
#
# USAGE: perl filterSeqsbySignal.pl filter_name results_file input_Sequence_file output_sequence_file
# EX: perl filterSeqsbySignal-v0.pl 'activation_early_strong3_final' X_tropicalis_results X_tropicalis.NCBIREFSEQ3UTR.fa act_ear_str3.out
#
# By Pedro Gabriel Ferreira pedro.ferreira@crg.es
# Version 0 - 26/02/2008
#########################################################################################################################
### MODULES IMPORT ###
use strict;
use Bio::SeqIO;


########################################################
### GLOBAL VARIABLES
########################################################
# Identifiers of the sequences whose signal matches the requested filter
# (keys are sequence ids taken from the results file; the value is always 1).
my %hashSelectedSeqs;

# Positional command-line arguments, in order: signal (filter) name, results
# file, input sequence file, output sequence file.
my ($filter, $results_file, $input_seq_file, $ouput_seq_file);




########################################################
### READING PARAMETERS
########################################################
# Exactly four arguments are required. On misuse, report on STDERR and leave
# with a non-zero status so that calling shells/pipelines can detect the
# failure (a bare "exit" would report success).
if (scalar(@ARGV) != 4) {
	print STDERR "\n\nWrong usage!\n";
	print STDERR "USAGE: perl filterSeqsbySignal.pl filter results_file seqs_file output_file\n\n";
	exit 1;
}

# The argument count has been validated above, so the assignment is unconditional.
($filter, $results_file, $input_seq_file, $ouput_seq_file) = @ARGV;



########################################################
### OPEN FILES
########################################################

# Open the results file for reading. Three-argument open with a lexical handle
# prevents a file name such as ">foo" or "cmd |" from being interpreted as an
# open mode or a pipe, and the error message names the offending file.
open(my $results_fh, '<', $results_file)
	or die "Cannot open results file '$results_file': $!\n";

# Sequence reader and FASTA writer. No -format is given for the input, so
# format detection is left to Bio::SeqIO.
my $seq_in = Bio::SeqIO->new(-file => $input_seq_file);
my $seq_out = Bio::SeqIO->new('-file' => ">$ouput_seq_file", '-format' => 'fasta');

# Scan the results file: the first whitespace-separated column is the sequence
# identifier, the second one is the signal name.
while (my $line = <$results_fh>) {
	my ($seq_id, $signal) = split(' ', $line);

	# Blank lines and lines with a single column carry no signal name; skip
	# them instead of comparing an undefined value against the filter.
	next unless defined $signal;

	# If it matches the filter, keep the identifier of the sequence.
	$hashSelectedSeqs{$seq_id} = 1 if $signal eq $filter;
}

close($results_fh)
	or warn "Could not close results file '$results_file': $!\n";


### Walk through the input sequences; every sequence whose identifier was
### selected is echoed on STDOUT and written to the output file.
my $n_selected = 0;
while (my $inseq = $seq_in->next_seq) {
	my $seq_id = $inseq->display_id();

	# Not among the selected identifiers: nothing to do for this sequence.
	next unless exists $hashSelectedSeqs{$seq_id};

	print $seq_id, "\n";
	$seq_out->write_seq($inseq);
	$n_selected++;
}

# Final summary: how many sequences were written out.
print "Number of seqs: ", $n_selected, "\n";

exit;
