
################################################################################
# Alisa Yurovsky, Futcher Lab, Stony Brook University, 2014
#
# for ribosome profiling data 
#
# outputs the RRT values for each codon for each codon window position (unless number of output positions is modified by the options
#
# the optional permutations output file provides a pvalue for each codon at each window position
#
# the optional windows output file provides all the window strings accepted for the computation
#
################################################################################
#!/usr/bin/perl
use Getopt::Long;
use warnings; use strict;

my $stage1_file = "";
my $codons_file = "";
my $output_file = "";
my $output_windows_file = "";
my $permutations_file = "";
my $num_perm = 100;
my $normalization_begin_pos = 1;
my $normalization_end_pos = 10;
my $discard_matches = 0;

GetOptions("b=i" => \$normalization_begin_pos, 	# optional, default position in a 10 codon window is 1
			"c=s" => \$codons_file,
			"d=i" => \$discard_matches,			# optional, specify any non-zero integer, modify the "matches" hash in the code to ignore desired string matches
			"e=i" => \$normalization_end_pos, 	# optional, default position in a 10 codon window is 10
            "o=s" => \$output_file,
			"p=s" => \$permutations_file, 		# optional, specifies the file to output the results of the permutations
			"q=i" => \$num_perm,				# optional, default number is 100  
			"s=s" => \$stage1_file,
			"w=s" => \$output_windows_file); 	# optional, will print out only the accepted windows 

open(STAGE1, $stage1_file) || die("Cannot open the file from stage1");
open(COD, $codons_file) || die("Cannot open the codon file, should be a text file with one codon per line\n");
open(OUT, ">" . $output_file) || die("Cannot open the output file\n");
print "\n";
if ($output_windows_file ne "") {
	open(OUTWINDOW, ">" . $output_windows_file) || die("Cannot open the accepted windows output file\n");
}
if ($permutations_file ne "") {
	open(PERM_OUT, ">" . $permutations_file) || die("Cannot open the permutations output file\n");
}
if ($num_perm > 10000) {
	print "Greater than 10K permutations will probably take too long; consider exiting and re-running with fewer\n"; 
}
if ($num_perm < 10) {
	die("Fewer than 10 permutations does not make sense\n");
}
if ($normalization_begin_pos < 1) {
	die("Normalization begin position must be >= 1\n");
}
if ($normalization_end_pos > 10) {
	die("Normalization end position must be <= 10\n");
}
if ($normalization_begin_pos >= $normalization_end_pos) {
	die("Normalization begin position must be less than normalization end position");
}
if ($discard_matches !=0 ) {
	print "Accepted windows will be further pruned if they match certain strings in the matches hash; modify matches hash as necessary\n";
}

print "Opened input/output files and checked parameters\n";

################################################################################
# MODIFY THE MATCHES HASH AS DESIRED 
my %matches;
my $SD = "AGGAGG";
# 6-match
$matches{$SD} = 1; 
# 5-matches
$matches{"AGGAG"} = 1; 
$matches{"GGAGG"} = 1; 
foreach my $pos (0..5) { 
    foreach my $letter ("A","T","C","G") {
        my $seq = substr($SD,0,$pos) . $letter . substr($SD,$pos+1,5-$pos);
        if(!defined($matches{$seq})) {
            $matches{$seq} = 1;
        }
    }
}
# 4-matches
$matches{"AGGA"} = 1; 
$matches{"GGAG"} = 1; 
$matches{"GAGG"} = 1;
foreach my $pos1 (0..4) {
    foreach my $pos2 ($pos1+1..5) { 
        if($pos1 == $pos2) {
            next;
        }
        foreach my $letter1 ("A","T","C","G") { 
            foreach my $letter2 ("A","T","C","G") {
                my $seq = substr($SD,0,$pos1) . $letter1 . substr($SD,$pos1+1,$pos2-$pos1-1) . $letter2 . substr($SD,$pos2+1,5-$pos2);
                if(!defined($matches{$seq})) {
                    $matches{$seq} = 1; 
                }
            }
        }
    }
} 
# 3-matches
$matches{"AGG"} = 1;
$matches{"GGA"} = 1; 
$matches{"GAG"} = 1;
# 2-matches
$matches{"GG"} = 1;

################################################################################

sub fisher_yates_shuffle {#pass array reference and the array will be randomly permuted in a random way
    my $array=shift;
        my $i;
    for ($i=@$array;--$i;) {
                my $j=int rand ($i+1);
            next if $i==$j;
                @$array[$i,$j]=@$array[$j,$i];
        }
}
################################################################################

# read in the codons
my %codon_stats;
my %permuted_stats;
while(my $line = <COD>) {
    chomp $line;
	$codon_stats{$line}{"num_considered"} = 0;
	for(my $i = ($normalization_begin_pos -1); $i < $normalization_end_pos; $i++) {
		$codon_stats{$line}{$i} = 0;
		my @zeros = (0) x $num_perm;
		$permuted_stats{$line}{$i} = \@zeros;
	}
}

################################################################################

# get the windows and make selections
while(my $gene = <STAGE1>) {
	chomp $gene;
	my $seq = <STAGE1>;
	chomp $seq;
	my $starts_string = <STAGE1>;
	chomp $starts_string;
	my @ss = split /,/, $starts_string;
	my $seq_end = length($seq) - 30;
	for(my $i = 27; $i < $seq_end; $i += 3)  {
		################################################################################
		# check that there are at least 20 reads and no more than 1 instance of the codon we are looking at
		my $codon = substr($seq,$i,3);
		#print "Processing codon " . $codon . ", at index " . $i . "\n";
		my $total_starts = 0;
		my $total_starts_for_normalization = 0;
		my $non_zero_starts = 0;
		my $codon_repeats = 0;
		for(my $j = $i - 27; $j < $i + 30; $j += 3) {
			if($j <= $i) { # checks number of reads only in the 10 codon window
				$total_starts += $ss[($j/3)];
				# check the range for normalization
				my $stats_idx = ($i - $j)/3;
				if (($stats_idx < $normalization_end_pos) && ($stats_idx >= ($normalization_begin_pos-1))) { # stats are zero-indexed
					$total_starts_for_normalization += $ss[($j/3)];
				}
				# check the non-zero starts
				if ($ss[($j/3)] > 0) {
					$non_zero_starts++;
				}
			}
			if (($j != $i) && (substr($seq,$j,3) eq $codon)){
				$codon_repeats = 1;
				last;
			}
		}
		################################################################################
		# check general window acceptance conditions
		if(($codon_repeats > 0) || ($total_starts < 20) || ($non_zero_starts < 3)) {
			next;
		}
		# if discard_matches option is specified, reject windows if they match certain strings
		if ($discard_matches != 0) {
			my $search_string = substr($seq,$i-27,60);
			my $found_match = 0;
			foreach my $sequence (keys %matches) {
				if (index($search_string,$sequence) != -1) {
					$found_match++;
					last;
				}
			}
			if($found_match > 0) {
				next;
			}
		}
		################################################################################
		# update the actual stats
		for(my $j = $i - 27; $j <= $i; $j += 3) {	
			my $stats_idx = ($i - $j)/3;
			if (($stats_idx < $normalization_end_pos) && ($stats_idx >= ($normalization_begin_pos-1))) { # stats are zero-indexed
				if($total_starts_for_normalization != 0) {
           			$codon_stats{$codon}{$stats_idx} += ($ss[($j/3)])/$total_starts_for_normalization; # normalize so all values in window sum to 1
				}
			}
		}
		$codon_stats{$codon}{"num_considered"}++;
		################################################################################
		# get the "window" of 60 nt and output if the option was supplied
		if ($output_windows_file ne "") {
			print OUTWINDOW substr($seq,$i-27,57) . "\n";
		}
		################################################################################
		# if permutations option was supplied, run the permutation and collect the stats
		if ($permutations_file ne "") {
			# create an array that I will be permuting
			my @to_shuffle;
			for(my $j = $i - 27; $j <= $i; $j += 3) {	
				my $stats_idx = ($i - $j)/3;
				if (($stats_idx < $normalization_end_pos) && ($stats_idx >= ($normalization_begin_pos-1))) { # stats are zero-indexed
					if($total_starts_for_normalization != 0) {
						push(@to_shuffle, ($ss[($j/3)])/$total_starts_for_normalization);
					} else {
						push(@to_shuffle, 0);
					}
				}
			}
			# now perform the shuffles and assign the values to all permutations
			for(my $k = 0; $k < $num_perm; $k++) {
				&fisher_yates_shuffle(\@to_shuffle);	
				for(my $n = ($normalization_begin_pos - 1); $n < $normalization_end_pos; $n++) {
					${$permuted_stats{$codon}{$n}}[$k] += $to_shuffle[$n];
				}		
			}		
		}
	} 
}


################################################################################

# get the averages and print
foreach my $codon (keys %codon_stats) {
	print OUT $codon;
	for(my $i = ($normalization_begin_pos - 1); $i < $normalization_end_pos; $i++) {
		if ($codon_stats{$codon}{"num_considered"} == 0) {
			printf OUT ",%.6f", 0; #$codon_stats{$codon}{$i};
		} else {
			printf OUT ",%.6f", ($codon_stats{$codon}{$i}/$codon_stats{$codon}{"num_considered"});
		}
	}
	print OUT "," . $codon_stats{$codon}{"num_considered"} . "\n";
} 

################################################################################

if ($permutations_file ne "") {
	foreach my $codon (keys %codon_stats) {
		print PERM_OUT $codon;
		for(my $i = ($normalization_begin_pos - 1); $i < $normalization_end_pos; $i++) {
			my $num_larger = 0;
			my $num_smaller = 0;
			my $score = 0;
			if ($codon_stats{$codon}{"num_considered"} == 0) {
				$score = 1; 
				$num_larger = $num_perm;
			} else {
				$score = ($codon_stats{$codon}{$i}/$codon_stats{$codon}{"num_considered"});
				foreach my $val (@{$permuted_stats{$codon}{$i}}) {
					if(($val/$codon_stats{$codon}{"num_considered"}) >= ($codon_stats{$codon}{$i}/$codon_stats{$codon}{"num_considered"})) {
						$num_larger++;
					}
					if(($val/$codon_stats{$codon}{"num_considered"}) <= ($codon_stats{$codon}{$i}/$codon_stats{$codon}{"num_considered"})) {
						$num_smaller++;
					}
				}
			}
			my $median = 1/($normalization_end_pos - $normalization_begin_pos + 1);
			if ($score > $median) { # greater than the median
				printf PERM_OUT ",%.4f" , ((1+$num_larger)/$num_perm);
			} else {
				printf PERM_OUT ",%.4f" , ((1+$num_smaller)/$num_perm);
			}
		}
		print PERM_OUT "," . $codon_stats{$codon}{"num_considered"} . "\n";
	} 
}

################################################################################
