#Calculates the averages of values in an sgr-file that fall within regions supplied as a gff-file
#The output is a text file named "RegionMeanSameSort6Out.txt"
#IMPORTANT: Both the sgr-file and the gff-file need to be sorted in the same way (first by chromosome and then by start position)
#All necessary files and parameters are asked for by the script when executed
#Per Stenberg, per.stenberg@umu.se

# Interactive setup: asks on STDIN for the two input files and the four
# calculation parameters, and opens both input files for reading.
#
# Results are left in package globals ($sgr_file, $gff_file, $min_nr,
# $fail_value, $top_choice, $top_nr) and in the bareword handles IN_SGR and
# IN_GFF, because the rest of the script reads them under those names.

# Prints $prompt followed by a newline, reads one line from STDIN and returns
# it without its line ending. End-of-input is returned as an empty string.
sub ask_user {
	my ($prompt) = @_;
	print "$prompt\n";
	my $answer = <STDIN>;
	$answer = '' unless defined $answer;
	chomp($answer);
	return $answer;
}

# Opens the file named $path for reading on the bareword handle $handle_name
# (in package main) and dies with the file name and OS error on failure.
# The three-argument form of open is used so that characters such as '>', '<'
# or '|' typed as part of the path are never interpreted as a mode or command.
sub open_input_file {
	my ($handle_name, $path) = @_;
	no strict 'refs';	# the handle is addressed by name so callers can keep using IN_SGR / IN_GFF
	open(\*{"main::$handle_name"}, '<', $path) or die "Can't open input file '$path': $!\n";
	return;
}

$sgr_file=ask_user("Enter sgr-file name/path and press ENTER:");
$sgr_file=~s/^\s+|\s+$//g;	# two-argument open used to strip surrounding whitespace; keep accepting such input
open_input_file('IN_SGR', $sgr_file);

$gff_file=ask_user("Enter gff-file name/path and press ENTER:");
$gff_file=~s/^\s+|\s+$//g;
open_input_file('IN_GFF', $gff_file);

# Minimum number of sgr values that must fall within a region for a
# mean/median to be calculated for it.
$min_nr=ask_user("Set minimum number of values to calculate mean/median:");
# Value written to the output in place of mean/median when a region has too few values.
$fail_value=ask_user("If less than minimum number of values set value to:");
# 0 = use a run of consecutive probes, 1 = use a top percentage of the values.
$top_choice=ask_user("Calculate mean/median from top nr consecutive probes (0) or top percent (1):");
# Size of the run (probes) or the percentage; 0 means "use all values in the region".
$top_nr=ask_user("Set number of top nr consecutive or percent features to calculate mean/median (0=all):");

# Load the whole sgr file (already opened on IN_SGR) into three parallel
# arrays, one entry per input line, taken from the first three tab-separated
# columns:
#   @chr_arr - chromosome name
#   @pos_arr - middle position of the probe
#   @val_arr - enrichment value
# $nr_data_rows ends up holding the number of lines read. A progress message
# is printed each time another 100000 lines have been read.
$nr_data_rows = 0;
while (defined(my $sgr_line = <IN_SGR>)) {
	print "$nr_data_rows rows read from file\n"
		if $nr_data_rows > 1 && $nr_data_rows % 100000 == 0;
	++$nr_data_rows;

	chomp $sgr_line;
	my @columns = split /\t/, $sgr_line;
	push @chr_arr, $columns[0];
	push @pos_arr, $columns[1];
	push @val_arr, $columns[2];
}
close IN_SGR;
print "Done reading $nr_data_rows rows of data!\n";

# Per-region statistics.
#
# For every line of the gff file (already opened on IN_GFF) the probes loaded
# into @chr_arr / @pos_arr / @val_arr that lie on the same chromosome
# (column 1) and between start (column 4) and end (column 5), inclusive, are
# collected, and one line "chrom_start_end <TAB> average <TAB> median" is
# written to RegionMeanSameSort6Out.txt. When $top_nr is 0 (use all values) a
# fourth column with the sum is added for regions that contain probes.
#
# Both inputs must be sorted the same way (by chromosome, then position): the
# scan for a region starts at the first probe of the previously matched
# region and gives up as soon as it has passed the region end on the same
# chromosome.
#
# NOTE: the script runs without "use strict"; the data arrays and the
# parameters are package globals set earlier in the file.

# Returns the median of $count consecutive elements of @$sorted (which must
# be sorted numerically ascending), starting at index $first.
sub median_of_sorted_slice {
	my ($sorted, $first, $count) = @_;
	my $mid = $first + int($count / 2);
	return $sorted->[$mid] if $count % 2;
	return ($sorted->[$mid - 1] + $sorted->[$mid]) / 2;
}

# region_stats(\@values, $min_nr, $fail_value, $top_choice, $top_nr)
#
# Computes the statistics for the values of one region (in file order).
#   $top_nr == 0      : mean, median and sum of all values.
#   $top_choice == 0  : mean and median of the run of $top_nr consecutive
#                       values with the highest mean (first such run on ties).
#   $top_choice == 1  : mean and median of the highest $top_nr percent of the
#                       values (count rounded down, capped at all values).
# Returns the list (average, median, sum). The sum is only defined in the
# $top_nr == 0 mode. All three are $fail_value when the region has fewer than
# $min_nr values, when the requested run/percentage cannot be formed from the
# available values, or when $top_choice is neither 0 nor 1 - previously these
# cases leaked a sentinel (-9999999) or the results of an earlier region.
sub region_stats {
	my ($values, $min_nr, $fail_value, $top_choice, $top_nr) = @_;
	my $count = scalar @{$values};
	my @failed = ($fail_value) x 3;

	return @failed if $count == 0 || $count < $min_nr;

	if ($top_nr == 0) {
		my $sum = 0;
		$sum += $_ for @{$values};
		my @sorted = sort { $a <=> $b } @{$values};
		return ($sum / $count, median_of_sorted_slice(\@sorted, 0, $count), $sum);
	}

	if ($top_choice == 0) {
		# The run length must be a whole number that fits inside the region.
		return @failed if $top_nr < 1 || $top_nr > $count || $top_nr != int($top_nr);

		my ($best_av, $best_first);
		for my $first (0 .. $count - $top_nr) {
			my $window_sum = 0;
			$window_sum += $values->[$_] for $first .. $first + $top_nr - 1;
			my $window_av = $window_sum / $top_nr;
			if (!defined $best_av || $window_av > $best_av) {
				$best_av = $window_av;
				$best_first = $first;
			}
		}
		# Only the winning run needs to be sorted for its median.
		my @window = sort { $a <=> $b } @{$values}[$best_first .. $best_first + $top_nr - 1];
		return ($best_av, median_of_sorted_slice(\@window, 0, $top_nr), undef);
	}

	if ($top_choice == 1) {
		my @sorted = sort { $a <=> $b } @{$values};
		my $top_count = int($count * ($top_nr / 100));
		$top_count = $count if $top_count > $count;	# more than 100 percent: use everything
		return @failed if $top_count < 1;

		my $first = $count - $top_count;
		my $sum = 0;
		$sum += $_ for @sorted[$first .. $count - 1];
		return ($sum / $top_count, median_of_sorted_slice(\@sorted, $first, $top_count), undef);
	}

	return @failed;	# unknown selection mode
}

open OUT, '>', "RegionMeanSameSort6Out.txt" or die "Can't open outfile!\n";
print OUT "Region\tAverage\tMedian\tSum\n";

my $scan_start = 0;	# index of the first probe of the most recently matched region
while (defined(my $gff_line = <IN_GFF>)) {
	chomp $gff_line;
	my @gff = split /\t/, $gff_line;
	my ($chrom, $start, $end) = @gff[0, 3, 4];
	my @region_values;

	for (my $i = $scan_start; $i < $nr_data_rows; $i++) {
		next if $chr_arr[$i] ne $chrom;
		last if $pos_arr[$i] > $end;	# sorted input: nothing further on this chromosome can match
		next if $pos_arr[$i] < $start;

		# First probe inside the region: remember it as the starting point
		# for the next region, then collect every probe up to the region end.
		# The bound is tested first so the arrays are never read past their end.
		$scan_start = $i;
		while ($i < $nr_data_rows && $chr_arr[$i] eq $chrom && $pos_arr[$i] <= $end) {
			push @region_values, $val_arr[$i];
			$i++;
		}
		last;
	}

	my $has_probes = scalar @region_values;
	my ($av, $median, $sum) = ($fail_value, $fail_value, $fail_value);
	if ($has_probes) {
		($av, $median, $sum) = region_stats(\@region_values, $min_nr, $fail_value, $top_choice, $top_nr);
	}

	# Regions without any probe keep the two-value layout; regions with
	# probes get the extra sum column in "use all values" mode.
	print OUT "${chrom}_${start}_${end}\t$av\t$median";
	print OUT "\t$sum" if $has_probes && $top_nr == 0;
	print OUT "\n";
}
close IN_GFF;
# Buffered write errors only surface when the handle is closed.
close OUT or die "Can't close outfile!\n";