#!/usr/bin/perl -w
use strict;
use warnings;

use FindBin;
use lib "$FindBin::Bin/../../";


#use DBI;
use Config::EZConf;
use Bio::Annotations;
use Bio::Annotations::Alignment::TranscriptTranslation;
use Statistics::Basic qw(:all);
use Bio::DB::BigWig;
#use Time::HiRes qw ( time  );

# gene file downloaded at ftp://ftp.ncbi.nlm.nih.gov/gene/DATA/
# Run the script: main() is defined further down in this file.
main ();
# Declare every command-line/configuration option of this script on the
# given Config::EZConf object.
#
# Options are organised in five groups, created in this order:
#   'Annotation BD'         - dsn, user, pass
#   'Annotation parameters' - species, annotation_source, annotation_name
#   'Altorf'                - transcript_translation_id
#   'Conservation'          - bigwig_method, window_count,
#                             phylop_bigwig_path, phylop_conservation_attr
#   'MultiThreading'        - min_index, max_index
#
# Parameter: $ezconf - object providing add_group() and add_option().
# Returns:   whatever the last add_option() call returns.
#
# (A call to Bio::Annotations::ezconf_options used to sit here, commented
# out; the connection options are declared explicitly instead.)
sub configure_main {
    my ($ezconf) = @_;

    # Create a group labelled $label and hand back a registrar that adds one
    # option to it. The registrar appends "group => <group>" as the last
    # argument pair of add_option().
    my $open_group = sub {
        my ($label) = @_;
        my $group = $ezconf->add_group (label => $label);
        return sub { $ezconf->add_option (@_, group => $group) };
    };

    # --- connection to the annotation database ---
    my $db_option = $open_group->('Annotation BD');
    $db_option->(
        id => 'dsn', label => 'DSN',
        description => 'Annotation database',
        type => 'string', default => '',
    );
    $db_option->(
        id => 'user', label => 'User',
        description => 'Annotation database username',
        type => 'string', default => 'public',
    );
    $db_option->(
        id => 'pass', label => 'Pass',
        description => 'Annotation database password',
        type => 'string', default => 'public',
    );

    # --- which annotation to work on ---
    my $annotation_option = $open_group->('Annotation parameters');
    $annotation_option->(
        id => 'species', synonyms => [qw/sp/],
        label => 'Species name',
        description => 'The species to work on',
        type => 'string', default => 'Human',
    );
    $annotation_option->(
        id => 'annotation_source', synonyms => [qw/ann/],
        label => 'Annotation source name',
        description => 'The annotation source. Current possibilities are : NCBIGene, UCSC, AceView, EnsEMBL.',
        type => 'string', default => 'UCSCGene',
    );
    $annotation_option->(
        id => 'annotation_name', label => 'annotation_name',
        description => 'If provided, will be used to fetch annotation',
        type => 'string', default => '',
    );

    # --- restrict the run to a single translation ---
    my $altorf_option = $open_group->('Altorf');
    $altorf_option->(
        id => 'transcript_translation_id',
        label => 'studied transcript_translation_id id',
        description => 'db id',
        type => 'integer', default => '',
    );

    # --- conservation data source ---
    my $conservation_option = $open_group->('Conservation');
    $conservation_option->(
        id => 'bigwig_method', label => 'bigwig_method',
        description => 'Call _external_ bigWigSummary or use _internal_ Bio::DB::BigWig',
        type => 'string', default => 'internal',
    );
    $conservation_option->(
        id => 'window_count', label => 'window_count',
        description => 'the number of background window to analyze',
        type => 'integer', default => '1000',
    );
    $conservation_option->(
        id => 'phylop_bigwig_path', label => 'Wig filepath',
        description => 'WIG conservation filepath',
        type => 'string', default => '',
    );
    $conservation_option->(
        id => 'phylop_conservation_attr', label => 'Conservation attribute',
        description => 'Conservation attribute that will be assigned. Possible values are: phyloP100way or phastCons100.',
        type => 'string', default => 'phyloP100way',
    );

    # --- slice of the result array handled by this process ---
    my $slice_option = $open_group->('MultiThreading');
    $slice_option->(
        id => 'min_index', label => 'min_index',
        description => 'min array index',
        type => 'integer', default => -1,
    );
    $slice_option->(
        id => 'max_index', label => 'max_index',
        description => 'max array index',
        type => 'integer', default => -1,
    );
}

# Script entry point.
#
# Parses the configuration, connects to the annotation database, fetches the
# transcript translations ("altorfs") of the selected annotation and, for the
# slice [min_index, max_index) of that list, prints one tab-separated line per
# altorf on STDOUT:
#
#   transcript_translation_id, mean conservation at codon positions 1, 2, 3,
#   altorf accession, transcript accession, gene symbol
#
# A codon position without any conservation value is reported as 'NA', so
# every line always has the same number of columns. Progress and timing
# information goes to STDERR.
#
# Dies when the configuration is unusable (unknown bigwig_method, missing
# bigWigSummary binary, missing bigwig file, missing/negative indices).
sub main {
    my $ezconf = Config::EZConf->new;
    configure_main ($ezconf);
    my $conf = $ezconf->parse_config (@ARGV);

    # Reject unknown methods up front: otherwise no conservation lookup would
    # run at all and every altorf would be reported without any value.
    my $bigwig_method = $conf->{bigwig_method};
    if ($bigwig_method ne 'internal' and $bigwig_method ne 'external') {
        die("Unknown bigwig_method '$bigwig_method': expected 'internal' or 'external'");
    }
    if ($bigwig_method eq 'external' and `which bigWigSummary` eq '') {
        die("bigWigSummary not found in PATH");
    }
    if ($conf->{phylop_bigwig_path} eq '' or !(-e $conf->{phylop_bigwig_path})) {
        die("please provide an AltORF DB conservation bigwig filepath (set phylop_bigwig_path) that match the bigwig conservation file that will be used in analysis. " );
    }

    # -1 is the "not provided" default; any other negative value would make
    # Perl index the result array from its end, which is never intended.
    my $min_index = $conf->{min_index};
    my $max_index = $conf->{max_index};
    if ($min_index < 0 or $max_index < 0){
        die("Please specify minimal and maximal index for processing." );
    }

    print STDERR "Connecting to Annotation DB\n";
    my $schema = Bio::Annotations->connect($conf->{dsn}, $conf->{user}, $conf->{pass});

    my $annotation = get_annotation($schema,$conf);

    print STDERR "fetching altorfs\n";
    my @dbic_trans_rs = _fetch_translations($schema, $conf, $annotation);
    print STDERR "Total altorf for annotation = ". scalar(@dbic_trans_rs) . "\n";

    # max_index is exclusive. Clamp it to the number of fetched rows so the
    # loop never dereferences an undefined array slot.
    my $last_index = $max_index > @dbic_trans_rs ? scalar(@dbic_trans_rs) : $max_index;
    if ($last_index < $max_index) {
        print STDERR "max_index ($max_index) exceeds the number of altorfs, stopping at index $last_index\n";
    }
    my $to_analyse = $last_index - $min_index;
    $to_analyse = 0 if $to_analyse < 0;

    print STDERR "Analyzing codon 3rd conservation for ".$to_analyse." altorfs\n";

    # Open the bigwig file once for the whole run instead of once per altorf.
    my $bw;
    if ($bigwig_method eq 'internal' and $to_analyse > 0) {
        $bw = Bio::DB::BigWig->new(-bigwig => $conf->{phylop_bigwig_path});
    }

    my $processed = 0;
    my $beginning = time();

    for my $i ($min_index .. $last_index - 1) {

        my $analysis_time = time();

        my $dbic_trans = $dbic_trans_rs[$i];

        my $dbic_transcript = $dbic_trans->dbic_transcript();
        my $transcript = Bio::Annotations::Alignment::Transcript->build_single({
            schema => $schema,
            distant_id => $dbic_transcript->transcript_id(),
            assembly => $annotation->assembly(),
            accession => $dbic_transcript->accession(),
            gene => $dbic_transcript->gene()
        });

        my $altorf = Bio::Annotations::Alignment::TranscriptTranslation->build_single({
            schema => $schema,
            distant_id => $dbic_trans->transcript_translation_id(),
            assembly => $annotation->assembly(),
            accession => $dbic_trans->accession(),
            type => $dbic_trans->type(),
            ref_based_location => $dbic_trans->ref_based_location(),
            frame => $dbic_trans->frame(),
            as_kosak_motif => $dbic_trans->as_kosak_motif(),
            transcript => $transcript,
            phastCons100_mean  =>  $dbic_trans->phastCons100_mean(),
            phastCons100_ratio =>  $dbic_trans->phastCons100_ratio(),
            phyloP100way_mean  =>  $dbic_trans->phyloP100way_mean(),
            phyloP100way_ratio =>  $dbic_trans->phyloP100way_ratio()
        });

        # NOTE(review): earlier revisions also called
        # $altorf->get_reference_orf($annotation) and $altorf->locations here
        # but never used the results (the former was stored in a variable
        # declared twice). The calls were dropped; restore them if they are
        # relied on for side effects.

        # Mean conservation for codon positions 1, 2 and 3 (always 3 values).
        my @phylop_by_pos = _mean_conservation_by_codon_pos($altorf, $annotation, $conf, $bw);

        print STDERR "gene=" . $altorf->transcript->gene->symbol . "\n";

        print join("\t",
            $altorf->transcript_translation_id,
            @phylop_by_pos,
            $altorf->accession,
            $altorf->transcript->accession,
            $altorf->transcript->gene->symbol,
        ), "\n";

        print STDERR "done analyzing ".$dbic_trans->transcript_translation_id().". Required time = " . (time() - $analysis_time) . "sec. Total exec time = ".((time() - $beginning) / 60)." min\n";

        # Report progress every 100 altorfs, counting only finished ones.
        $processed++;
        if($processed % 100 == 0){
            print STDERR "Processed " . (($processed / $to_analyse) * 100)  . "% total of translation (".((time() - $beginning) / 60)." min).\n";
        }
    }
    print STDERR "Analysis completed\n";
}

# Fetch the transcript translations to analyse, as a list of rows.
#
# When transcript_translation_id is configured, only that translation is
# searched (with ref_based_location 'orf'); otherwise all 'cryptic'
# translations located in 'utr3' are returned. Both searches are limited to
# genes of $annotation.
#
# Rows are ordered by transcript_translation_id so that the min_index and
# max_index slices taken by separate runs refer to the same rows.
sub _fetch_translations {
    my ($schema, $conf, $annotation) = @_;

    my %where = ('gene.annotation_id' => $annotation->annotation_id());
    if ($conf->{transcript_translation_id} ne "") {
        $where{'me.transcript_translation_id'} = $conf->{transcript_translation_id};
        $where{'me.ref_based_location'}        = 'orf';
    }
    else {
        # going through all the annotation
        $where{'me.type'}               = 'cryptic';
        $where{'me.ref_based_location'} = 'utr3';
    }

    return $schema->resultset('DBIC::TranscriptTranslation')->search(
        \%where,
        {
            join     => { 'dbic_transcript' => 'gene' },
            order_by => ['me.transcript_translation_id'],
        }
    );
}

# Compute the mean conservation of $altorf for codon positions 1, 2 and 3.
#
# Uses the already opened $bw object when bigwig_method is 'internal', and
# the bigwig file path (external lookup) otherwise. The lookup methods are
# expected to return a hash reference of coordinate => conservation value.
#
# Returns a list of exactly three elements; a position for which the lookup
# returned nothing is reported as the string 'NA'.
sub _mean_conservation_by_codon_pos {
    my ($altorf, $annotation, $conf, $bw) = @_;

    my @means;
    for my $codon_pos (1 .. 3) {
        my $conservation = $conf->{bigwig_method} eq 'internal'
            ? $altorf->get_conservation_by_codon_pos_internal($annotation, $bw, $codon_pos)
            : $altorf->get_conservation_by_codon_pos($annotation, $conf->{phylop_bigwig_path}, $codon_pos);

        my @coords = defined $conservation ? keys %{$conservation} : ();
        my $mean   = 'NA';
        if (@coords) {
            $mean = get_avg_conservation($conservation, \@coords);
        }
        push @means, $mean;
    }

    return @means;
}

# Average the conservation values stored under a set of coordinates.
#
# Parameters:
#   $ref_conservation_3nt_by_coords - hash reference, coordinate => value
#   $cons_pos                       - array reference of the coordinates
#                                     (hash keys) to take into account
#
# Returns the result of mean() (imported from Statistics::Basic) applied to
# the looked-up values, in the order of $cons_pos.
sub get_avg_conservation {
    my ($ref_conservation_3nt_by_coords, $cons_pos) = @_;

    my @scores;
    push @scores, $ref_conservation_3nt_by_coords->{$_} for @{$cons_pos};

    return mean(@scores);
}

# Look up the annotation to work on and print a short summary on STDERR.
#
# Parameters:
#   $schema - schema object providing resultset('Annotation')
#   $conf   - configuration hash reference; reads species,
#             annotation_source and annotation_name
#
# Lookup order:
#   1. when annotation_name is non-empty, the annotation with that name;
#   2. otherwise, or when nothing matched, the annotation matching the
#      species common name and the annotation source name.
# In both cases the row with the highest annotation_id wins.
#
# Returns the annotation row. Dies when species or annotation_source is
# empty, or when no annotation can be found.
sub get_annotation{
    my($schema,$conf) = @_;
    # get the choosen annotation
    if($conf->{species} eq '' or $conf->{annotation_source} eq ''){
        die("Annotation information is required!");
    }

    # Newest annotation matching $where. "rows => 1" limits the query to the
    # first row, because ->single warns when a query yields several rows.
    my $latest_matching = sub {
        my ($where) = @_;
        return $schema->resultset('Annotation')->search(
            $where,
            {
                join     => [ {'assembly' => 'species'}, 'annotation_source' ],
                order_by => ['me.annotation_id DESC'],
                rows     => 1,
            }
        )->single();
    };

    # Only search by name when a name was provided: matching on an empty
    # name could otherwise select an unrelated annotation that has no name.
    my $annotation;
    my $wanted_name = $conf->{annotation_name};
    if(defined($wanted_name) and $wanted_name ne ''){
        $annotation = $latest_matching->({ 'me.name' => $wanted_name });
        if(!defined($annotation)){
            print STDERR ("No annotation named '$wanted_name', falling back to species/annotation source\n");
        }
    }

    if(!defined($annotation)){
        $annotation = $latest_matching->({
            'species.common_name'    => $conf->{species},
            'annotation_source.name' => $conf->{annotation_source},
        })
        or die("Cannot find annotation");
    }

    print STDERR ("---Annotation Information---\n");
    print STDERR (" - Specie: ", $annotation->assembly()->species()->common_name(),"\n");
    print STDERR (" - annotation source: ", $annotation->annotation_source()->name(), " ", $annotation->name, "\n");
    print STDERR ("----------------------------\n");

    return $annotation;
}



 
