#!/usr/bin/perl
use strict;
use warnings;

=head1 Description

This script is used to get methylation level for each CpG. The two paired Cs are merged.

=head1 Usage

perl get_CpG_ML.pl <CpG position file> <Bismark output>  > output

=head1 Version

v1.0, Ming-an Sun, July 18, 2016

=cut

die `pod2text $0` unless @ARGV==2;

my ($cpg_file, $bismark_file) = @ARGV;

warn "Reading $cpg_file ...\n";
my %cpgs;
open(CG, $cpg_file=~/.gz$/ ? "zcat $cpg_file |" : $cpg_file)||die"Cannot open $cpg_file\n";
while(<CG>){
    if(/^(\S+)\s+(\d+)/){
        $cpgs{$1}{$2}{0} = -1;
    }
}
close CG;

warn "Parsing $bismark_file ...\n";
open(METH, $bismark_file=~/.gz$/ ? "zcat $bismark_file |" : $bismark_file)||die"Cannot open $bismark_file\n";
my $hd = <METH>;
seek(METH, 0,0) unless $hd=~/^bismark/i;
while(<METH>){
    my ($id, $patt, $chr, $pos) = split;
    my $pos_adj;
    if(defined $cpgs{$chr}{$pos}){
        $pos_adj = $pos;
    }
    elsif(defined $cpgs{$chr}{$pos-1}){
        $pos_adj = $pos-1;
    }
    else{
        warn "Undefined: $chr $pos\n";
        next;
    }

    if($cpgs{$chr}{$pos_adj}{0} == -1){
        $cpgs{$chr}{$pos_adj}{0} = 0;
        $cpgs{$chr}{$pos_adj}{1} = 0;
    }
    if($patt eq '+'){
        $cpgs{$chr}{$pos_adj}{1}++;
    }
    else{
        $cpgs{$chr}{$pos_adj}{0}++;
    }
}
close METH;

warn "Output results ...\n";
foreach my $chr (sort keys %cpgs){
    foreach my $pos (sort {$a <=> $b} keys %{$cpgs{$chr}}){
        next if $cpgs{$chr}{$pos}{0} == -1;
        my $meth = $cpgs{$chr}{$pos}{1};
        my $all  = $cpgs{$chr}{$pos}{0} + $cpgs{$chr}{$pos}{1};
        my $ml = sprintf("%.4f", $meth/$all);
        print "$chr\t$pos\t$ml\t$all\t$meth\n";
    }
}

warn "Job finished!\n";
__END__
sunm2@cn2589:/home/sunm2/work/21.placenta/03.DNAm/04.pmr:) head hg19.CpG 
chrM	162
chrM	170
sunm2@cn2589:/home/sunm2/work/21.placenta/03.DNAm/04.pmr:) zcat placenta_chr1.rmdup.txt.gz | head
Bismark methylation extractor version v0.15.0
SRR847325.8	-	chr1	205600817	z
SRR847325.8	-	chr1	205600809	z
SRR847325.8	-	chr1	205600804	z
SRR847325.8	-	chr1	205600798	z

