#!/usr/bin/perl
use warnings;
use strict;

# This script was designed to extract the mappable portion of a 4C library
# from Michelle West.  The library had an embedded 5' barcode, followed by
# some restriction site information, and finally a mappable sequence which 
# we need to extract and map.

# Lookup table of the 6bp barcodes expected at the 5' end of each read.
# Key is the barcode sequence, value is the sample name which gets
# embedded in the name of the output file for that barcode.
my %barcodes = (
    'ATCACG' => 'TSBC_1',
    'CGATGT' => 'TSBC_2',
    'TAGCTT' => 'TSBC_10',
    'GTGGCC' => 'TSBC_20',
);


# Output filehandles, keyed by barcode sequence, plus a 'NoCode' entry
# which collects every read that can't be assigned to a known barcode.
my %fhs;

# The single command line argument is the gzipped FastQ file to split.
my $file = $ARGV[0];

die "Usage: $0 <reads.fastq.gz>\n" unless defined $file and length $file;

foreach my $barcode ((keys %barcodes), 'NoCode') {

    # Known barcodes get both the sequence and the sample name in the
    # output filename (eg _ATCACG_TSBC_1.fastq.gz).  The catch-all file
    # just gets _NoCode.
    my $tag = exists $barcodes{$barcode}
        ? "${barcode}_$barcodes{$barcode}"
        : $barcode;

    my $outfile = $file;

    # If the input name doesn't contain .fastq.gz the substitution does
    # nothing and $outfile would still be the input file, which the gzip
    # redirect below would then truncate.  Refuse to carry on if so.
    $outfile =~ s/\.fastq\.gz/_${tag}.fastq.gz/
        or die "Can't derive output names from '$file' - expected a .fastq.gz file\n";

    warn "Made $outfile from $barcode\n";

    # The redirect needs a shell, so single-quote the filename (escaping
    # any embedded single quotes) to stop spaces or shell metacharacters
    # in the path from being interpreted.
    (my $quoted = $outfile) =~ s/'/'\\''/g;

    open(my $fh, '|-', "gzip -c > '$quoted'")
        or die "Can't open gzip pipe to $outfile: $!";

    $fhs{$barcode} = $fh;

}


# Stream the gzipped FastQ file through zcat.  The list form of open runs
# zcat directly rather than through a shell, so unusual characters in the
# filename can't be misinterpreted.
open(my $in, '-|', 'zcat', $file) or die "Can't run zcat on $file: $!";

# Number of times each 6bp prefix was seen (prefixes containing N are
# ignored).  This is used for the summary printed at the end of the run.
my %barcode_counts;

while (1) {

    # FastQ records are 4 lines each: header, sequence, +header, qualities.
    my $header  = <$in>;
    my $seq     = <$in>;
    my $header2 = <$in>;
    my $quals   = <$in>;

    # Stop at the end of the file (or at a truncated final record).
    last unless defined $quals;

    # Expected read layout: 6bp barcode, 20bp of primer/restriction
    # sequence, and then the mappable sequence.
    my $barcode     = substr($seq, 0, 6);
    my $restriction = substr($seq, 6, 20);

    if ($barcode !~ /N/) {
        ++$barcode_counts{$barcode};
    }

    if (exists $barcodes{$barcode} and $restriction eq 'AGCTGCTGGGAGGAGACATG') {
        # We should be able to leave the CATG restriction site in place for
        # mapping but for the moment we're removing it.
        $seq   = substr($seq, 26);
        $quals = substr($quals, 26);

        # If the sequence is just the genomic read-through sequence
        # then we want to remove this read altogether.
        if (index($seq, 'GTGAACCAGAGTTTCATCTGCGACCCGGACGACG') == 0) {
            next;
        }
    }
    else {
        # Unknown barcode or missing restriction sequence - the read is
        # written untrimmed to the catch-all file.
        $barcode = 'NoCode';
    }

    print {$fhs{$barcode}} $header, $seq, $header2, $quals;

}

# Closing a pipe waits for the child and reports its exit status, so this
# is where a missing or corrupt input file actually gets noticed.
close($in)
    or die($! ? "Error closing zcat pipe for $file: $!\n"
              : "zcat exited with status " . ($? >> 8) . " reading $file\n");

# Buffered write errors and gzip failures only show up when the output
# pipes are closed, so close them explicitly and check the results.
foreach my $code (sort keys %fhs) {
    close($fhs{$code})
        or die($! ? "Error closing output for $code: $!\n"
                  : "gzip exited with status " . ($? >> 8) . " writing output for $code\n");
}

# Print the most frequently observed barcodes (highest count first) as
# tab delimited "barcode<TAB>count" lines on STDOUT.
my @barcodes = sort { $barcode_counts{$b} <=> $barcode_counts{$a} } keys %barcode_counts;

# Report at most 10 barcodes, but never index past the end of the list -
# small or empty inputs can easily contain fewer than 10 distinct barcodes.
my $last_index = $#barcodes < 9 ? $#barcodes : 9;

for my $i (0 .. $last_index) {
    print join("\t", $barcodes[$i], $barcode_counts{ $barcodes[$i] }), "\n";
}

