#!/usr/bin/python
# Intersect NMP-seq lesions with yeast nucleosomes and TFBSs 
from glob import glob
import os 
from os.path import join as jp
# Working folders, relative to the directory the script is run from.
rawFolder = '02-align'                       # input BED files, one per sample
damage = '04-Damage'                         # per-sample lesion BED files
TFBS = '05-Nucleosome-TFBS/TFBS/G'           # TF binding-site results
nuc = '05-Nucleosome-TFBS/Nucleosomes/G'     # nucleosome results

# Reference annotation files (absolute paths on the author's machine).
yeast_sizes = '/Users/pmao/Documents/data/Resources/Yeast/yeast_chromosome_sizes/yeast_genome.txt'  # not used below
abf1_link = '/Users/pmao/Documents/data/Resources/Yeast/yeast_Abf1_Reb_binding_sites_from_John/Abf1_expanded_1001bp_sorted.bed'
reb1_high_link = '/Users/pmao/Documents/data/Resources/Yeast/yeast_Abf1_Reb_binding_sites_from_John/Reb1_high_expanded_1001bp_sorted.bed'
reb1_low_link = '/Users/pmao/Documents/data/Resources/Yeast/yeast_Abf1_Reb_binding_sites_from_John/Reb1_low_expanded_1001bp_sorted.bed'
nuc_link = '/Users/pmao/Documents/data/Resources/Yeast/yeast_nucleosomes_Nature/Nature_nucleosomes_saccer3_0_based_5_sorted.bed'

# (label used in output file names, binding-site BED file), in the order the
# commands are emitted for every sample.
TF_SITES = (
    ('Abf1', abf1_link),
    ('Reb1_high', reb1_high_link),
    ('Reb1_low', reb1_low_link),
)


def intersect_count_cmd(regions_bed, lesions_bed, counted_bed,
                        pipeline='| sort | uniq -c >'):
    """Return the shell command that counts lesions overlapping regions.

    The command runs ``bedtools intersect -wa -wb -sorted`` with
    ``regions_bed`` as ``-a`` and ``lesions_bed`` as ``-b``, collapses
    identical output lines with ``sort | uniq -c`` and redirects the result
    to ``counted_bed``.  ``pipeline`` is the literal text placed between
    ``-sorted`` and the output path; it is a parameter only so that the
    exact spacing of the generated script can be kept.
    """
    return ' '.join(['bedtools intersect -wa -wb -a', regions_bed, '-b',
                     lesions_bed, '-sorted', pipeline, counted_bed])


def distance_cmd(columns, counted_bed, distance_bed):
    """Return the awk command that reduces a counted file to distances.

    ``columns`` is the awk field list to print (tab separated), e.g.
    ``'$8-$5,$6,$10,$1'``; the result is redirected to ``distance_bed``.
    """
    # The double space before '>' is kept so the generated script is
    # unchanged from earlier versions of this generator.
    return ' '.join(['awk -v OFS="\\t" \'{print ' + columns + '}\'',
                     counted_bed + '  >', distance_bed])


def build_sample_script(sample):
    """Return the block of shell commands for one sample.

    ``sample`` is the input file name without its ``.bed`` extension.  The
    returned text holds one command per line and ends with an empty line so
    that consecutive samples are visually separated in the script:

    1. keep columns 1, 2, 3, 6 and 7 of ``<rawFolder>/<sample>.bed``;
    2. keep the rows whose 5th column is ``C`` or ``c`` (the G lesion
       reads) and sort them by chromosome and start;
    3. for each entry of ``TF_SITES``: intersect and count, then print
       damage-distance-to-TFBS-center, motif-strand, damage-strand, count;
    4. the same for nucleosomes, printing damage-distance-to-dyad,
       damage-strand, count.
    """
    ads = jp(damage, sample)  # common prefix of the per-sample lesion files
    damage_bed = ads + '_damage.bed'
    lesions_bed = ads + '_G_lesions_sorted.bed'

    lines = [
        ' '.join(['awk -v OFS="\\t" \'{print $1,$2,$3,$6,$7}\'',
                  jp(rawFolder, sample) + '.bed >', damage_bed]),
        # G lesion reads
        ' '.join(['awk \'$5 == "C"|| $5 == "c"\'',
                  damage_bed + ' | sort -k1,1 -k2,2n >', lesions_bed]),
    ]

    for label, sites_bed in TF_SITES:
        counted = jp(TFBS, sample) + '_' + label + '_counted'
        # counted format: count, chrom, TF-start, TF-stop, TF-center,
        # motif-strand, chrom-damage, damage-start, damage-stop,
        # damage-strand, sequence
        lines.append(intersect_count_cmd(sites_bed, lesions_bed,
                                         counted + '.bed'))
        # distance format: damage-distance-to-tfbs-center, motif-strand,
        # damage-strand, count
        lines.append(distance_cmd('$8-$5,$6,$10,$1', counted + '.bed',
                                  counted + '_distance.bed'))

    nuc_counted = jp(nuc, sample) + '_nucleosomes_counted'
    # counted format: count, chrom, nuc-start, nuc-stop, nuc-center,
    # nuc-score, chrom-damage, damage-start, damage-stop, strand, sequence
    lines.append(intersect_count_cmd(nuc_link, lesions_bed,
                                     nuc_counted + '.bed',
                                     pipeline='|sort| uniq -c >'))
    # distance format: damage-distance-to-dyad, damage_strand, count
    lines.append(distance_cmd('$8-$5,$10,$1', nuc_counted + '.bed',
                              nuc_counted + '_distance.bed'))

    return '\n'.join(lines) + '\n\n'


# Write one command block per aligned sample.  The context manager closes the
# script even if a sample fails, and sorting makes the output reproducible.
with open("TFBS_nuc_script.sh", 'w') as bash_code:
    for r1 in sorted(glob(jp(rawFolder, "*bedOut.bed"))):
        # NOTE(review): the sample name is cut at the first '.', so a file
        # name containing extra dots maps to a different '<sample>.bed'
        # input -- confirm that input names never contain dots.
        sample = os.path.basename(r1).split('.')[0]
        print(sample)
        bash_code.write(build_sample_script(sample))


