###RNA-seq code

Square brackets [ and ] mark placeholders for file names, paths, or IDs. Replace each placeholder, including the brackets themselves, with the actual value.

Downloading files from public databases:
>wget -r [ftp address of .sra file here]
or, combining downloading with extracting reads:
>fastq-dump [SRR ID here] --split-3;

Genome index building:
>STAR --runThreadN 20 --runMode genomeGenerate --genomeDir /work/04237/lleblanc/Mus_musculus/Ensembl --genomeFastaFiles [path to assembly.fa file here] --sjdbGTFfile [path to gtf file here] --sjdbOverhang 99 --genomeSAsparseD 2 --limitGenomeGenerateRAM 60000000000

Merging reads from a single experiment distributed across multiple fastq files:
>cat *.fq > merged.fastq

Aligning reads to the genome:

>/work/04237/lleblanc/STAR-STAR_2.4.0f1/bin/Linux_x86_64_static/STAR --runThreadN 24 --genomeDir [path to genome directory here] --readFilesIn [input .fastq file] --outFileNamePrefix [output file name]

Determining RPKM:
>python rpkmforgenes.py -o [output file name] -a [path to refGene.txt file] --fulltranscript -nocollapse -rmnameoverlap -samu -i [input file name]

rpkmforgenes.py is from http://sandberg.cmb.ki.se/rnaseq/

#!/usr/bin/python
"""
Calculates gene expression from a read mapping file
"""

lastmodified = "7 Feb 2014"
author = "Daniel Ramskold"
#4 june 2010: now assumes sam and gff files use 1-based coordinates, not 0-based, -exonnorm is default
#6 june 2010: partially rewrote code for overlapping exons from different isoforms, swapped ID and symbol fields for some annotation types
#10 june 2010: corrected gff end coordinates
#17 june 2010: removed exons of other isoforms from -intronsinstead, normalise by only mRNA by default
#28 june 2010: bed12 format as annotation, -flat option for collapsing before RPKM calculation, put strand in unused gene name column if -strand
#3 july 2010: changed output header to give number of reads normalised by regardless of normalisation option
#8 july 2010: -bedann for 3 columns gives 1-based chrX:start:end but no change for -bed3ann, -bothends map end-1 position instead of end, added -n option, more stringent check (strand, length) to filter with -diffreads
#28 july 2010: added -unique option (removal of multimapping reads)
#6 aug 2010: -u option can call bigWigSummary on bigWig files
#11 aug 2010: rewrote parsing of sam files to make -bothends and -diffreads work better, removed the need for -unique option
#19 aug 2010: changed gene/ID fields with -bedann
#23 aug 2010: changed output of 0 rpkm to 0.0 rpkm
#10 sep 2010: indexing exons instead of reads
#17 nov 2010: reenabled old sam format input function (now with -samse), added bam file support
#9 dec 2010: allowed -u to read from files with FASTA header and fixed line wrap
#5 jan 2011: looks for .bigWig file suffix, fixed reporting for -readcount -nocollapse
#20 jan 2011: changed GTF format ID to transcript ID
#24 jan 2011: added -randomreads
#9 feb 2011: speed optimizations for -randomreads, fixed error reporting at 0 total reads
#7 mars 2011: debugging of isoform deconvolution
#12 apr 2011: added -namecollapse option
#3 may 2011: added -dynuniq
#8 aug 2011: let -dynuniq/-ulen be followed by a read length value
#31 aug 2011: ignores lines starting with # in annotation file
#26 sep 2011: changed output sorting, -sortpos for old sorting
#27 sep 2011: added -rmnameoverlap
#30 sep 2011: improved read length auto-detection for -ulen, added -table
#4 oct 2011: altered -rmnameoverlap
#19 oct 2011: added -quite, -p, made auto-detect for -ulen handle multiple read lengths
#20 oct 2011: -ulen not needed to run with Helena's files
#3 nov 2011: added detection of file suffix for -ulen
#4 nov 2011: fixed bug with -p combined with -forcedtotal
#29 nov 2011: -exportann
#24 apr 2012: debugging of isoform deconvolution
#30 may 2012: added -bamse flag, also works for sam files
#8 jun 2012: renamed -bamse to -bamu, fixed a crash with unknown query sequence in bam files
#25 jun 2012: paired-end support with -bamu, -samse, made -bamu default for bam/sam, made -fulltranscript default instead of -no3utr, added last modified date to #arguments line
#27 jul 2012: added -minqual
#17 aug 2012: changed -readcount to not counting reads for completely non-unique exons
#20 aug 2012: -forcetotal changes
#22 aug 2012: added -maxNM, -exportann now allowed for when using multiple input files
#11-13 sep 2012: .unique20-255.btxt suffix added for -u
#19 sep 2012: catch runtime warnings from matrix determinant
#21 sep 2012: more fixes for rare much-too-large rpkm values
#19 oct 2012: made -strand work with -u
#26 oct 2012: added -bothendsceil
#5 Nov 2012: some more error reporting, made * a valid CIGAR string for SAM files
#6 Nov 2012: considers reads with empty CIGAR in sam and bam files unmapped
#15 Jan 2013: debugged -bothends for -samu/-bamu, added -midread
#6 Feb 2013: bug fix for paired-end bam/sam that caused it to ignore all reads
#8 Feb 2013: added -ensgtfann and -norandom
#12 Feb 2013: swapped the strand of the 2nd read for paired-end sequencing
#11 Apr 2013: added -readpresent (from Ramu Chenna)
#3 May 2013: added -rmregions
#7 May 2013: bug fix for -rmregions used with -intronsinstead/-rmnameoverlap
#5 Aug 2013: fixed a bug for -bamu and -mapends where it double-counted paired-end reads
#7 Feb 2014: Added early detection for -p and -table clash
#13 Nov 2014: -addchr

from numpy import matrix, linalg
import numpy, sys, time, os, subprocess, math
from collections import defaultdict

# Window size constant; not referenced in the part of the file reviewed here.
WINDOWSIZE = 5000

# Location of the uniqueness data used by Cexon: joined with "<chromosome>.fa" or
# "<chromosome><suffix>" as a directory, or passed to bigWigSummary as a file.
uniqueposdir = "none"

# Never executed: the print statement below only exists so that Python 3 fails
# at parse time with a SyntaxError pointing at a helpful message.
if False: # check if print statement works, SyntaxError if it doesn't
	print 'This program does not work under python 3, run it in python 2.5/2.6/2.7'

class Cexon:
	def __init__(self, start, end, normalise_with):
		self.start = start
		self.end = end
		self.reads = 0
		self.transcripts = []
		self.readspersample = []
		self.forbidden = False # only for normalisation, hide from expression calculation and output
		self.normalise_with = normalise_with # include in exon normalisation
		self.length = self.end - self.start
	
	def calclength(self, chromosome, readlength_dict, filesuffix):
		if ONLYUNIQUEPOS:
			if USESTRANDINFO:
				chromosome = chromosome[:-1]
			if uniquenessfiletype == 'fasta':
				self.setuniquelength(os.path.join(uniqueposdir, chromosome + ".fa"))
			elif bigWigSummary_path:
				self.uniquelengthfromBigWig(chromosome)
			else:
				self.setuniquelength_H(os.path.join(uniqueposdir, chromosome + filesuffix), readlength_dict)
		else:
			self.length = self.end - self.start

	def exonstring(self):
		outstr = "(" + str(self.start) + "," + str(self.end)
		for tx in self.transcripts:
			outstr += "," + tx.ID
		outstr += ") " + str(self.reads) + " " + str(self.length)
		return outstr
	
	def setuniquelength_H(self, chromosomefile, readlength_dict):
		try:
			cfileh = openfile(chromosomefile, 'rb')
		except:
			global warnedchromosomes
			if not chromosomefile in warnedchromosomes:
				warnedchromosomes.append(chromosomefile)
				if vocal: print "Warning: Did not find", chromosomefile
			self.length = self.end - self.start
			return
		self.length = 0
		for readlength, weight in readlength_dict.items():
			cfileh.seek(self.start, 0)
			sequence = cfileh.read(self.end-self.start)
			self.length += sum(l != '\0' and ord(l) <= readlength for l in sequence) * weight
	
	def setuniquelength(self, chromosomefile):
		try:
			cfileh = openfile(chromosomefile)
		except:
			global warnedchromosomes
			if not chromosomefile in warnedchromosomes:
				warnedchromosomes.append(chromosomefile)
				print "Warning: Did not find", chromosomefile # if a SyntaxError is here, check the python version, use version 2.5/2.6/2.7
			self.length = self.end - self.start
			return
		
		global chromosomefile_infodict
		try: chromosomefile_infodict
		except: chromosomefile_infodict = {}
		try: offset, linelength, seqlength = chromosomefile_infodict[chromosomefile]
		except:
			line1 = cfileh.readline(1000)
			if len(line1) < 1000 and line1[0] == '>': offset = len(line1)
			else:
				cfileh.seek(0)
				offset = 0
			line2 = cfileh.readline(1000)
			if len(line2) < 1000:
				linelength = len(line2)
				seqlength = len(line2.rstrip())
			else:
				linelength = 0
				seqlength = 0
			chromosomefile_infodict[chromosomefile] = offset, linelength, seqlength
		if linelength == 0:
			startfilepos = self.start + offset
			endfilepos = self.end + offset
		else:
			startfilepos = offset + (self.start // seqlength)*linelength + (self.start % seqlength)
			endfilepos = offset + (self.end // seqlength)*linelength + (self.end % seqlength)	
		cfileh.seek(startfilepos, 0)
		sequence = cfileh.read(endfilepos-startfilepos)
		self.length = sequence.count('A')+sequence.count('C')+sequence.count('G')+sequence.count('T')	# upper-case means unique
	
	def uniquelengthfromBigWig(self, chromosome):
		try:
			proc = subprocess.Popen([bigWigSummary_path, uniqueposdir, chromosome, str(self.start), str(self.end), '1'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
		except KeyboardInterrupt:
			raise
		except:
			global bigwig_norun
			try: bigwig_norun
			except:
				if vocal: print 'Error: Failed running bigWigSummary, required by the -u option for bigWig files. If it was\'t aborted by the user, perhaps its permission to execute has not been set (\'chmod +x '+ bigWigSummary_path + '\' on Unix-like systems)'
				sys.exit(1)
				bigwig_norun = 1
			self.length = self.end - self.start
			return
		ret, err = proc.communicate()
		if proc.returncode:
			if err.startswith('no data'):
				self.length = 0
			else:
				if vocal:
					print "Warning: bigWigSummary returned an error:"
					print err
				self.length = self.end - self.start
		else:
			self.length = float(ret)*(self.end - self.start)
			

class Ctranscript:
	"""One annotated transcript together with its per-sample results."""
	def __init__(self, ID, sortnum):
		# identity and position in the output ordering
		self.ID = ID
		self.sortnum = sortnum
		self.genename = "?"
		# filled in later by the caller
		self.overlaptx = None
		self.expression = []
		self.reads = []

class Cgene:
	"""A named gene holding its transcripts and exons."""
	def __init__(self, name):
		self.name = name
		self.chromosome = ""
		self.overlapsets = None
		self.exons = []
		self.transcripts = []

	def exonstring(self):
		"""Returns the gene name followed by a tab-prefixed exonstring() per exon."""
		pieces = ["\t" + exon.exonstring() for exon in self.exons]
		if not pieces:
			return self.name
		return self.name + "".join(pieces)

class Cunit:
	"""Output unit used after genes have been collapsed."""
	def __init__(self, sortnum):
		self.sortnum = sortnum
		# the two name columns default to "."
		self.name1 = self.name2 = "."
		self.reads = []
		self.rpkms = []

class Cregion:
	"""
	A half-open interval [start, end) on a chromosome+strand, with a
	class-wide window index for fast overlap and nearest-region queries.

	Regions only take part in queries after addtowindows() has been called.
	"""
	allchromosomes = {} # maps chromosome+strand to an integer index
	indexdict = {} # inverse of allchromosomes
	allwindows = [] # per chromosome index: list of windows, each a list of regions
	WL = 3000 # window length in bases
	
	def __init__(self, chromosome, start, end=None, strand='?'):
		"""Creates a region; end defaults to start. Registers chromosome+strand if new."""
		self.start = start
		if end is None: self.end = start
		else: self.end = end
		try: self.chrindex = Cregion.allchromosomes[chromosome+strand]
		except KeyError:
			self.chrindex = len(Cregion.allchromosomes)
			Cregion.allchromosomes[chromosome+strand] = self.chrindex
			Cregion.indexdict[self.chrindex] = chromosome+strand
			Cregion.allwindows.append([])
	
	def addtowindows(self):
		# adds instance to Cregion.allwindows, in every window it touches
		wchr = Cregion.allwindows[self.chrindex]
		if len(wchr) <= self.end//Cregion.WL: wchr.extend([[] for i in range(1+self.end//Cregion.WL-len(wchr))])
		for wi in range(self.start//Cregion.WL, self.end//Cregion.WL+1):
			wchr[wi].append(self)
		
	def getwindow(self):
		# returns list of Cregion instances which could overlap
		wchr = Cregion.allwindows[self.chrindex]
		s = min(len(wchr), self.start//Cregion.WL)
		e = min(len(wchr), self.end//Cregion.WL+1)
		return list(set([v for l in wchr[s:e] for v in l])) # flattens wchr[s:e], removes duplicates
		
	def overlaps(self, other):
		"""True if the two half-open intervals share at least one position (chromosome is not compared)."""
		return self.start <= other.start < self.end or other.start <= self.start < other.end
	
	def overlapping(self):
		"""Returns the indexed regions on the same chromosome+strand that overlap this one."""
		return [r for r in self.getwindow() if r.overlaps(self)]
	
	def getchromosome(self):
		"""Returns the chromosome name without the one-character strand suffix."""
		return Cregion.indexdict[self.chrindex][:-1]
	
	def startingwithin(self):
		# returns list of Cregion instances whose start coordinate is within the region
		return [r for r in self.getwindow() if self.start <= r.start < self.end]
	
	def getstrand(self):
		"""Returns the strand character; raises Exception if the region was created without one."""
		strand = Cregion.indexdict[self.chrindex][-1]
		if strand == "?": raise Exception("No strand given")
		return strand
	
	def __repr__(self):
		return self.name(1, 0)
	
	def name(self, start_add=0, end_add=0):
		"""Returns "chr:start-end" or, if a strand is known, "chr:start-end:strand"; the offsets are added to the coordinates."""
		try:
			strand = self.getstrand()
		except:
			return self.getchromosome()+":"+str(self.start+start_add)+"-"+str(self.end+end_add)
		else:
			return self.getchromosome()+":"+str(self.start+start_add)+"-"+str(self.end+end_add)+":"+strand

	@staticmethod
	def clearwindows(new_windowsize=None):
		"""Empties the window index (known chromosomes are kept) and optionally changes the window length."""
		if new_windowsize is not None: Cregion.WL = new_windowsize
		Cregion.allwindows = [[] for c in Cregion.allchromosomes]
		
	@staticmethod
	def overlappingpoint(chromosome, pos, strand='?'):
		"""Returns the indexed regions containing pos; [] for an unknown chromosome or a position outside the index."""
		try:
			wchr = Cregion.allwindows[Cregion.allchromosomes[chromosome+strand]]
		except KeyError:
			return []
		s = pos//Cregion.WL
		try:
			return [r for r in wchr[s] if r.start <= pos < r.end]
		except IndexError:
			return []
	
	@staticmethod
	def closesttopoint(chromosome, pos, strand='?', mindist=0, maxdist=1e30, check_forward=True, check_backward=True):
		"""
		Returns the indexed region(s) nearest to pos whose distance d satisfies
		mindist <= d < maxdist; [] if there is none or the chromosome is unknown.

		Windows are scanned outwards from the one containing pos, backward
		and/or forward as requested. After the first window with a hit, one
		more step is scanned before stopping.
		"""
		try:
			wchr = Cregion.allwindows[Cregion.allchromosomes[chromosome+strand]]
		except KeyError:
			return []
		s = pos//Cregion.WL
		nwindows = len(wchr)
		windist = 0
		candidates = set()
		
		def closesttopoint_distance(r):
			if r.start <= pos <= r.end: return 0
			return min(abs(r.start-pos), abs(r.end-pos))
		
		while windist < 5+maxdist/Cregion.WL:
			back_index = s-windist
			forward_index = s+windist
			# Stop once no requested direction can reach another window; without this
			# the default maxdist makes the loop effectively endless when nothing is found.
			if (not check_backward or back_index < 0) and (not check_forward or forward_index >= nwindows):
				break
			new_candidates = set()
			# Explicit range checks: a negative list index would silently wrap
			# around to the windows at the far end of the chromosome.
			if check_backward and 0 <= back_index < nwindows:
				new_candidates |= set(wchr[back_index])
			if check_forward and 0 <= forward_index < nwindows:
				new_candidates |= set(wchr[forward_index])
			new_candidates = set(r for r in new_candidates if mindist <= closesttopoint_distance(r) < maxdist)
			if candidates:
				candidates |= new_candidates
				break
			candidates |= new_candidates
			windist += 1
		
		if not candidates: return []
		closest_dist = min(closesttopoint_distance(r) for r in candidates)
		return [r for r in candidates if closesttopoint_distance(r) == closest_dist]

def split_by_rmregions(exon, gene):
	"""
	Cuts the indexed Cregion instances that overlap the exon out of it.

	Returns the list of remaining pieces: the (shrunk) original exon and
	new Cexon objects for the parts to the right of each removed region.
	Pieces of zero or negative length are dropped.
	"""
	if exon.start >= exon.end:
		return []
	hits = Cregion(gene.chromosome, exon.start, exon.end).overlapping()
	if not hits:
		return [exon]
	cut = hits[0]
	# part to the right of the removed region inherits the exon's properties
	tail = Cexon(cut.end, exon.end, exon.normalise_with)
	tail.transcripts = exon.transcripts
	tail.forbidden = exon.forbidden
	# the original exon keeps the part to the left
	exon.end = cut.start
	left_pieces = split_by_rmregions(exon, gene)
	right_pieces = split_by_rmregions(tail, gene)
	return left_pieces + right_pieces

def contractarrays(Ll, Rl):
	"""
	Merges, for every row in turn, all later rows of Ll that have the same
	zero/non-zero pattern (and the matching rows of Rl) into that row.
	Returns (Ll, Rl); both lists are modified in place.
	"""
	index = 0
	# Rl shrinks while rows are merged, so its length is re-read every pass
	while index < len(Rl):
		Ll, Rl = contractarrays_inner(Ll[index], Ll, Rl, index)
		index += 1
	return (Ll, Rl)

def contractarrays_inner(zeropattern, Ll, Rl, firstmatchingrow):
	"""
	Adds every row after firstmatchingrow whose zero/non-zero pattern equals
	zeropattern onto row firstmatchingrow (Ll element-wise, Rl first column)
	and deletes the merged row. Returns (Ll, Rl), modified in place.
	"""
	width = len(zeropattern)
	target_L = Ll[firstmatchingrow]
	target_R = Rl[firstmatchingrow]
	idx = firstmatchingrow + 1
	while idx < len(Ll):
		candidate = Ll[idx]
		# a list (not a generator) so every position is looked at, like the original loop
		same_pattern = all([bool(zeropattern[col]) == bool(candidate[col]) for col in range(width)])
		if not same_pattern:
			idx += 1
			continue
		for col in range(width):
			target_L[col] += candidate[col]
		target_R[0] += Rl[idx][0]
		del Ll[idx]
		del Rl[idx]
	return (Ll, Rl)
		
def rowincludes(inclusiverow, includedrow):
	"""Returns 1 if every non-zero position of includedrow is also non-zero in inclusiverow, else 0."""
	for position, present in enumerate(inclusiverow):
		if includedrow[position] and not present:
			return 0
	return 1
	
def elementsinrow(row):
	"""Returns how many elements of row are non-zero (truthy)."""
	return sum(1 for position in range(len(row)) if row[position])
			
def removecommonrow(Ll, Rl):
	"""
	Deletes every row of Ll that has no zero element, together with the
	row of Rl at the same index. Returns (Ll, Rl), modified in place.
	"""
	idx = 0
	while idx < len(Ll):
		if all(Ll[idx]):
			del Ll[idx]
			del Rl[idx]
			continue
		idx += 1
	return (Ll, Rl)
				
def removecolumn(Ll, column):
	"""Deletes the given column from every row of Ll in place and returns Ll."""
	for row in Ll:
		del row[column]
	return Ll

def removezeros(Ll, Rl):
	"""
	Deletes every row of Ll that consists only of zeros, together with the
	row of Rl at the same index. Returns (Ll, Rl), modified in place.
	"""
	idx = 0
	while idx < len(Ll):
		if any(Ll[idx]):
			idx += 1
			continue
		del Ll[idx]
		del Rl[idx]
	return (Ll, Rl)

def gtf_field(p, tags):
	"""
	Looks up an attribute value in the last column of a split GTF line.

	tags are tried in order; the first attribute starting with a tag wins
	and its value between the double quotes is returned. Returns '.' if no
	tag matches.
	"""
	attributes = []
	for chunk in p[-1].split(';'):
		attributes.append(chunk.strip())
	for tag in tags:
		for attribute in attributes:
			if not attribute.startswith(tag):
				continue
			after_opening_quote = attribute.split(' "')[-1]
			return after_opening_quote.split('"')[0]
	return '.'

def fromannotationline(line, l_annotationtype=None):
	if l_annotationtype is None: l_annotationtype = annotationtype
	inferred_strand = False
	if l_annotationtype == 0:
		# from refGene.txt
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(f) for f in p[9].split(",")[:-1]]	# start positions for exons for the gene
		exonends = [int(f) for f in p[10].split(",")[:-1]]
		ID = p[1]
		chromosome = p[2]
		genename = p[12]
		strand = p[3]
		cdsstart = min(int(p[7]), int(p[6]))
		cdsend = max(int(p[7]), int(p[6]))
	elif l_annotationtype == 1:
		# from knownGene.txt
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(f) for f in p[8].split(",")[:-1]]
		exonends = [int(f) for f in p[9].split(",")[:-1]]
		ID = p[11]
		chromosome = p[1]
		genename = p[0]
		strand = p[2]
		cdsstart = min(int(p[5]), int(p[6]))
		cdsend = max(int(p[5]), int(p[6]))
	elif l_annotationtype == 2:
		# from ensGene.txt or sibGene.txt
		p = line.rstrip('\r\n').split("\t")
		ID = p[1]
		strand = p[3]
		chromosome = p[2]
		genename = p[12]
		cdsstart = min(int(p[7]), int(p[6]))
		cdsend = max(int(p[7]), int(p[6]))
		exonstarts = [int(f) for f in p[9].split(",")[:-1]]
		exonends = [int(f) for f in p[10].split(",")[:-1]]
	elif l_annotationtype == 3:
		# 3 column bed file
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[1])]
		exonends = [int(p[2])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		genename = None
		ID = p[0]+":"+p[1]+"-"+p[2]
		strand = "+"
		inferred_strand = True
	elif l_annotationtype == 4:
		# 6 column bed file
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[1])]
		exonends = [int(p[2])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		genename = None
		ID = p[3]
		if len(p) > 5: strand = p[5]
		else:
			strand = "+"
			inferred_strand = True
	elif l_annotationtype == 5:
		# gtf file format
		p = line.rstrip('\r\n').split("\t")
		exonstarts = [int(p[3])-1]
		exonends = [int(p[4])]
		cdsstart = 0
		cdsend = 0
		chromosome = p[0]
		transInfo = p[8].split(';')
		transIDs = transInfo[1].lstrip('transcript_id ').strip('"').replace(' ', '').split(',')
		ID = '+'.join(transIDs)
		genename = p[0]+":"+p[3]+"-"+p[4]
		strand = p[6]
		if strand == '.': strand = '+'
	elif l_annotationtype == 6:
		# up to 12 column bed file
		p = line.rstrip('\r\n').split("\t")
		chromosome = p[0]
		absstart = int(p[1])
		try: absend = int(p[2])
		except: absend = absstart
		try: strand = p[5]
		except:
			strand = "+"
			inferred_strand = True
		try:	cdsstart = int(p[6]); cdsend = int(p[7])
		except: cdsstart = 0; cdsend = 0
		try: blocksizes = map(int, p[10].split(',')) ; blockstarts = map(int, p[11].split(','))
		except: blocksizes = [absend - absstart]; blockstarts = [0]
		exonstarts = [absstart + rs for rs in blockstarts]
		exonends = [es + size for es,size in zip(exonstarts, blocksizes)]
		try: genename = p[3]
		except: 
			genename = None
		ID = str(chromosome)+":"+str(min(exonstarts)+1)+"-"+str(max(exonends)+1)
		if len(blockstarts) != len(blocksizes) or max(exonends) != absend:
			if vocal: print "Warning: Block structure for " + ID + " is malformed"
	elif l_annotationtype == 7:
		# ensembl gtf
		exonstarts = []
		exonends = []
		cdspoints = []
		for exonline, chrom, counter in line:
			p = exonline.rstrip('\r\n').split('#')[0].split("\t")
			pos1 = int(p[3])-1
			pos2 = int(p[4])-1
			chromosome = chrom
			ID = gtf_field(p, ['transcript_name', 'transcript_id'])
			genename = gtf_field(p, ['gene_name', 'gene_id'])
			strand = p[6]
			exontype = p[2]
			if exontype in ('start_codon', 'stop_codon'):
				if strand == '+':
					cdspoints.append(pos1)
				else:
					cdspoints.append(pos2)
			else:
				exonstarts.append(pos1)
				exonends.append(pos2)
			if exontype == 'CDS':
				cdspoints.extend([pos1,pos2])
		if cdspoints:
			cdsstart = min(cdspoints)
			cdsend = max(cdspoints)
		else:
			cdsstart = 0
			cdsend = 0
		
	if USESTRANDINFO:
		if genename is None: genename = strand
		if swapstrands:
			if strand == "+": chromosome += "-"
			else: chromosome += "+"
		else: 
			chromosome += strand	# so e.g. "chr1+" is different from "chr1-"
	else:
		if genename is None: genename = '.'
		inferred_strand = False
	return (chromosome, strand, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand)
		
class Cline:
	"""
	One annotation entry reduced to what is needed for sorting: chromosome
	index, overall start/end, strand and the original input order.
	"""
	def __init__(self, line, counter):
		global allchromosomes, allchromosomes_dict
		
		self.line = line
		self.sortnum = counter
		parsed = fromannotationline(line)
		chromosome, direction = parsed[0], parsed[1]
		exonstarts, exonends = parsed[4], parsed[5]
		self.strand = direction
		self.start = min(exonstarts)
		self.end = max(exonends)
		# chromosomes are stored as integer indices, assigned on first sight
		try: self.chromosome = allchromosomes_dict[chromosome]
		except:
			allchromosomes.append(chromosome)
			self.chromosome = allchromosomes.index(chromosome)
			allchromosomes_dict[chromosome] = self.chromosome
		
	def __cmp__(self, other):
		"""Orders by chromosome index, then by position; overlapping entries are ordered by start."""
		if self.chromosome != other.chromosome:
			return cmp(self.chromosome, other.chromosome)
		elif self.start > other.end:
			return 1
		elif other.start > self.end:
			return -1
		else:
			return cmp(self.start, other.start)

def openfile(filename, mode='r'):
	"""
	Opens filename with the given mode and returns the file object.
	Names ending in .gz are opened with gzip, names ending in .bz2 with bz2,
	everything else with the built-in open().
	"""
	if filename.endswith(".gz"):
		import gzip
		return gzip.open(filename, mode)
	if filename.endswith(".bz2"):
		import bz2
		return bz2.BZ2File(filename, mode)
	return open(filename,mode)

def addread(chromosome, start, end, strand, halfweight=False):
	"""
	Converts a read to (chromosome index, position tuple).

	Returns (-1, None) if the chromosome is not in allchromosomes_dict.
	The tuple is (start, endpos), where endpos is end-1, or 1-end for the
	minus strand; with halfweight two None entries are appended.
	"""
	if addchr: chromosome = 'chr'+chromosome
	try: chrID = allchromosomes_dict[chromosome]
	except:
		return -1, None
	if strand == '-':
		endpos = 1-end
	else:
		endpos = end-1
	if not halfweight:
		return chrID, (start, endpos)
	return chrID, (start, endpos, None, None)

def addread_tuple(chromosome, postuple):
	"""
	Pairs an already built position tuple with the chromosome index.
	Returns (-1, None) if the chromosome is not in allchromosomes_dict.
	"""
	name = 'chr'+chromosome if addchr else chromosome
	try:
		return allchromosomes_dict[name], postuple
	except:
		return -1, None

def readsfrombedtabfile(filename, justreadnum, readlength_dict):
	try: 
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		if vocal: print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split("\t")
			if minqual and  int(p[4]) < minqual: continue # too low score
			if readlength_dict is not None:
				if len(p) <= 10:
					readlength_dict[int(p[2])-int(p[1])] += 1
				else:
					readlength_dict[sum(map(int, p[10].split(',')))] += 1
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)

def readsfrombedtabfile_rs(filename, justreadnum, readlength_dict):
	if removemultimappers and vocal: print "Warning: The -unique option was ignored, unsupported for this format"
	try: fileh = openfile(filename)
	except:
		if vocal: print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			p = line.rstrip('\r\n').split("\t")
			if minqual and  int(p[4]) < minqual: continue # too low score
			if readlength_dict is not None and (len(p) <= 3 or not 'jxn' in p[3]):
				readlength_dict[int(p[2])-int(p[1])] += 1
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			for nb_reads in xrange(int(p[4])):
				yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)

def readsfrombedspacefile(filename, justreadnum, readlength_dict):
	try: 
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 3, ignoredprefix='track', sep=' ')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		if vocal: print "Warning: No such file", filename
		return

	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split()
			if minqual and  int(p[4]) < minqual: continue # too low score
			if readlength_dict is not None:
				if len(p) <= 10:
					readlength_dict[int(p[2])-int(p[1])] += 1
				else:
					readlength_dict[sum(map(int, p[10].split(',')))] += 1
			chromosome = p[0]
			try: strand = p[5]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[1]), int(p[2]), strand)
	finally:
		fileh.close()

def readsfromgtffile(filename, justreadnum, readlength_dict):
	if removemultimappers and vocal: print "Warning: The -unique option was ignored, unsupported for this format"
	try: fileh = openfile(filename)
	except:
		if vocal: print "Warning: No such file", filename
		return
	try:
		if not fileh.readline().startswith("track"): fileh.seek(0)
		while 1:
			line = fileh.readline()
			if not line: break
			if justreadnum: yield 1; continue
			p = line.rstrip('\r\n').split("\t")
			if minqual and  int(p[5]) < minqual: continue # too low score
			if readlength_dict is not None:
				readlength_dict[int(p[4])-(int(p[3])-1)] += 1
			chromosome = p[0]
			try: strand = p[6]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[3])-1, int(p[4]), strand)
	finally:
		fileh.close()

def splitcigar(string):
	numarr = ['']
	symbolarr = []
	wasnum = 1
	for letter in string:
		if letter.isdigit():
			if wasnum:
				numarr[-1] += letter
			else:
				numarr.append(letter)
			wasnum = 1
		else:
			if not wasnum:
				symbolarr[-1] += letter
			else:
				symbolarr.append(letter)
			wasnum = 0
	if '' in numarr:
		if vocal: print 'Warning: strange CIGAR field in SAM file:', string
		return [0 if v == '' else int(v) for v in numarr], symbolarr
	return [int(v) for v in numarr], symbolarr

def rowparse(row):
	"""
	Converts a (name, 1-based position, chromosome, flag, length) row to
	(chromosome, 0-based start, inclusive end).

	The end is negated for reverse-strand reads (flag 16). With
	USESTRANDINFO the strand is appended to the chromosome name, swapped
	for reads with flag 0x80.
	"""
	flag = row[3]
	chromosome = row[2]
	start = row[1]-1
	endincl = start + row[4]-1
	is_reverse = flag & 16
	if USESTRANDINFO:
		is_second = flag & 0x80
		if is_reverse:
			chromosome += '+' if is_second else "-"
		else:
			chromosome += '-' if is_second else "+"
	if is_reverse:
		endincl = -endincl
	return chromosome, start, endincl

def process_rowinfo(rowinfo, justreadnum):
	"""
	Generator turning name-sorted alignment rows into reads.

	rowinfo is a list of (name, 1-based position, chromosome, flag, length)
	tuples sorted by name; it is consumed from the end. justreadnum is
	accepted for interface compatibility and not used.
	  - a name occurring once yields addread_tuple(chr, (start, end))
	  - a name occurring twice with flag 0x40/0x80 set is a read pair and
	    yields one addread_tuple(chr, (start_1, end_1, start_2, end_2)),
	    with the read lacking flag 0x80 first
	  - any other repeated name is a multimapper and is dropped
	"""
	def drop_rows_named(name):
		# discards all remaining rows with this read name
		while rowinfo and rowinfo[-1][0] == name:
			rowinfo.pop()

	while rowinfo:
		row = rowinfo.pop()
		if rowinfo and rowinfo[-1][0] == row[0]:	# same name
			nrow = rowinfo[-1]
			if len(rowinfo) >= 2 and rowinfo[-2][0] == row[0]:
				# three or more alignments: multimapper
				drop_rows_named(row[0])
				continue
			if row[3]&0x40 or row[3]&0x80:
				# paired end reads; consume the mate so that it is not
				# reported a second time as a single read
				rowinfo.pop()
				# the flag (row[3]), not the length (row[4]), tells which read is second
				if row[3]&0x80:
					first, second = nrow, row
				else:
					first, second = row, nrow
				chromosome_1, start_1, endincl_1 = rowparse(first)
				chromosome_2, start_2, endincl_2 = rowparse(second)
				yield addread_tuple(chromosome_1, (start_1, endincl_1, start_2, endincl_2))
				continue
			# two alignments of an unpaired read: multimapper
			drop_rows_named(row[0])
			continue
		chromosome_1, start_1, endincl_1 = rowparse(row)
		yield addread_tuple(chromosome_1, (start_1, endincl_1))

def readsfromsam(filename, justreadnum, readlength_dict):
	global saved_rowinfo
	try:
		if justreadnum: raise NameError
		rowinfo, fn = saved_rowinfo
		del saved_rowinfo
		assert fn == filename
	except NameError:
		try: 
			fileh = openfile(filename)
		except IOError:
			if vocal: print "Warning: No such file", filename
			return
		
		has_warned_cigar = False
		rowinfo = []
		for line in fileh:
			try:
				if line.startswith("@"): continue	# header
				p = line.rstrip('\r\n').split("\t")
				bits = int(p[1])
				if bits & 4: continue	# unmapped
				if p[5] == '*': continue # unmapped
				if minqual and int(p[4]) < minqual: continue # too low mapping quality
				if maxNM >= 0 and any(field.startswith('NM:i:') and int(field[5:]) > maxNM for field in p[11:]): continue # too many mismatches/indels
				if readlength_dict is not None:
					readlength_dict[len(p[9])] += 1
				chromosome = p[2]
				name = p[0]
				if p[5] == '*':
					if vocal and not has_warned_cigar: print 'Warning: line in SAM file has empty (*) CIGAR field but is not flagged as unmapped:', line.rstrip('\r\n')
					length = 1
					has_warned_cigar = True
				else:
					length = sum([l for l,s in zip(*splitcigar(p[5])) if s in ["M","D","X","=","N"]])
				rowinfo.append((name, int(p[3]), chromosome, bits, length))
			except:
				if vocal: print 'Error: could not recognize/parse this line as SAM format:', line.rstrip('\r\n')
				raise
		rowinfo.sort()
		import copy
		if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
		fileh.close()
		
	return process_rowinfo(rowinfo, justreadnum)

def pysamopen(filename):
	"""
	Opens a BAM or SAM file with pysam and returns the pysam.Samfile object.
	Exits the program with status 1 if pysam cannot be imported.
	"""
	try:
		import pysam
	except:
		if vocal: print "Error: tried to read BAM file, but missing the pysam module"
		# sys.exit, as elsewhere in this file; the bare exit() only exists when the site module is loaded
		sys.exit(1)
	try:
		return pysam.Samfile(filename, 'rb') # open as bam, gives ValueError for sam file
	except ValueError:
		return pysam.Samfile(filename, 'r') # open as sam

def readsfrombam(filename, justreadnum, readlength_dict):
	global saved_rowinfo
	try:
		if justreadnum: raise NameError
		rowinfo, fn = saved_rowinfo
		del saved_rowinfo
		assert fn == filename
	except NameError:
		try: 
			fileh = pysamopen(filename)
		except IOError:
			if vocal: print "Warning: No such file", filename
			return
		
		rowinfo = []
		for read in fileh:
			if read.flag & 4: continue	# unmapped
			if read.cigar is None: continue #unmapped, http://seqanswers.com/forums/showthread.php?t=3551
			if minqual and read.mapq < minqual: continue # too low mapping quality
			if maxNM >= 0 and any((key=='NM' and val > maxNM) for key, val in read.tags): continue # too many mismatches/indels
			try:
				chromosome = fileh.getrname(read.rname)
			except ValueError:
				# probably the result of mapping to chrUn_random, chr9_random etc but then not mentioning theses reference sequences in the header
				continue
			if readlength_dict is not None:
				readlength_dict[read.rlen] += 1
			name = read.qname
			length = sum(l for o,l in read.cigar if o in [0,2,3]) #0=M,1=I,2=D,3=N
			rowinfo.append((name, read.pos+1, chromosome, read.flag, length))
		rowinfo.sort()
		import copy
		if justreadnum: saved_rowinfo = copy.copy(rowinfo), filename
		fileh.close()
		
	return process_rowinfo(rowinfo, justreadnum)

def readsfrombam_se(filename, justreadnum, readlength_dict):
	if removemultimappers and vocal: print "Warning: The -unique option was ignored, unsupported for this format"
	
	try: 
		fileh = pysamopen(filename)
	except IOError:
		if vocal: print "Warning: No such file", filename
		return
	
	try:
		for read in fileh:
			if read.flag & 4: continue	# unmapped
			if read.cigar is None: continue #unmapped
			if minqual and read.mapq < minqual: continue # too low mapping quality
			if maxNM >= 0 and any((key=='NM' and val > maxNM) for key, val in read.tags): continue # too many mismatches/indels
			if (not mapends) and (read.flag & 0x80) and (read.flag & 0x2):
				# is last segment and the other segment is unmapped
				continue
			try:
				chromosome = fileh.getrname(read.rname)
			except ValueError:
				# probably the result of mapping to chrUn_random, chr9_random etc but then not mentioning theses reference sequences in the header
				continue
			if readlength_dict is not None:
				readlength_dict[read.rlen] += 1
			length = sum(l for o,l in read.cigar if o in [0,2,3]) #0=M,1=I,2=D,3=N
			strand = '-' if read.flag & 16 else '+'
			if USESTRANDINFO:
				if mapends and read.flag & 0x80:
					# swap strand for the 2nd read for paired-end sequencing
					chromosome += '-' if strand == '+' else '+'
				else:
					chromosome += strand
			start = read.pos
			yield addread(chromosome, start, start+length, strand, mapends and (read.flag & 2))
	finally:
		fileh.close()
	
	

def readsfromsam_se(filename, justreadnum, readlength_dict):
	try: 
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 0, ignoredprefix='@', sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		if vocal: print "Warning: No such file", filename
		return
	
	has_warned_cigar = False
	line = None
	try:
		while 1:
			line = fileh.readline()
			if not line: break
			if line.startswith("@"): continue	# header
			p = line.rstrip('\r\n').split("\t")
			read_flag = int(p[1])
			if read_flag & 4: continue	# unmapped
			if p[5] == '*':
				if vocal and not has_warned_cigar: print 'Warning: line in SAM file has empty (*) CIGAR field, will be considered as unmapped:', line.rstrip('\r\n')
				length = 1
				has_warned_cigar = True
				continue # unmapped
			if minqual and int(p[4]) < minqual: continue # too low mapping quality
			if maxNM >= 0 and any(field.startswith('NM:i:') and int(field[5:]) > maxNM for field in p[11:]): continue # too many mismatches/indels
			if (not mapends) and read_flag & 0x80 and read_flag & 2:
				# is last segment and the other segment is mapped
				continue
			if readlength_dict is not None:
				readlength_dict[len(p[9])] += 1
			if justreadnum: yield 1; continue
			chromosome = p[2]
			if int(p[1]) & 16: strand = "-"
			else: strand = "+"
			if USESTRANDINFO:
				if mapends and read_flag & 0x80:
					# swap strand for the 2nd read for paired-end sequencing
					chromosome += '-' if strand == '+' else '+'
				else:
					chromosome += strand
			
			cigarints, cigarsymbols = splitcigar(p[5])
			length = sum([l for l,s in zip(cigarints, cigarsymbols) if s in ["M","D","X","=","N"]])
			
			start = int(p[3])-1
			yield addread(chromosome, start, start+length, strand, mapends and (read_flag & 0x2))
	except:
		if line is not None:
			if vocal: print 'Error: could not recognize/parse this line as SAM format:', line.rstrip('\r\n')
		raise
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)
	return

def readsfrombowtieoutput(filename, justreadnum, readlength_dict):
	try: 
		if removemultimappers:
			tmpfile = sorttotmpfile(filename, 0, sep='\t')
			fileh = open(tmpfile,'r')
		else:
			fileh = openfile(filename)
	except IOError:
		if vocal: print "Warning: No such file", filename
		return
	try:
		while 1:
			line = fileh.readline()
			if not line: break
			p = line.rstrip('\r\n').split("\t")
			if readlength_dict is not None:
				readlength_dict[len(p[4])] += 1
			chromosome = p[2]
			try: strand = p[1]
			except: strand = '.'
			if USESTRANDINFO: chromosome += strand
			yield addread(chromosome, int(p[3]), int(p[3])+len(p[4]), strand)
	finally:
		fileh.close()
		if removemultimappers:
			os.remove(tmpfile)
	return 

def sorttotmpfile(filename, idindex, ignoredprefix='', sep='\t'):
	# sort file, remove duplicates, export to temporary file
	# returned file needs later removal with os.remove()
	import tempfile
	outf = tempfile.mkstemp(suffix='_rpkmforgenes')[1]
	try:
		if vocal: print "Sorting to ", outf
		infh = openfile(filename, 'r')
		lines = []
		for line in infh:
			if ignoredprefix and line.startswith(ignoredprefix): continue
			p = line.rstrip('\r\n').split('\t')
			if ignoredprefix=='@' and int(p[2])&0x80: continue	# remove 2nd in paired read
			if len(p) <= idindex:
				lines.append(('', line))
			else:
				lines.append((p[idindex],line))
		infh.close()
		lines.sort()
		outfh = open(outf, 'w')
		for i in range(len(lines)):
			this = lines[i][0]
			if i > 0:
				neighbour = lines[i-1][0]
				if neighbour and neighbour == this: continue
			try:
				neighbour = lines[i+1][0]
			except IndexError: pass
			else:
				if neighbour and neighbour == this: continue
			outfh.write(lines[i][1])
		outfh.close()
	except:
		try: os.remove(outf)
		except: pass
		raise
	return outf

def mappos(chrindex, readpos):
	# Exons (from the global allwindows index) that contain position readpos on chromosome number chrindex.
	# Returns an empty list if there is no window for that chromosome/position.
	try:
		candidates = allwindows[chrindex][readpos//WINDOWSIZE]
	except IndexError:
		return []
	hits = []
	for exon in candidates:
		# start is inclusive, end is exclusive
		if exon.start <= readpos < exon.end:
			hits.append(exon)
	return hits
	
def make_overlapsets(currentgene, genecollapse, limitcollapse):
	currentgene.overlapsets = []
	currentgene_exons = [exon for exon in currentgene.exons if not exon.forbidden]
	# generate the sets of overlapping transcripts
	# firstly see which ones are connected by overlap
	for fromtx in currentgene.transcripts:	# fromtx = transcript others are connected from, the one to add to
		fromtx.overlaptx = [fromtx]
		for exon in currentgene_exons:
			if fromtx in exon.transcripts:
				for totx in exon.transcripts:	# totx = transcript others are connected to, the one to be added
					if not totx in fromtx.overlaptx:
						if genecollapse and fromtx.genename != totx.genename: continue
						fromtx.overlaptx.append(totx)
	# secondly follow the overlap to see indirect overlap
	donetx = []
	for fromtx in currentgene.transcripts:
		if fromtx in donetx: continue
		currentoverlaptx = []
		for overtx in fromtx.overlaptx:
			currentoverlaptx.append(overtx)
			donetx.append(overtx)
		hasadded = 1
		while hasadded:
			hasadded = 0
			for overtx in currentoverlaptx:
				if limitcollapse: break
				for totx in overtx.overlaptx:
					if totx in donetx: continue
					currentoverlaptx.append(totx)
					donetx.append(totx)
					hasadded = 1
		currentgene.overlapsets.append(currentoverlaptx)

	if sum([len(s) for s in currentgene.overlapsets]) != len(currentgene.transcripts):
		if vocal: print "Warning at gene model collapse. Info:", currentgene.name, [tx.ID for tx in currentgene.transcripts], [[tx.ID for tx in s] for s in currentgene.overlapsets]

def merge(o_infiles, o_outfile):
	# Merge several output files of this program (one per child process in -p mode) into o_outfile.
	# o_infiles: list of input paths, each with '#samples', '#allmappedreads' and
	#            '#normalizationreads' (or '#genemappedreads') header lines followed by gene lines:
	#            2 annotation columns, then one expression column per sample, then optionally
	#            one read count column per sample
	# o_outfile: path of the merged file; sample columns are concatenated in the order of o_infiles,
	#            expression values first and read counts after them
	# Raises Exception if the files do not all have the same number of gene lines, or if a gene line
	# is found before any '#samples' header has been seen.
	unequal_msg = 'Unequal number of genes between files (try running without the -p option)'
	
	# parse files
	numgenes = None
	lnumsamples = None
	header = ['#samples', '#allmappedreads', '#normalizationreads', '#arguments']
	header[3] += '\t' + ' '.join(sys.argv) + '\ttime: ' + time.asctime()
	genelines = []
	readlines = []
	for inf in o_infiles:
		lnumgenes = 0
		with open(inf, 'r') as infh:
			for line in infh:
				p = line.rstrip('\r\n').split('\t')
				if p[0] == '#samples':
					header[0] += '\t' + '\t'.join(p[1:])
					lnumsamples = len(p) - 1
				elif p[0] == '#allmappedreads':
					header[1] += '\t' + '\t'.join(p[1:])
				elif p[0] in ('#genemappedreads', '#normalizationreads'):
					header[2] += '\t' + '\t'.join(p[1:])
				elif line[0] != '#':
					if lnumsamples is None:
						# the column layout is unknown without the sample header
						raise Exception('Gene line found before the #samples header in ' + inf)
					if numgenes is None:
						# first file: it defines the gene order and the annotation columns
						genelines.append('\t'.join(p[:2]))
						readlines.append('')
					elif lnumgenes >= numgenes:
						# more genes than the first file had; would otherwise be an IndexError below
						raise Exception(unequal_msg)
					genelines[lnumgenes] += '\t' + '\t'.join(p[2:2+lnumsamples])
					readlines[lnumgenes] += ''.join('\t'+s for s in p[2+lnumsamples:2+2*lnumsamples])
					lnumgenes += 1
		
		if numgenes is None:
			numgenes = lnumgenes
		elif numgenes != lnumgenes:
			raise Exception(unequal_msg)
	
	# write output (with no input files only the header is written)
	with open(o_outfile, 'w') as outfh:
		for line in header:
			outfh.write(line + '\n')
		for geneline, readline in zip(genelines, readlines):
			outfh.write(geneline + readline + '\n')
			
def rmflag(argv, flag):
	# remove flag and list of arguments that follow the flag
	# argv: list of command-line tokens, modified in place
	# flag: the token to remove (first occurrence only); nothing happens if it is absent
	# The arguments of the flag are all tokens after it up to the next token starting with "-".
	if flag not in argv: return
	flagindex = argv.index(flag)
	index = flagindex + 1
	# startswith() instead of [0] so that an empty-string token counts as an argument rather than raising IndexError
	while index < len(argv) and not argv[index].startswith("-"):
		index += 1
	del argv[flagindex:index]

def getarguments(flag):
	# list of arguments that follow the flag
	# Collects the tokens of sys.argv after the first occurrence of flag, up to the next
	# token starting with "-" (or the end of the command line).
	# Raises ValueError if flag is not in sys.argv; callers use that to detect a missing flag.
	ret = []
	for arg in sys.argv[sys.argv.index(flag)+1:]:
		# startswith() instead of [0] so that an empty-string token does not raise IndexError
		if arg.startswith("-"): break
		ret.append(arg)
	return ret
		
def getargument(flag):
	# The single command-line token right after the first occurrence of flag.
	# Raises ValueError if flag is not in sys.argv, IndexError if flag is the last token.
	position = sys.argv.index(flag) + 1
	return sys.argv[position]
	
def testargumentflag(flag):
	# if the flag is among the arguments
	if flag in sys.argv: return 1
	else: return 0

def main():
	global MAXGENES
	global MAXREADS
	global ONLYUNIQUEPOS
	global ONLYTXUNIQUEEXONS
	global INTRONSINSTEAD 
	global NODECONVOLUTION 
	global COLLAPSEGENES 
	global FULLTRANSCRIPTS
	global USESTRANDINFO
	global allchromosomes
	global allchromosomes_dict
	global annotationtype
	global uniqueposdir
	global swapstrands
	global mapends, midread
	global removemultimappers, minqual, maxNM
	global bigWigSummary_path, uniquenessfiletype
	global allwindows
	global vocal, addchr

	# interpret program arguments
	if len(sys.argv) < 2 or testargumentflag("--help") or testargumentflag("-h"):
		print "Non-optional arguments:"
		print " -o followed by output file"
		print " -i followed by list of input files (by default, guesses format from file extension)"
		print " -a followed by annotation file"
		print "Gene model-related options:"
		print " -u followed by a bigWig file, alternatively a directory for files for non-unique positions (lower case for nonunique k-mers (where k is the read length), upper case for unique; filenames are e.g. chr1.fa, can also be chr1_unique20-255.btxt etc"
		print " -no3utr to remove 3'UTRs"
		print " -fulltranscript to not remove 3'UTRs (default)"
		print " -maxlength followed by a distance to cut each transcript from the 3' end, from 5' if negative (never seems to give better values)"
		print " -maxgenes limit how many genes expression is calculated for (for testing purposes)"
		print " -limitcollapse to not consider indirect transcript overlap"
		print " -namecollapse to only consider overlap between isoform with the same gene identifier (shaky)"
		print " -nocollapse to get isoform expressions (shaky)"
		print " -nooverlap to ignore that transcripts can overlap (will count some reads several times)"
		print " -rmnameoverlap to ignore regions shared my multiple genes (seems to work well)"
		print " -rmregions followed by a bed file of regions which should be removed from genes"
		print " -flat to flatten all isoforms to one gene model (likely to give too low RPKM values)"
		print " -txunique to ignore regions shared by multiple gene isoforms"
		print " -onlycoding to ignore noncoding transcripts"
		print " -swapstrands to make reads on + strand map to genes on - and vice versa (and sets -strand)"
		print " -introns gives gene expression from introns rather than exons (also removes exons of other isoforms)"
		print " -keephap to not remove haplotype chromosome (_hap) annotation"
		print " -norandom to remove genes on unplaced contigs"
		print "Annotation file formats:"
		print " -genePred if annotation file uses format of refGene.txt etc (default if cannot guess from file name suffix)"
		print " -bedann tab-separated 0-based bed file, chromosome start end and 9 optional fields"
		print "Input formats:"
		print " -bed tab separated bed file (default if cannot guess from file name suffix)"
		print " -bedcompacted bed file with number of reads in the score column"
		print " -bedspace space separated bed file"
		print " -bowtie the default output format of bowtie"
#		print " -sam SAM format, and to remove non-unique hits" # broken read counting
#		print " -bam BAM format, and to remove non-unique hits" # broken read counting
		print " -samse SAM format, uniquely mapped reads (faster than -sam, , default for SAM))"
		print " -bamu BAM or SAM format, uniquely mapped reads (faster than -bam or -samse, default for BAM)"
		print " -gff GFF file, no groups"
		print "Normalisation options:"
		print " -mRNAnorm to normalize by the number of reads matching mRNA exons (default)"
		print " -exonnorm to normalize by the number of reads matching exons, including ncRNA"
		print " -allmapnorm to normalize by the total number of mapped reads (default if annotation contains no mRNA)"
		print " -forcedtotal followed by a number of reads for each sample to set a constant to normalise by"
		print "Output format options:"
		print " -readcount to add the number of reads to the output"
		print " -table another output format"
		print " -sortpos for output sorted by genome position"
		print " -exportann followed by a filename to write which exons have been used, also prints exon read counts for the last input file"
		print " -readpresent - to suppress zero count entries - Ramu"
		print "Read-related arguments:"
		print " -strand to use strand information of reads"
		print " -bothends to also map the end positions to genes, each end counted as 0.5 (or 0.25 for paired-end reads)"
		print " -bothendsceil to set -bothends but round the read count upward"
		print " -midread to use middle of the read as read position"
		print " -diffreads to count only one read if several have the same position, strand and length (use with -bam or -sam if paired-end; samtools rmdup is generally better)"
		print " -maxreads followed by maximum number of reads to be used"
		print " -randomreads to make -maxreads pick reads at random"
		print " -minqual followed by an integer, to restrict reads to minimum this mapping quality (for sam, bam) or score (for bed, gff), default use all"
		print " -maxNM followed by an integer, to restrict reads to maximum this edit distance (NM flag in sam, bam), default use all"
		print "Other optional arguments:"
		print " -n followed by list of sample names (input file names are otherwise used)"
		print " -p followed by number of files to process in parallel"
		print " -quite to skip progress messages and warnings"
		print " -h to print this message and quit"
		print "Special output values:"
		print " 0 gene has no reads, -1 gene has no exons"
		print " otherwise the output is in reads per kilobase and million mappable reads (or rather FPKM for paired-end reads)"
		print "Output:"
		print " gene symbol -tab- ID -tab- RPKM values [-tab- read count]"
		return 0
	try: outfile = getargument("-o")
	except:
		print "No outfile (-o)"
		return 1
	try: infiles = getarguments("-i")
	except:
		print "No infile (-i)"
		return 1
	try: annotationfile = getargument("-a")
	except:
		print "No annotationfile (-a)"
		return 1
	try:
		names = getarguments("-n")
		if len(names) != len(infiles):
			print "Not the same number of sample names as files with reads"
			return 1
	except: names = [filename.rsplit("/")[-1] for filename in infiles]
	compressionsuffixes = ['gz','bz2']
	vocal = not testargumentflag("-quite")
	if testargumentflag("-forcedtotal"):
		forcedtotal = [int(v) for v in getarguments("-forcedtotal")]
		if len(forcedtotal) != len(infiles):
			print "Error: -forcedtotal has fewer or more numbers than -i has files"
			return 1
	elif testargumentflag("-forcetotal"):
		forcedtotal = [int(v) for v in getarguments("-forcetotal")]
		if len(forcedtotal) != len(infiles):
			print "Error: -forcetotal has fewer or more numbers than -i has files"
			return 1
	else: forcedtotal = 0
	if testargumentflag("-exportann"):
		exportannotation = getargument("-exportann")
	else: exportannotation = 0
	
	if testargumentflag("-readpresent") and testargumentflag('-p') and vocal:
		print 'Warning: -readpresent and -p do not work together'
	if testargumentflag("-readpresent") and testargumentflag('-table'):
		print 'Error: -table and -p do not work together'
		sys.exit(1)
	elif testargumentflag("-readpresent") and len(infiles)>1 and vocal:
		print 'Warning: -readpresent will filter based on the first file (%s)'%names[0]
	if testargumentflag('-p'):
		# multi-processing mode
		# this instance of the program will be master and not perform calculations, only merging
		
		import tempfile, subprocess
		out = []
		prlist = []
		samplenames = names[:]
		inputfiles = infiles[:]
		if forcedtotal:
			ft_copy = forcedtotal[:]
		processes = int(getargument('-p'))
		restargs = sys.argv[:]
		rmflag(restargs, '-p')
		rmflag(restargs, '-n')
		rmflag(restargs, '-i')
		rmflag(restargs, '-o')
		rmflag(restargs, '-forcedtotal')
		rmflag(restargs, '-forcetotal')
		rmflag(restargs, '-exportann')
		try:
			# launch child instances
			for procleft in range(processes, 0, -1):
				numfiles = len(inputfiles) // procleft
				if numfiles == 0: continue
				out.append(tempfile.mkstemp(suffix='_rpkmforgenes_out.txt')[1])
				cmd = [sys.executable] + restargs + ['-o', out[-1], '-i']
				names_child = []
				ft_child = []
				for i in range(numfiles):
					cmd.append(inputfiles.pop(0))
					names_child.append(samplenames.pop(0))
					if forcedtotal: ft_child.append(str(ft_copy.pop(0)))
				cmd += ['-n'] + names_child
				if exportannotation and len(inputfiles) == 0:
					# put -exportann back in for the last batch of files
					cmd += ['-exportann', exportannotation]
				if forcedtotal: cmd += ['-forcedtotal'] + ft_child
				prlist.append(subprocess.Popen(cmd))
		
			for pr in prlist:
				pr.wait()
				if pr.returncode: raise Exception, 'A child process did not finish normally, look somewhere further up to find its error message'
			
			# merge the output
			merge(out, outfile)
			if vocal: print 'Merged to', outfile
		
		finally:
			# delete temporary files
			for tmpfile in out:
				try: os.remove(tmpfile)
				except: pass
		return 0
	
	if testargumentflag("-genePred") or testargumentflag("-refseq"): annotationtype = 0
	elif testargumentflag("-ucsc"): annotationtype = 1
	elif testargumentflag("-ensembl"): annotationtype = 2
	elif testargumentflag("-bedann"): annotationtype = 6
	elif testargumentflag("-bed3ann"): annotationtype = 3
	elif testargumentflag("-bed6ann"): annotationtype = 4
	elif testargumentflag("-gtfann") or testargumentflag("-gffann"): annotationtype = 5
	elif testargumentflag("-ensgtfann"): annotationtype = 7
	else:
		# if no annotation type given, guess from file suffix
		anndict = {'bed':6, 'gff':5, 'gtf':5, 'txt':0}
		try:
			annpre, ending = annotationfile.rsplit('.',1)
			if ending in compressionsuffixes: ending = annpre.rsplit('.',1)[1]
			annotationtype = anndict[ending]
		except:
			annotationtype = 0
	
	if testargumentflag("-diffreads"): usediffreads = 1
	else: usediffreads = 0
	if testargumentflag("-minqual"): minqual = int(getargument("-minqual"))
	else: minqual = 0
	if testargumentflag("-maxNM"): maxNM = int(getargument("-maxNM"))
	else: maxNM = -1
	if testargumentflag("-bed"): infileformat = "bed"
	elif testargumentflag("-bedcompacted"): infileformat = "bedcompacted"
	elif testargumentflag("-bedspace"): infileformat = "bedspace"
	elif testargumentflag("-gff") or testargumentflag("-gtf"): infileformat = "gtf"
	elif testargumentflag("-sam"): infileformat = "sam"
	elif testargumentflag("-bowtie"): infileformat = "bowtie"
	elif testargumentflag("-samse"): infileformat = "samse"
	elif testargumentflag("-samu"): infileformat = "samse"
	elif testargumentflag("-bamu"): infileformat = "bamse"
	elif testargumentflag("-bamse"): infileformat = "bamse"
	elif testargumentflag("-bam"): infileformat = "bam"
	else: infileformat = "guess"
		
	if testargumentflag("-nocollapse"): COLLAPSEGENES = 0
	else: COLLAPSEGENES = 1
	if testargumentflag("-limitcollapse"): limitcollapse = 1
	else: limitcollapse = 0
	genecollapse = testargumentflag("-namecollapse")
	if testargumentflag("-fulltranscript"): FULLTRANSCRIPTS = 1
	elif testargumentflag("-no3utr"): FULLTRANSCRIPTS = 0
	else: FULLTRANSCRIPTS = 1
	bigWigSummary_path = 0
	if testargumentflag("-dynuniq"):
		uniquenessfiletype = 'Helena'
		arr = getarguments('-dynuniq')
		if len(arr) == 0:
			global_readlength = None
		else:
			global_readlength = int(arr[0])
	elif testargumentflag("-ulen"):
		uniquenessfiletype = 'Helena'
		arr = getarguments('-ulen')
		if len(arr) == 0:
			global_readlength = None
		else:
			global_readlength = int(arr[0])
	else:
		uniquenessfiletype = 'fasta'
		global_readlength = 0
	if testargumentflag("-u"):
		uniqueposdir = getargument("-u")
		if uniqueposdir.endswith('.bw') or uniqueposdir.endswith('.bw.gz') or uniqueposdir.endswith('.bw.bz2') or uniqueposdir.endswith('.bigWig') or uniqueposdir.endswith('.bigWig.gz') or uniqueposdir.endswith('.bigWig.bz2'):
			uniquenessfiletype = 'bigWig'
			bigWigSummary_path = os.path.split(sys.argv[0])[0]
			if bigWigSummary_path == '':
				bigWigSummary_path = './'
			bigWigSummary_path = os.path.join(bigWigSummary_path, 'bigWigSummary')
			if not os.path.isfile(bigWigSummary_path):
				print 'Error: ' + bigWigSummary_path + ' does not exist, needed to read bigWig file specified by -u option, try downloading from http://hgdownload.cse.ucsc.edu/admin/exe/'
				return 1
		elif uniquenessfiletype == 'fasta':
			# detect if it uses variable length uniqueness files
			if any(f.endswith('.btxt') for f in os.listdir(uniqueposdir)):
				global_readlength = None
				uniquenessfiletype = 'Helena'
		ONLYUNIQUEPOS = 1
	else: ONLYUNIQUEPOS = 0
	if testargumentflag("-maxgenes"): MAXGENES = int(getargument("-maxgenes"))
	else: MAXGENES = 0
	if testargumentflag("-maxreads"):
		MAXREADS = int(getargument("-maxreads"))
		randomreads = testargumentflag("-randomreads")
	else: MAXREADS = 0; randomreads = 0
	if testargumentflag("-table"): outputformat = 'table'
	else: outputformat = 'v2'
	if testargumentflag("-readcount"): addreadcount = 1
	else: addreadcount = 0
	if testargumentflag("-readpresent"): readpresent = True 
	else: readpresent = False
	if testargumentflag("-intronsinstead") or testargumentflag("-introns"): INTRONSINSTEAD = 1
	else: INTRONSINSTEAD = 0
	if testargumentflag("-strand"): USESTRANDINFO = 1
	else: USESTRANDINFO = 0
	if testargumentflag("-swapstrands"):
		swapstrands = 1
		USESTRANDINFO = 1
	else: swapstrands = 0
	removemultimappers = testargumentflag("-unique")
	if testargumentflag("-maxlength"): maxgenelength = int(getargument("-maxlength"))
	else: maxgenelength = 0
	if testargumentflag("-onlycoding"): useonlycoding = 1
	else: useonlycoding = 0
	if testargumentflag("-sortpos"): originalorder = 0
	else: originalorder = 1
	if testargumentflag("-mRNAnorm") or testargumentflag("-exonnorm"): genereadnormalise = 1
	elif testargumentflag("-allmapnorm"): genereadnormalise = 0
	else: genereadnormalise = 1
	exonnorm_include_ncRNA = testargumentflag("-exonnorm")
	if testargumentflag("-nooverlap") or testargumentflag("-nodeconvolution"): NODECONVOLUTION = 1
	else: NODECONVOLUTION = 0
	if testargumentflag("-txunique"): ONLYTXUNIQUEEXONS = 1
	else: ONLYTXUNIQUEEXONS = 0
	if testargumentflag("-rmnameoverlap"): removenameoverlap = 1
	else: removenameoverlap = 0
	if testargumentflag("-bothendsceil"):
		mapends = 1
		mapends_ceil = 1
	elif testargumentflag("-bothends"):
		mapends = 1
		mapends_ceil = 0
	else:
		mapends = 0
		mapends_ceil = 0
	midread = testargumentflag("-midread")
	flattengenes = testargumentflag("-flat")
	ignoredchrfragments = []
	if not testargumentflag("-keephap"):
		ignoredchrfragments.append("_hap")
		keephap = False
	else:
		keephap = True
	if testargumentflag("-norandom"):
		ignoredchrfragments.append("_random")
		keeprandom = False
	else:
		keeprandom = True
	if testargumentflag("-addchr"): addchr = 1
	else: addchr = 0
	allchromosomes = []
	usedreadspersample = []
	totalreadspersample = []
	normreadfactors = []
	allchromosomes_dict = {}
	
	if annotationtype == 7: # -ensgtfann
		chrom_translation = {'X':'chrX', 'Y':'chrY', 'MT':'chrM'}
		for i in range(1,100):
			chrom_translation[str(i)] = 'chr%d'%i
		from collections import defaultdict
		lines_per_ID = defaultdict(list)
	
	annotationfileh = openfile(annotationfile)
	counter = 0
	annlines = []
	for line in annotationfileh:
		if line[0] == '#': continue
		if MAXGENES and counter > MAXGENES: break
		if annotationtype in [3,4,6] and line.startswith("track"): continue # bed
		counter += 1
		if annotationtype == 7: # -ensgtfann
			p_ann = line.rstrip('\r\n').split('\t')
			ID = gtf_field(p_ann, ['transcript_name', 'transcript_id'])
			chrom = chrom_translation.get(p_ann[0], p_ann[0])
			if '_MHC_' in chrom:
				if not keephap: continue # throw out haplotype chromosomes
			elif not keeprandom:
				if p_ann[0] not in chrom_translation:
					continue # throw out _random chromsosomes / unplaced contigs
			lines_per_ID[(chrom, ID)].append((line, chrom, counter))
		else:
			newline = Cline(line, counter)
			annlines.append(newline)
	annotationfileh.close()
	if annotationtype == 7: # -ensgtfann
		annlines = [Cline(L, min(e[-1] for e in L)) for L in lines_per_ID.values()]
	annlines.sort()
	if vocal: print "Has sorted entries from gene annotation file"

	global warnedchromosomes
	warnedchromosomes = []

	genes = []
	currentgene = Cgene("none")
	oldend = -1000
	annotation_includes_mRNA = 0
	for annline in annlines:
		line = annline.line
		
		# file format-specific code
		(chromosome, direction, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand) = fromannotationline(line)
		
		# remove _hap and maybe _random chromosomes
		disallowedchr = 0
		for a in ignoredchrfragments:
			if a in chromosome: disallowedchr = 1
		if disallowedchr: continue

		# if -onlycoding argument then skip any noncoding transcripts
		if useonlycoding and cdsstart == cdsend: continue

		# initiate the transcript and gene
		transcript = Ctranscript(ID, annline.sortnum)
		if oldend < min(exonstarts) or chromosome != currentgene.chromosome or NODECONVOLUTION:
			currentgene = Cgene(genename)
			genes.append(currentgene)
			currentgene.chromosome = chromosome
			oldend = max(exonends)
		else:
			oldend = max([max(exonends), oldend])
		currentgene.transcripts.append(transcript)
		transcript.genename = genename
		
		if maxgenelength > 0:
			# trim from 3' end
			totaltxlength = 0
			if direction == "+":
				for exoni in range(len(exonstarts)-1, -1, -1):
					if totaltxlength >= maxgenelength:
						exonstarts = exonstarts[exoni+1:]
						exonends = exonends[exoni+1:]
						break
					else:
						totaltxlength += exonends[exoni] - exonstarts[exoni]
						if totaltxlength > maxgenelength:
							exonstarts[exoni] += totaltxlength - maxgenelength
			else:
				for exoni in range(len(exonstarts)):
					if totaltxlength >= maxgenelength:
						exonstarts = exonstarts[:exoni]
						exonends = exonends[:exoni]
						break
					else:
						totaltxlength += exonends[exoni] - exonstarts[exoni]
						if totaltxlength > maxgenelength:
							exonends[exoni] -= totaltxlength - maxgenelength
		elif maxgenelength < 0:
			# trim from 5' end
			totaltxlength = 0
			if direction == "-":
				for exoni in range(len(exonstarts)-1, -1, -1):
					if totaltxlength >= -maxgenelength:
						exonstarts = exonstarts[exoni+1:]
						exonends = exonends[exoni+1:]
						break
					else:
						totaltxlength += exonends[exoni] - exonstarts[exoni]
						if totaltxlength > -maxgenelength:
							exonstarts[exoni] += totaltxlength + maxgenelength
			else:
				for exoni in range(len(exonstarts)):
					if totaltxlength >= -maxgenelength:
						exonstarts = exonstarts[:exoni]
						exonends = exonends[:exoni]
						break
					else:
						totaltxlength += exonends[exoni] - exonstarts[exoni]
						if totaltxlength > -maxgenelength:
							exonends[exoni] -= totaltxlength + maxgenelength

		for exoni in range(len(exonstarts)):
			if cdsstart != cdsend and not FULLTRANSCRIPTS:
				# remove 3'UTR
				if direction == "+" and exonends[exoni] <= cdsend:
					newexon = Cexon(exonstarts[exoni], exonends[exoni], True)
				elif direction == "+" and exonstarts[exoni] < cdsend and exonends[exoni] > cdsend:
					newexon = Cexon(exonstarts[exoni], cdsend, True)
				elif direction == "-" and exonstarts[exoni] >= cdsstart:
					newexon = Cexon(exonstarts[exoni], exonends[exoni], True)
				elif direction == "-" and exonstarts[exoni] < cdsstart and exonends[exoni] > cdsstart:
					newexon = Cexon(cdsstart, exonends[exoni], True)
				else:
					continue
			elif cdsstart != cdsend or exonnorm_include_ncRNA:
				newexon = Cexon(exonstarts[exoni], exonends[exoni], True)
			else:
				newexon = Cexon(exonstarts[exoni], exonends[exoni], False)
			if cdsstart != cdsend:
				annotation_includes_mRNA = 1

			if INTRONSINSTEAD:
				# add exon to forbidden set
				newexon.forbidden=True

			currentgene.exons.append(newexon)
			newexon.transcripts.append(transcript)
			
		if INTRONSINSTEAD:
			for exoni in range(len(exonstarts)):		
				# add intron to allowed set
				if exoni + 1 == len(exonstarts):
					continue
				newexon = Cexon(exonends[exoni], exonstarts[exoni+1], False)
				currentgene.exons.append(newexon)
				newexon.transcripts.append(transcript)
				
	annotationfileh.close()
	if vocal: print "Has read positions from gene annotation file"
	
	uniquepos_filesuffix = None
	if ONLYUNIQUEPOS and uniquenessfiletype == 'Helena':
		# find the right value for uniquepos_filesuffix
		for suffix in ('.genomeGeneLevelMerge.unique20-255.btxt','.unique20-255.btxt', '_unique20-255.btxt'):
			if any(f.endswith(suffix) for f in os.listdir(uniqueposdir)):
				uniquepos_filesuffix = suffix
				break
		else:
			for chromosome in sorted(list(allchromosomes), key=lambda s: -len(s)):
				if USESTRANDINFO: chromosome = chromosome[:-1]
				for f in os.listdir(uniqueposdir):
					if f.startswith(chromosome) and f.endswith('.btxt'):
						uniquepos_filesuffix = f[len(chromosome):]
						break
				if uniquepos_filesuffix is not None: break
			if vocal: print 'Guessed -u file suffix:', uniquepos_filesuffix
	
	if not annotation_includes_mRNA and not (testargumentflag("-mRNAnorm") or testargumentflag("-exonnorm")):
		genereadnormalise = 0
	
	# read file with problem regions to remove from annotated genes
	if testargumentflag("-rmregions"):
		removalfile = getargument("-rmregions")
		with open(removalfile, 'r') as infh:
			for line in infh:
				chromosome, direction, cdsstart, cdsend, exonstarts, exonends, genename, ID, inferred_strand = fromannotationline(line, 6) # load bed format
				for start, end in zip(exonstarts, exonends):
					r = Cregion(chromosome, start, end)
					r.addtowindows()
				
				# if -strand and -rmregion file doesn't specify strand, remove from both strands
				strand1, strand2 = ('-','+') if swapstrands else ('+','-')
				if inferred_strand:
					if chromosome[-1] != strand1:
						if vocal: print "Warning: something wrong in annotation parsing"
						continue
					for start, end in zip(exonstarts, exonends):
						r = Cregion(chromosome[:-1]+strand2, start, end)
						r.addtowindows()
	
	# decompress bigwig file if compressed
	has_decompressed_bigwig = 0
	if bigWigSummary_path and (uniqueposdir.endswith(n) for n in ('.bw.gz','.bw.bz2', '.bigWig.gz', '.bigWig.bz2')):
		import tempfile
		f = tempfile.mkstemp(suffix='_rpkmforgenes')[1]
		h = open(f,'w')
		zh = openfile(uniqueposdir, 'r')
		h.write(zh.read())
		h.close()
		zh.close()
		uniqueposdir = f
		has_decompressed_bigwig = 1
	try:
		# sort out isoform overlaps
		for gene in genes:
			newexons = []
			# remove from a list of problem regions
			for ei, exon in enumerate(gene.exons):
				overlap = Cregion.overlappingpoint(gene.chromosome, exon.end)
				if overlap:
					exon.end = min(r.start for r in overlap)
				overlap = Cregion.overlappingpoint(gene.chromosome, exon.start)
				if overlap:
					exon.end = max(r.end for r in overlap)
				if exon.start >= exon.end:
					continue
				newexons_here = split_by_rmregions(exon, gene)[1:] # [1:] so it doesn't add 'exon' which is already in the list
				newexons.extend(newexons_here)
			gene.exons.extend(newexons) 
			
			#gene.exons = [exon for exon in gene.exons if exon.start < exon.end]
			
			# generate possible exons
			borders = list(set([exon.start for exon in gene.exons] + [exon.end for exon in gene.exons]))
			borders.sort()
			possibleexons = [Cexon(start, end, False) for start, end in zip(borders[:-1], borders[1:])]
		
			# populate exons
			for e_exon in gene.exons:
				for p_exon in possibleexons:
					# possibleexons shorter or as long as gene.exons elements at this stage, so little checks
					if e_exon.start <= p_exon.start < e_exon.end:
						p_exon.transcripts.extend(e_exon.transcripts)
						p_exon.normalise_with = p_exon.normalise_with or e_exon.normalise_with
						p_exon.forbidden = p_exon.forbidden or e_exon.forbidden
		
			# remove constitutive introns
			gene.exons = [exon for exon in possibleexons if len(exon.transcripts) > 0]
			
			if removenameoverlap:
				make_overlapsets(gene, genecollapse, limitcollapse)
				for overlapset in gene.overlapsets:
					num_exons = 0
					removal_set = []
					for exon in gene.exons:
						if any(tx in overlapset for tx in exon.transcripts):
							num_exons += 1
							if len(set(tx.genename for tx in exon.transcripts)) > 1:
								removal_set.append(exon)
					if len(removal_set) != num_exons:
						for exon in removal_set:
							exon.forbidden = True # side effect: the normalizationreads value will be higher than the sum of genes' read counts, by ~1%
			
			# forbidden exons do not belong to transcripts
			for exon in gene.exons:
				if exon.forbidden:
					exon.transcripts = []
			
			make_overlapsets(gene, genecollapse, limitcollapse)
			
		# generate overlapsets
		if COLLAPSEGENES or flattengenes:
			for gene in genes:
				make_overlapsets(gene, genecollapse, limitcollapse)
			
			# flatten to one isoform if requested
			if flattengenes:
				for currentgene in genes:
					newtxset = []
					for overlapset in currentgene.overlapsets:
						containedexons = []
						containedgenenames = []
						for tx in overlapset:
							if tx.genename not in containedgenenames:
								containedgenenames.append(tx.genename)
						containedgenenames.sort()
						name1 = "+".join(containedgenenames)
						name2 = "+".join([tx.ID for tx in overlapset])
						flattx = Ctranscript(name2, min(tx.sortnum for tx in overlapset))
						flattx.genename = name1
						newtxset.append(flattx)
						for exon in currentgene.exons:
							for tx in exon.transcripts:
								if tx in overlapset:
									containedexons.append(exon)
									break
						for exon in containedexons:
							exon.transcripts.append(flattx)
					for exon in currentgene.exons:
						exon.transcripts = [tx for tx in exon.transcripts if tx in newtxset]
					currentgene.transcripts = newtxset
	finally:
		if has_decompressed_bigwig:
			try: os.remove(uniqueposdir)
			except: pass
	
	allwindows = [[] for chromosome in allchromosomes]
	for gene in genes:
		for exon in gene.exons:
			chrID = allchromosomes_dict[gene.chromosome]
			w_start = exon.start//WINDOWSIZE
			w_end = exon.end//WINDOWSIZE
			lw_chr = len(allwindows[chrID])
			if lw_chr <= w_end:
				allwindows[chrID].extend([] for i in xrange(w_end-lw_chr+1))
			for w_i in range(w_start, w_end+1):
				allwindows[chrID][w_i].append(exon)
	if vocal: print "Has indexed exons"
	
	has_not_calc_ulen = True
	
	# get the reads
	for infile in infiles:
		from collections import defaultdict
		readlength_dict = defaultdict(int) if global_readlength is None else None
		for gene in genes:
			for exon in gene.exons:
				exon.differentreadsarray = []
				exon.reads = 0
		
		if infileformat == "guess":
			# if no input type given, guess from file suffix
			anndict = {'bed':'bed', 'gff':'gtf', 'gtf':'gtf', 'bam':'bamse'}
			try:
				annpre, ending = infile.rsplit('.',1)
				if ending in compressionsuffixes: ending = annpre.rsplit('.',1)[1]
				if ending == 'sam':
					try:
						import pysam
					except ImportError:
						infileformat_l = 'samse'
					else:
						infileformat_l = 'samse' #before: 'bamse'
				else:
					infileformat_l = anndict[ending]
			except:
				infileformat_l = "bed"
		else:
			infileformat_l = infileformat
		
		if infileformat_l == "bed":
			readgen = readsfrombedtabfile
		elif infileformat_l == "bedspace":
			readgen = readsfrombedspacefile
		elif infileformat_l == "bedcompacted":
			readgen = readsfrombedtabfile_rs
		elif infileformat_l == "gtf":
			readgen = readsfromgtffile
		elif infileformat_l == "sam":
			readgen = readsfromsam
		elif infileformat_l == "samse":
			readgen = readsfromsam_se
		elif infileformat_l == "bamse":
			readgen = readsfrombam_se
		elif infileformat_l == "bam":
			readgen = readsfrombam
		elif infileformat_l == "bowtie":
			readgen = readsfrombowtieoutput
		else:
			if vocal: print "Error: unrecognized format:", infileformat
			return 1
		
		usedreads = 0
		totalreads = 0
		
		if randomreads:
			l_totalreads = sum(1 for x in readgen(infile, True, None))
			inclusionlist = tuple(v>=MAXREADS for v in numpy.random.permutation(l_totalreads))
		for chrID, read_tuple in readgen(infile, False, readlength_dict):
			totalreads += 1
			if randomreads:
				if inclusionlist[totalreads-1]: continue
			elif totalreads == MAXREADS: break
			if chrID == -1: continue
			
			if midread:
				if len(read_tuple) == 2:
					read_tuple = ((abs(read_tuple[0])+abs(read_tuple[1]))//2,)
				elif len(read_tuple) == 4:
					if read_tuple[2] is None:
						read_tuple = ((abs(read_tuple[0])+abs(read_tuple[1]))//2, None)
					else:
						read_tuple = ((abs(read_tuple[0])+abs(read_tuple[1]))//2, (abs(read_tuple[2])+abs(read_tuple[3]))//2)
				else:
					if vocal: print "Warning: problem with read_tuple length and -midread"
			
			if mapends:
				exons = []
				for pos in read_tuple:
					if pos is None: continue
					exons.extend(mappos(chrID, abs(pos)))
			else:
				exons = mappos(chrID, read_tuple[0])
				
			if usediffreads and len(exons) > 0:
				if read_tuple in exons[0].differentreadsarray:
					continue
				else:
					exons[0].differentreadsarray.append(read_tuple)
			
			for exon in exons:
				if mapends:
					exon.reads += 1.0/len(read_tuple)
				else:
					exon.reads += 1
		if randomreads: totalreads = MAXREADS
		if vocal: print "Has compared reads to exons"
		
		if has_not_calc_ulen and global_readlength is not None:
			# fixed read length
			calcuniqlength(genes, {global_readlength: 1}, uniquepos_filesuffix)
			has_not_calc_ulen = False
		elif global_readlength is None:
			# multiple read lengths determined by the sample
			# different for different samples
			calcuniqlength(genes, readlength_dict, uniquepos_filesuffix)
		
		# RPKM calculation
		for gene in genes:
			for exon in gene.exons:
				exon.readspersample.append(exon.reads)
				if exon.normalise_with: usedreads += exon.reads
			
			if sum(not exon.forbidden for exon in gene.exons) == 0:
				for tx in gene.transcripts:
					tx.expression.append(-1)
					tx.reads.append(0)
				continue
			for exon in gene.exons:
				if exon.reads > 0 and not exon.forbidden: break
			else: # no reads found
				for tx in gene.transcripts:
					tx.expression.append(0.0)
				continue
				
			# deconvolute gene isoforms
			#  L is the lengths
			#  R is the reads
			#  D is the densities
			#  L*D = R, D=(LT*L)^-1 * LT * R, where ^-1 is inversion and T is transposition

			# first prepare the arrays
			Ll = []	# list that is to become the matrix L
			for exon in gene.exons:
				if exon.forbidden: continue
				Lc = []	# column of L
				for tx in gene.transcripts:
					if tx in exon.transcripts:
						Lc.append(float(exon.length))
					else:
						Lc.append(0.0)
				Ll.append(Lc)
			Rl = []	# list that is to become the matrix R
			for exon in gene.exons:
				if exon.forbidden: continue
				Rl.append([float(exon.reads)])
			
			Dindices = range(len(gene.transcripts))	# which element in gene.transcripts that an element in D corresponds to
			redofornegative = 1
			counter = 0
			
			if ONLYTXUNIQUEEXONS:
				# only use exons that are only found in one isoform
				row = 0
				while row < len(Rl):
					if elementsinrow(Ll[row]) != 1:
						del Ll[row]
						del Rl[row]
					else:
						row += 1
				if len(Rl) == 0:
					redofornegative = 0
					Dindices = []
			
			while redofornegative:			
				# contract the arrays
				(Ll, Rl) = contractarrays(Ll, Rl)
					
				if len(Ll) == 0:
					Dindices = []
					break
					
				# find isoforms that have no reads and remove them
				readthreshold = 0
				zeroreadcolumns = []
				for column in range(len(Ll[0])):
					zeroreadcolumns.append(0)
				row = 0
				while row < len(Rl):
					nonzeros = 0
					crow = 0
					while crow < len(Rl):
						if rowincludes(Ll[row], Ll[crow]):
							if Rl[crow][0] > readthreshold:
								nonzeros += 1
						crow += 1
					if nonzeros == 0:
						for element in range(len(Ll[row])):
							if Ll[row][element]:
								zeroreadcolumns[element] = 1
					row += 1
				count = 0	# to go through the rows with the fewest exons first
				while count < len(Rl):
					row = 0
					while row < len(Rl):
						if Rl[row][0] > readthreshold and elementsinrow(Ll[row]) == count and \
						 rowincludes(zeroreadcolumns, Ll[row]):
							for element in range(len(Ll[row])):
								if Ll[row][element]:
									zeroreadcolumns[element] = 0
						row += 1		
					count += 1			
				columnsremoved = 0	
				
				for column in range(len(Ll[0])):
					if zeroreadcolumns[column] and len(Ll[0]) > 1:
						Ll = removecolumn(Ll, column-columnsremoved)
						del Dindices[column-columnsremoved]
						columnsremoved += 1	
				if columnsremoved:
					(Ll, Rl) = contractarrays(Ll, Rl)
					if len(Ll) > 1:
						(Ll, Rl) = removezeros(Ll, Rl)
			
				# create the matrices and do the matrix calculations
				L = matrix(Ll)
				R = matrix(Rl)
				LTL = L.transpose() * L
				inverseturn = 0
				
				# these three lines fixed a problem 24 Apr 2012 with massively too large RPKM values for certain samples and genes
				# that problem appears to result from a singular matrix which contains rounding errors and therefore didn't make linalg.inv raise an exception, but produce invLTL with lots of big values
				# this code forces the algorithm to treat borderline genes as giving a potentially singular matrix, instead of testing for exception with linalg.inv
				# the number is set to be large enough in relation to the numbers in LTL to not be rounded away
				# side-effect is an 10^-6 error on average for RPKM values
				# 
				detsign, logdet = linalg.slogdet(LTL)
				if detsign == 0 or logdet - 2*math.log(numpy.mean(abs(LTL))) < -4:
					for ii in range(len(LTL)):
						LTL[ii, ii] += numpy.mean(abs(LTL))*0.0000001
				
				while 0 <= inverseturn <= 1:
					try:
						invLTL = linalg.inv(LTL)
					except:
						# this exception should no longer get thrown
						if inverseturn == 1:
							if vocal: print "Could not take the inverse of LTL for", gene.name
						for ii in range(len(LTL)):
							LTL[ii, ii] += 1	# bit cheating, but this ends up with the same expression estimate for both tx, though calculated with a slightly longer length
						inverseturn += 1
					else:
						D = invLTL * L.transpose() * R
						inverseturn += 1000
				
				# check if any density is lower than zero, end the loop if none
				redofornegative = 0
				negatives = []
				for index in range(len(Dindices)):
					if D[index, 0] <= 0:
						negatives.append(index)
						redofornegative = 1
						
				# remove columns that give negative values from the matrix
				# (those densities will instead be set to zero)
				if redofornegative:
					columnsremoved = 0
					for negativepos in negatives:
						Ll = removecolumn(Ll, negativepos-columnsremoved)
						del Dindices[negativepos-columnsremoved]
						columnsremoved += 1
						
					(Ll, Rl) = removezeros(Ll, Rl)
				counter += 1
				if counter == 1000:
					if vocal: print "Warning: high counter in negloop"
			
			# calculate expression from densities
			Danswers = numpy.zeros(len(gene.transcripts))	# so that found to be negative are set to zero
			for index in range(len(Dindices)):
				Danswers[Dindices[index]] = D[index, 0]
			for index in range(len(gene.transcripts)):
				tx = gene.transcripts[index]
				density = Danswers[index]
				tx.expression.append(1e9 * density / totalreads)
				txreads = 0
				for exon in gene.exons:
					if exon.forbidden: continue
					if ONLYTXUNIQUEEXONS and len(exon.transcripts) > 1:
						continue
					if tx in exon.transcripts:
						txreads +=exon.reads
				tx.reads.append(txreads)
		
		if vocal: print "Has calculated expression values for", infile
		if genereadnormalise and not forcedtotal:
			if vocal: print "Reads for normalisation:", usedreads
			if usedreads == 0:
				if vocal: print "Warning: no reads for normalisation, will use _1 read_ as normalisation factor"
				usedreads = 1
		
		usedreadspersample.append(usedreads)
		totalreadspersample.append(totalreads)
		
	tunits = []
	if COLLAPSEGENES and not flattengenes:
		for currentgene in genes:
			for overlapset in currentgene.overlapsets:
				containedgenenames = []
				tunit = Cunit(min(tx.sortnum for tx in overlapset))
				tunits.append(tunit)
				for tx in overlapset:
					if tx.genename not in containedgenenames:
						containedgenenames.append(tx.genename)
				containedgenenames.sort()
				tunit.name1 = "+".join(containedgenenames)
				tunit.name2 = "+".join([tx.ID for tx in overlapset])
				tunit.rpkms = [sum([f.expression[t] for f in overlapset]) for t in range(len(tx.expression))]
				tunit.reads = [0.0 for rpkm in tunit.rpkms]
				for exon in currentgene.exons:
					if exon.length == 0: continue # don't count reads that didn't affect the rpkm value, can occur if non-unique exon with reads extending into intron
					if exon.forbidden: continue #doesn't do anything
					if sum([1 for tx in exon.transcripts if tx in overlapset]):
						for ii in range(len(tunit.rpkms)):
							tunit.reads[ii] += exon.readspersample[ii]
	else:	
		for gene in genes:
			for tx in gene.transcripts:
				tunit = Cunit(tx.sortnum)
				tunit.name1 = tx.genename
				tunit.name2 = tx.ID
				tunit.rpkms = tx.expression
				tunit.reads = [0.0 for rpkm in tunit.rpkms]
				for exon in gene.exons:
					if exon.length == 0: continue # don't count reads that didn't affect the rpkm value, can occur if non-unique exon with reads extending into intron
					if exon.forbidden: continue #doesn't do anything
					if tx in exon.transcripts:
						for ii in range(len(tunit.rpkms)):
							tunit.reads[ii] += exon.readspersample[ii]
				tunits.append(tunit)

	# change normalisation if requested
	for si in range(len(infiles)):
		normreadfactors.append(totalreadspersample[si])
		if forcedtotal or genereadnormalise:
			if forcedtotal:
				samplefactor = totalreadspersample[si]/float(forcedtotal[si])
				normreadfactors[si] = forcedtotal[si]
			elif genereadnormalise:
				samplefactor = totalreadspersample[si]/float(usedreadspersample[si])
				normreadfactors[si] = usedreadspersample[si]
			for tu in tunits:
				if tu.rpkms[si] >= 0:
					tu.rpkms[si] *= samplefactor
					if vocal and tu.rpkms[si] > 1e6:
						print 'Warning: %s has a very high RPKM value of %f'%(tu.name1, tu.rpkms[si])
	
	# output to file
	outfileh = openfile(outfile, "w")
	if outputformat == 'v2':
		print >>outfileh, "\t".join(["#samples"]+names)
		print >>outfileh, "\t".join(["#allmappedreads"] + [str(v) for v in totalreadspersample])
		if forcedtotal:
			print >>outfileh, "\t".join(["#normalizationreads"] + [str(v) for v in forcedtotal])
		else:
			print >>outfileh, "\t".join(["#normalizationreads"] + [str(v) for v in normreadfactors])
		print >>outfileh, "#arguments\t"+" ".join(sys.argv) + "\ttime:" + time.asctime() + '\tversion:'+lastmodified
	
		if originalorder:
			tunits.sort(key=lambda unit: unit.sortnum)
	
		for tu in tunits:
			if readpresent:
				if tu.reads[0] <= 0: continue # Ramu, skip 0 count reads
			if addreadcount:
				if mapends_ceil:
					print >>outfileh, '\t'.join([tu.name1, tu.name2] + [str(v) for v in tu.rpkms] + [str(int(math.ceil(v))) for v in tu.reads])
				else:
					print >>outfileh, '\t'.join([tu.name1, tu.name2] + [str(v) for v in tu.rpkms] + [str(v)[:-2] if str(v).endswith('.0') else str(v) for v in tu.reads])
			else:
				print >>outfileh, '\t'.join([tu.name1, tu.name2] + [str(v) for v in tu.rpkms])
	elif outputformat == 'table':
		print >>outfileh, "\t".join(["Gene"]+names)
		if originalorder:
			tunits.sort(key=lambda unit: unit.sortnum)
		for tu in tunits:
			if any(lvl < 0 for lvl in tu.rpkms): continue
			if tu.name2 in ('.','+','-',''):
				name = tu.name1
			elif tu.name1 in ('.','+','-',''):
				name = tu.name2
			else:
				name = tu.name1.replace('+',' ')+' '+tu.name2
			if addreadcount:
				if mapends_ceil:
					print >>outfileh, '\t'.join([name] + [str(int(math.ceil(v))) for v in tu.reads])
				else:
					print >>outfileh, '\t'.join([name] + [str(v)[:-2] if str(v).endswith('.0') else str(v) for v in tu.reads])
			else:
				print >>outfileh, '\t'.join([name] + [str(v) for v in tu.rpkms])
	outfileh.close()
	if vocal: print "Has written to", outfile
	
	# print information about the gene annotation used to file
	if exportannotation:
		outfileh = open(exportannotation, 'w')
		print >>outfileh, "#numbers for " + infiles[-1]
		print >>outfileh, '#genename\tchromosome\texons'
		print >>outfileh, '#exon:\tstart\tend\tlength\treads\ttranscript_IDs'
		for gene in genes:
			exons = [exon for exon in gene.exons if not exon.forbidden]
			print >>outfileh, gene.name + '\t' + gene.chromosome +'\t' + str(len(exons))
			for exon in exons:
				print >>outfileh, '\t'.join(map(str,['exon:', exon.start, exon.end, exon.length, exon.reads] + [tx.ID+'|'+tx.genename for tx in exon.transcripts]))
	
	return 0

def calcuniqlength(genes, readlength_dict, filesuffix):
	"""
	Asks every non-forbidden exon of every gene to calculate its length.

	genes: iterable of gene objects; each is read for .exons and .chromosome
	readlength_dict: maps read length -> number of reads with that length;
		the counts are turned into fractional weights that sum to 1
	filesuffix: passed through unchanged to exon.calclength()

	Returns nothing; the work is done by exon.calclength() on each exon.
	Raises ZeroDivisionError if readlength_dict is non-empty but its counts sum to zero.
	"""
	# the total is the same for every key, so compute it once instead of once per read length
	total_reads = sum(readlength_dict.values())
	readlength_dict_weights = dict((r, float(n)/total_reads) for r, n in readlength_dict.items())
	for gene in genes:
		# forbidden exons do not belong to transcripts
		# calculate length for the rest
		for exon in gene.exons:
			if not exon.forbidden:
				exon.calclength(gene.chromosome, readlength_dict_weights, filesuffix)

# script entry point: run main() and hand any non-zero/truthy result to the shell as exit status
if __name__ == '__main__':
	exit_status = main()
	if exit_status:
		sys.exit(exit_status)

###ChIP-seq code

Bowtie:
>bowtie2 -p 16 -x [path to genome file here] [.fastq input file name here] > [output file name]

Calling peaks with MACS2:
>macs2 callpeak -t [output from bowtie2, experimental sample] -c [output from bowtie2, control sample] --nomodel -B -g 1.87e9;

Generating peak-centric heatmaps, where -size indicates area around peak center and -hist indicates bin size:

>annotatePeaks.pl [macs2 output or any peaks/bed file really] [genome, like mm9] -size 6000 -hist 100 -ghist -p [other peak file you want to compare macs2 output to] > [output file name] 

###Motif Analysis code

General de novo/known motif analysis:
>extractseqforpeaks_v3.pl [peaks.xls file name] 5 [output file name] 100
>findMotifs.pl [output file from extractseqforpeaks] fasta [output directory] -fasta [background output file]

Determining peak-centric motif occurrence using Homer:
>annotatePeaks.pl [peaks.xls file name] mm9 -size 4000 -hist 25 -m [HOMER .motif files here] > [output file name]

Assigning peaks to gene features:
>map_peak_to_gene_features_v3.pl [peaks.xls file name] 5 2015.Mar.RefFlat.txt 1000 [output file name]
>perl gen-peak-feature-counts.pl [peaks.xls file name] 1 [output file name from peaktogenefeatures] 20000 2000 2000 0 1 > [output file]

extractseqforpeaks_v3.pl
#!/usr/bin/perl

## For every peak whose score passes <threshold>, extract the genomic sequence
## spanning <buffer> bp on either side of the peak position and write it to
## <out.seq> as a FASTA-style record (">" header line followed by the sequence).
##
## buffer indicates extension of bp from center of a peak

use strict;
use warnings;

# All four positional arguments are read below, so require all of them.
if (@ARGV < 4) {
   print "Usage: ./prog.pl <peak file> <threshold> <out.seq> <buffer>\n";
   exit(1);
}

# Directory expected to hold one gzip-compressed FASTA file per chromosome (<chr>.fa.gz).
my $PATH = "/Users/bklee/Analysis/genome/mm9" ;
my $peakfile = $ARGV[0];
my $threshold = $ARGV[1];
my $seqfile = $ARGV[2];
my $BUF = $ARGV[3];

# chromosome name => array of peak hashes, filled in by read_seq_peaks()
my %hash_chr_peaks = ();

# The last argument selects column 3 (0-based) as the score compared with the threshold.
read_seq_peaks($peakfile, $threshold, \%hash_chr_peaks, 3);

open(my $seq_fh, '>', $seqfile) or die("Err opening $seqfile\n");
foreach my $chr (sort keys %hash_chr_peaks)
{
    my $chrname = $PATH."/".$chr.".fa.gz";

    my $chrseq = read_chr_seq($chrname);
    my $chrlen = length($chrseq);

    # Without any sequence there is nothing to extract for this chromosome.
    if ($chrlen == 0) {
        warn "No sequence read from $chrname; skipping peaks on $chr\n";
        next;
    }

    foreach my $peak (@{ $hash_chr_peaks{$chr} })
    {
        my $start     = $peak->{start};
        my $end       = $peak->{end};
        my $unw_score = $peak->{unw_score};
        my $w_score   = $peak->{w_score};
        my $peakpos   = $peak->{peakpos};

        # 1-based, inclusive window around the peak position.
        my $start_seq = $peakpos - $BUF;
        my $end_seq   = $peakpos + $BUF;

        # Clamp the window to the chromosome.  A start below 1 would give
        # substr() a negative offset, which counts from the END of the string
        # and would silently return sequence from the wrong end.
        $start_seq = 1       if $start_seq < 1;
        $end_seq   = $chrlen if $end_seq > $chrlen;
        if ($start_seq > $end_seq) {
            warn "Skipping peak at $chr:$peakpos: window lies outside the chromosome sequence (length $chrlen)\n";
            next;
        }

        my $name = $chr."_".$start_seq."_"."$end_seq" ;

        # Equals 2*$BUF+1 bases whenever no clamping was needed.
        my $seq = substr($chrseq, $start_seq-1, $end_seq-$start_seq+1);

        print {$seq_fh} ">$name\t$chr\t$start\t$end\t$w_score\t$unw_score\t$peakpos\t$BUF\n$seq\n";  ## for nmica input
    }
}
# Buffered write errors only surface when the handle is closed.
close($seq_fh) or die("Err closing $seqfile: $!\n");

# Reads a tab-delimited peak file and collects, per chromosome, the peaks whose
# score reaches a threshold.
#
# Arguments:
#   $file_pk        - path of the peak file; opened via open_file_return_handle()
#   $peak_threshold - rows whose filter-column value is below this are skipped
#   $ref_hash       - hash reference filled in place:
#                     chromosome name => array of peak hash references
#   $filterCol      - 0-based column holding the score to filter on (default 3)
#
# Returns the number of peaks kept.
sub read_seq_peaks{
    my($file_pk, $peak_threshold, $ref_hash, $filterCol) = @_;
    $filterCol = 3 if !defined($filterCol);
    my $ct = 0;
    my $FH = open_file_return_handle($file_pk);
    <$FH>;    # the first line is discarded (presumably a column header)
    # A lexical line variable is used so the caller's global $_ is left untouched.
    while (defined(my $line = <$FH>)) {
        $line =~ s/\n//;  $line =~ s/\r//;
        my @arr = split(/\t/, $line);
        my $score = $arr[$filterCol];
        # rows without a value in the filter column, or below threshold, are ignored
        if(!defined($score) || $score < $peak_threshold){ next; }

        my $chr   = $arr[0];
        my $start = $arr[1];
        my $end   = $arr[2];

        $ct++;
        my %peak = (
            start       => $start,
            end         => $end,
            w_score     => $arr[3],
            unw_score   => $arr[4],
            center      => ($start+$end)/2,
            peakpos     => $arr[6],
            sum_pzscore => $arr[10],
            sum_nreads  => $arr[11],
            data        => $line,       # the whole row, line ending removed
            ID          => $arr[9],
        );
        push @{ $ref_hash->{$chr} }, \%peak;
    }
    close($FH);
    return $ct;
}

# Opens a file for reading and returns the file handle.
#
# Names ending in ".gz" are read through "gzip -dc", names ending in ".zip"
# through "unzip -qc"; anything else is opened directly.
#
# Arguments:
#   $filename - path of the file to read ("-" reads standard input)
#
# Returns a fresh lexical file handle, so handles from separate calls are
# independent of each other.  Dies when a compressed file does not exist or
# when the open fails.
sub open_file_return_handle{
    my($filename) = @_;
    my $filehandle;
    # The suffix tests are escaped and anchored: an unescaped /.gz/ would also
    # match names such as "bigzfile.txt".
    if ($filename =~ /\.gz\z/) {
       die("File does not exist: '$filename'") unless -e $filename;
       # List-form pipe open: the file name never passes through a shell.
       open($filehandle, '-|', 'gzip', '-dc', '--', $filename) or die("Err opening $filename");
    }
    elsif ($filename =~ /\.zip\z/) {
       die("File does not exist: '$filename'") unless -e $filename;
       open($filehandle, '-|', 'unzip', '-qc', $filename) or die("Err opening $filename");
    }
    elsif ($filename eq '-') {
       # Two-argument open treated "-" as standard input; keep that behaviour.
       open($filehandle, '<&', \*STDIN) or die("Err opening $filename");
    }
    else {
        open($filehandle, '<', $filename) or die("Err opening $filename");
    }
    return $filehandle;
}


# Reads one sequence file and returns its sequence as a single string.
#
# Names ending in ".gz" are read through "gzip -dc", names ending in ".zip"
# through "unzip -qc"; anything else is opened directly.  The first line is
# discarded and the remaining lines are concatenated with all CR/LF
# characters removed.
#
# Arguments:
#   $file - path of the sequence file
#
# Returns the concatenated sequence ("" when nothing follows the first line).
# Dies when the file or the decompressor cannot be opened; only warns when
# closing fails, so an empty result is still returned as before.
sub read_chr_seq{
    my ($file) = @_;
    my $seq = "";
    my $chr_fh;

    # The suffix tests are escaped and anchored: an unescaped /.gz/ would also
    # match names such as "chrgz1.fa".  List-form pipe opens keep the file
    # name away from the shell.
    if ($file =~ /\.gz\z/) {
        open($chr_fh, '-|', 'gzip', '-dc', '--', $file) or die("Err opening $file");
    }
    elsif ($file =~ /\.zip\z/) {
        open($chr_fh, '-|', 'unzip', '-qc', $file) or die("Err opening $file");
    }
    else {
        open($chr_fh, '<', $file) or die("Err opening $file");
    }

    <$chr_fh>;    # discard the first line (the ">" header of a FASTA file)
    while (defined(my $line = <$chr_fh>))
     {
        $line =~ tr/\r\n//d;
        $seq .= $line;
     }
    # For a pipe, close() reports a failed decompressor (e.g. missing input file).
    close($chr_fh) or warn("Problem closing $file (reading may have failed): $! (exit status $?)\n");
    return $seq;
}

findMotifs.pl is from Homer (http://homer.ucsd.edu/homer/)

#!/usr/bin/env perl
use warnings;
use lib "/work/04237/lleblanc/HomerNov2018/.//bin";
my $homeDir = "/work/04237/lleblanc/HomerNov2018/./";


# Copyright 2009-2016 Christopher Benner <cbenner@ucsd.edu>
# 
# This file is part of HOMER
#
# HOMER is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HOMER is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.


use POSIX;
use HomerConfig;
use Statistics;

# Script-wide defaults.  $seqDir and $idtype are replaced further down when
# the chosen promoter set is present in the loaded configuration.
my $accDir = "$homeDir/data/accession/";
my $seqDir = "$homeDir/data/promoters/";
my ($reduceThresh, $matchThresh, $knownPvalueThresh) = (0.6, "T10", 0.01);
my $motifInfoFileName = "motifFindingParameters.txt";
my $config = HomerConfig::loadConfigFile();
my $idtype = "";
# Paths of generated files are recorded here as keys (value 1) by later steps.
our %toDelete = ();


# The three positional arguments are mandatory; otherwise show the usage text.
printCMD() if (@ARGV < 3);

# Print the usage/help text to STDERR and terminate the program.
#
# The list of available promoter sets is read from the file-scoped $config
# and printed in sorted order so the output is the same on every run.
# Takes no arguments and never returns (ends with exit).
sub printCMD {

	print STDERR "\n\tProgram will find de novo and known motifs in a gene list\n\n";
	print STDERR "\t\tUsage:  findMotifs.pl <input list> <promoter set> <output directory> [additional options]\n";
	print STDERR "\n\t\texample: findMotifs.pl genelist.txt mouse motifResults/ -len 10\n";
	print STDERR "\n\t\tFASTA example: findMotifs.pl targets.fa fasta motifResults/ -fasta background.fa\n";

	print STDERR "\n\tAvailable Promoter Sets: Add custom promoter sets with loadPromoters.pl\n";
	# One line per configured promoter set: name, organism, directory, start, end, id type.
	foreach (sort keys %{$config->{'PROMOTERS'}}) {
		print STDERR "\t\t$_\t$config->{'PROMOTERS'}->{$_}->{'org'}\t$config->{'PROMOTERS'}->{$_}->{'directory'}"
						. "\t$config->{'PROMOTERS'}->{$_}->{'start'}\t$config->{'PROMOTERS'}->{$_}->{'end'}"
						. "\t$config->{'PROMOTERS'}->{$_}->{'idtype'}\n";
	}
	print STDERR "\n\t\tTry typing \"perl $homeDir/configureHomer.pl -list\" to see available promoter sets\n";
	print STDERR "\t\tTyping \"perl $homeDir/configureHomer.pl -install NNN\" to install promoter set NNN\n";

	print STDERR "\n\tBasic options:\n";
	print STDERR "\t\t-len <#>[,<#>,<#>...] (motif length, default=8,10,12) [NOTE: values greater than 12 may cause the program\n";
	print STDERR "\t\t\tto run out of memory - in these cases decrease the number of sequences analyzed]\n";
	print STDERR "\t\t-bg <background file> (ids to use as background, default: all genes)\n";
	print STDERR "\t\t-start <#> (offset from TSS, default=-300) [max=based on Promoter Set]\n";	
	print STDERR "\t\t-end <#> (offset from TSS, default=50) [max=based on Promoter Set]\n";	
	print STDERR "\t\t-rna (output RNA motif logos and compare to RNA motif database, automatically sets -norevopp)\n";
	print STDERR "\t\t-mask/-nomask (use/don't use repeatmasked files, default: -mask)\n";
	print STDERR "\t\t-S <#> (Number of motifs to optimize, default: 25)\n";
	print STDERR "\t\t-mis <#> (global optimization: searches for strings with # mismatches, default: 1)\n";
	print STDERR "\t\t-noconvert (will not worry about converting input files into unigene ids)\n";
	print STDERR "\t\t-norevopp (do not search the reverse strand for motifs)\n";
	print STDERR "\t\t-nomotif (don't search for de novo motif enrichment)\n";

	print STDERR "\n\tScanning sequence for motifs\n";
	print STDERR "\t\t-find <motif file> (This will cause the program to only scan for motifs)\n";
	print STDERR "\n\tIncluding Enhancers - peak files of enhancer location, peak ID should be gene ID\n";
	print STDERR "\t\t-enhancers <peak file> <genome version>\n";
	print STDERR "\t\t\t(enhancers to include in search space, peaks/sequences should be named with a gene ID\n";
	print STDERR "\t\t\tIf multiple enhancers per gene, use the same gene ID, and all will be included)\n";
	print STDERR "\t\t-enhancersOnly (do not include promoter sequence in motif search)\n";

	print STDERR "\n\tFASTA files: If you prefer to use your own fasta files, place target sequences and \n";
	print STDERR "\t\tbackground sequences in two separate FASTA formatted files (must have unique identifiers)\n";
	print STDERR "\t\tTarget File - use in place of <input list> (i.e. the first argument)\n";
	print STDERR "\t\tBackground File - after output directory (with additional options) use the argument:\n";
	print STDERR "\t\t\t-fastaBg <background fasta file> (This is recommended for fasta based analysis)\n";
	print STDERR "\t\tIn place of the promoter set use \"fasta\", or any valid set (this parameter is ignored)\n";
	print STDERR "\t\tWhen finding motifs [-find], only the target file will be searched\n";
	print STDERR "\t\t\t-chopify (chops up background regions to match size of target regions)\n";
	print STDERR "\t\t\t\ti.e. if background is a full genome or all mRNAs\n";

	print STDERR "\n\tKnown Motif Options/Visualization:\n";
	print STDERR "\t\t-mset <vertebrates|insects|worms|plants|yeast|all> (check against motif collections, default: auto)\n";
	print STDERR "\t\t-basic (don't check de novo motifs for similarity to known motifs)\n";
	print STDERR "\t\t-bits (scale sequence logos by information content, default: doesn't scale)\n";
	print STDERR "\t\t-nocheck (don't check for similarity between de novo motifs and known motifs)\n";
	print STDERR "\t\t-mcheck <motif file> (known motifs to check against de novo motifs)\n";
	print STDERR "\t\t-noknown (don't search for known motif enrichment, default: -known)\n";
	print STDERR "\t\t-mknown <motif file> (known motifs to check for enrichment)\n";
	print STDERR "\t\t-nofacts (omit humor)\n";
	print STDERR "\t\t-seqlogo (uses weblogo/seqlogo/ghostscript to visualize motifs, default uses SVG)\n";

	print STDERR "\n\tAdvanced options:\n";
	print STDERR "\t\t-b (use binomial distribution to calculate p-values, hypergeometric is default)\n";
	print STDERR "\t\t-nogo (don't search for gene ontology enrichment)\n";
	print STDERR "\t\t-humanGO (Convert IDs to human for GO analysis)\n";
	print STDERR "\t\t-ontology <ont.genes> [ont.genes] ... (custom ontologies for GO analysis)\n";
	print STDERR "\t\t-noweight (no CG correction)\n";
	print STDERR "\t\t-noredun (Don't remove predetermined redundant promoters/sequences)\n";
	print STDERR "\t\t-g (input file is a group file, i.e. 1st column = id, 2nd = 0 or 1 [1=target,0=back])\n";
	print STDERR "\t\t-cpg (use CpG% instead of GC% for sequence normalization)\n";
	print STDERR "\t\t-rand (randomize labels for target and background sequences)\n";
	print STDERR "\t\t-maskMotif <motif file 1> [motif file 2] ... (motifs to mask before motif finding)\n";
	print STDERR "\t\t-opt <motif file 1> [motif file 2] ... (motifs to optimize/change length)\n";
	print STDERR "\t\t-peaks (will produce peak file of promoters to use with findMotifsGenome.pl)\n";
	print STDERR "\t\t-nowarn (no warnings)\n";
	print STDERR "\t\t-keepFiles (don't delete temporary files)\n";
	print STDERR "\t\t-dumpFasta (create target.fa and background.fa files)\n";
	print STDERR "\t\t-min <#> (remove sequences shorter than #, default: 0)\n";
	print STDERR "\t\t-max <#> (remove sequences longer than #, default: 1e10)\n";
	print STDERR "\t\t-reuse (rerun homer using old seq files etc. with new options\n";
	print STDERR "\t\t\t  and ignores input list, organism)\n";
	print STDERR "\t\t-fdr <#> (Calculate empirical FDR for de novo discovery #=number of randomizations)\n";

	print STDERR "\n";
	print STDERR "\thomer2 specific options:\n";
	print STDERR "\t\t-homer2 (use homer2 instead of original homer, default)\n";
	print STDERR "\t\t-nlen <#> (length of lower-order oligos to normalize - general sequences, default: 3)\n";
	print STDERR "\t\t\t-nmax <#> (Max normalization iterations, default: 160)\n";
	print STDERR "\t\t\t-neutral (weight sequences to neutral frequencies, i.e. 25%, 6.25%, etc.)\n";
	print STDERR "\t\t-olen <#> (lower-order oligo normalization for oligo table, use if -nlen isn't working well)\n";
	print STDERR "\t\t-p <#> (Number of processors to use, default: 1)\n";
	print STDERR "\t\t-e <#> (Maximum expected motif instance per bp in random sequence, default: 0.01)\n";
	print STDERR "\t\t-cache <#> (size in MB for statistics cache, default: 500)\n";
	print STDERR "\t\t-quickMask (skip full masking after finding motifs, similar to original homer)\n";
	print STDERR "\t\t-homer1 (to force the use of the original homer)\n";
	print STDERR "\t\t-minlp <#> (stop looking for motifs when seed logp score gets above #, default: -10)\n";
	print STDERR "\n\tOriginal homer specific options:\n";
	print STDERR "\t\t-float (allow adjustment of the degeneracy threshold for known motifs to improve p-value[dangerous])\n";
	print STDERR "\t\t-homer1 (to force the use of the original homer)\n";
	print STDERR "\t\t-depth [low|med|high|allnight] (time spent on local optimization default: med)\n";

	print STDERR "\n";
	exit;
}

# Parse the command line once; $cmd is the file-scoped hash reference of
# settings that the rest of the script reads (and in places updates).
my $cmd = parseCMDLine(\@ARGV);

# Build the settings hash for a findMotifs.pl run from the command line.
#
# Parameters:
#   $argv - reference to the argument list.  Elements 0-2 are the input list,
#           the promoter set and the output directory; options start at
#           element 3.
# Returns:
#   hash reference holding the defaults overridden by the supplied options.
# Side effects:
#   Echoes the selected options to STDERR.  An unrecognized option, or an
#   option that is missing its value, prints a message and calls printCMD().
#   An unknown -depth value or an incomplete -enhancers option exits directly.
sub parseCMDLine {
	my ($argv) = @_;

	my @len = ();
	my @motifFiles = ();
	my @motifFiles2 = ();
	my @ont = ();
	# NOTE(review): 'maxlen' starts at 10 although the longest default motif
	# length pushed at the end of this sub is 12 -- confirm this is intended.
	my $cmd = {fg=>$argv->[0], bg=>'', 
				promoters=>$argv->[1], output=>$argv->[2], 
				start=>-300, end=>50, rand=>0,maxlen=>10, gc=>1,
				noconv=>0,noweight=>0,redundant=>1.5, nogo=>0,motifMask=>\@motifFiles,motifOpt=>\@motifFiles2,
				nomotif=>0,noknown=>0, groupFlag=>0, norevopp=>0,float=>0,
				motif=>'', nomask=>0, len=>\@len, mis=>1,fasta=>'',mask=>1,
				S=>25, reuse=>0, g=>0, find=>'', alg=>'', peaks=>0,
				mcheck=>'', mknown=>'', checkFlag=>1,depth=>0.5,
				nowarn=>0,keepFiles=>0,homer2=>1,nlen=>3,olen=>0,expect=>0,cpus=>1,
				percentSimilar=>0.20,cache=>500,bits=>"",quickMask=>0,rnaMode=>0,
				minLen=>0,maxLen=>1e10,chopify=>0,nofacts=>"",fdr=>0,nmax=>160,neutral=>"",
				enhancersOnly=>0,enhancers=>"",genome=>"",dumpFasta=>0,minlp=>-10,mset=>'auto',
				humanGO=>0,seqlogo=>'',ontology=>\@ont
				};
	print STDERR "\nSelected Options:\n";
	print STDERR "\tInput file = $cmd->{'fg'}\n";
	print STDERR "\tPromoter Set = $cmd->{'promoters'}\n";
	print STDERR "\tOutput Directory = $cmd->{'output'}\n";

	# A promoter set whose name contains "-mRNA" switches on the RNA presets.
	if ($cmd->{'promoters'} =~ /\-mRNA/) {
		print STDERR "\tRunning in mRNA mode (-norevopp -min 200 -max 10000 -noknown)\n";
		$cmd->{'rnaMode'} = 1;
		$cmd->{'noknown'} = 1;
		$cmd->{'norevopp'} = 1;
		$cmd->{'minLen'} = 200;
		$cmd->{'maxLen'} = 10000;
	}

	# Index of the argument being processed; shared with the helpers below.
	my $i;

	# Return the value that follows the current option and step past it.
	# Only presence is checked, so values such as "-300" remain valid.
	my $nextValue = sub {
		if ($i+1 >= @$argv) {
			print STDERR "!! Option $argv->[$i] requires a value\n";
			printCMD();
		}
		return $argv->[++$i];
	};

	# Append every following argument up to the next "-option" (or the end of
	# the list) to @$list, leaving $i on the last argument consumed.
	my $collectFiles = sub {
		my ($list) = @_;
		while ($i+1 < @$argv && $argv->[$i+1] !~ /^\-/) {
			$i++;
			print STDERR "\t\t$argv->[$i]\n";
			push(@$list, $argv->[$i]);
		}
	};

	for ($i=3;$i<@$argv;$i++) { 
		my $arg = $argv->[$i];
		if ($arg eq '-bg') {
			$cmd->{'bg'} = $nextValue->();
			print STDERR "\tbackground file: $cmd->{'bg'}\n";
		} elsif ($arg eq '-len' ) {
			my @lengths = split /\,/, $nextValue->();
			$cmd->{'len'} = \@lengths;
			print STDERR "\tMotif length set at ";
			# Track the longest requested motif length.
			$cmd->{'maxlen'} = 0;
			foreach(@lengths) {
				print STDERR "$_, ";
				$cmd->{'maxlen'} = $_ if ($_ > $cmd->{'maxlen'});
				if ($_ > 12) {
					print STDERR "*** might want to hit ctrl+C to quit if it takes too long!\n";
					print STDERR "*** If you run out of memory try reducing the number of background sequences\n";
				}
			}
			print STDERR "\n";
		} elsif ($arg eq '-enhancersOnly' ) {
			$cmd->{'enhancersOnly'} = 1;
		} elsif ($arg eq '-enhancers' ) {
			# Needs two values: the peak file and the genome.
			if ($i+2 >= @$argv ) {
				print STDERR "Something's wrong with your command line argument: -enhancers <peak/fasta file> <genome/fasta>\n";
				exit(1);
			}
			$cmd->{'enhancers'} = $argv->[++$i];
			$cmd->{'genome'} = $argv->[++$i];
		} elsif ($arg eq '-neutral' ) {
			$cmd->{'neutral'} = " -neutral";
		} elsif ($arg eq '-humanGO' ) {
			$cmd->{'humanGO'} = 1;
		} elsif ($arg eq '-float' ) {
			$cmd->{'float'}=1;
		} elsif ($arg eq '-minlp' ) {
			$cmd->{'minlp'} = $nextValue->();
		} elsif ($arg eq '-mset' ) {
			$cmd->{'mset'} = $nextValue->();
		} elsif ($arg eq '-seqlogo' ) {
			$cmd->{'seqlogo'} = "-seqlogo";
		} elsif ($arg eq '-dumpFasta' ) {
			$cmd->{'dumpFasta'} = 1;
		} elsif ($arg eq '-nmax' ) {
			$cmd->{'nmax'} = $nextValue->();
		} elsif ($arg eq '-fdr' ) {
			$cmd->{'fdr'} = $nextValue->();
			print STDERR "\tWill randomize and repeat motif finding $cmd->{'fdr'} times to estimate FDR\n";
		} elsif ($arg eq '-g' ) {
			$cmd->{'groupFlag'} = 1; 
			print STDERR "\t$argv->[0] will be treated as a group file\n";
		} elsif ($arg eq '-basic' ) {
			$cmd->{'nofacts'} = " -basic ";
		} elsif ($arg eq '-nofacts' ) {
			$cmd->{'nofacts'} = " -nofacts ";
		} elsif ($arg eq '-keepFiles' ) {
			$cmd->{'keepFiles'} = 1; 
			print STDERR "\tWill keep temporary files\n";
		} elsif ($arg eq '-chopify' ) {
			$cmd->{'chopify'} = 1; 
			print STDERR "\tWill chopify the background FASTA files into the right size chunks\n";
		} elsif ($arg eq '-norevopp' ) {
			$cmd->{'norevopp'} = 1; 
			print STDERR "\tWill not search the reverse strand\n";
		} elsif ($arg eq '-depth' ) {
			# Keyword -> numeric depth setting stored in the hash.
			my %depthFor = (low=>1, med=>0.5, high=>0.1, allnight=>0.01);
			my $depth = $nextValue->();
			if (!exists($depthFor{$depth})) {
				print STDERR "Don't understand $depth as an optimization depth!\n";
				exit;
			}
			$cmd->{'depth'} = $depthFor{$depth};
			print STDERR "\tLocal optimization set on $depth depth\n";
		} elsif ($arg eq '-reuse' ) {
			$cmd->{'reuse'} = 1; 
			print STDERR "\tOld files will be used to repeat homer analysis\n";
		} elsif ($arg eq '-rna' ) {
			print STDERR "\tModifying analysis for RNA analysis\n";
			$cmd->{'rnaMode'} = 1;
			$cmd->{'norevopp'} = 1;
			$cmd->{'noknown'} = 1;
			$cmd->{'mknown'} = $homeDir . "/data/knownTFs/known.rna.motifs";
			$cmd->{'mcheck'} = $homeDir . "/data/knownTFs/all.rna.motifs";
		} elsif ($arg eq '-opt') {
			print STDERR "\tWill optimize motifs in the following files:\n";
			print STDERR "\t\tSkipping known enrichment check\n";
			$cmd->{'noknown'}=1;
			$collectFiles->($cmd->{'motifOpt'});
		} elsif ($arg eq '-maskMotif') {
			print STDERR "\tWill mask motifs in the following files:\n";
			$collectFiles->($cmd->{'motifMask'});
		} elsif ($arg eq '-ontology') {
			print STDERR "\tWill include the following ontology files for GO analysis:\n";
			$collectFiles->($cmd->{'ontology'});
		} elsif ($arg eq '-cpg' || $arg eq '-CpG') {
			# gc == 0 selects the CpG-based files/columns later in the script.
			$cmd->{'gc'} = 0;
			print STDERR "\tUsing CpG% instead of GC%\n";
		} elsif ($arg eq '-noweight' ) {
			$cmd->{'noweight'} = 1; 
			print STDERR "\tWill not adjust sequences for CG content\n";
		} elsif ($arg eq '-noredun' ) {
			$cmd->{'redundant'} = 2;
			#print STDERR "\tRemoving redundant sequences > $cmd->{'redundant'} % similar\n";
		} elsif ($arg eq '-nomotif' ) {
			$cmd->{'nomotif'} = 1; 
			print STDERR "\tWill not run homer for de novo motifs\n";
		} elsif ($arg eq '-peaks' ) {
			$cmd->{'peaks'} = 1; 
			print STDERR "\tWill create peak file of target promoters (will print to stdout)\n";
		} elsif ($arg eq '-known' ) {
			$cmd->{'noknown'} = 0;
		} elsif ($arg eq '-noknown' ) {
			$cmd->{'noknown'} = 1; 
			print STDERR "\tWill not search for known motifs\n";
		} elsif ($arg eq '-bits' ) {
			$cmd->{'bits'} = "-bits";
		} elsif ($arg eq '-homer1' ) {
			$cmd->{'homer2'} = 0;
			print STDERR "\tUsing original homer\n";
		} elsif ($arg eq '-homer2' ) {
			$cmd->{'homer2'} = 1;
			print STDERR "\tUsing homer2 (warning, unpredictably awesome results may, or may not, ensue)\n";
		} elsif ($arg eq '-cache' ) {
			$cmd->{'cache'} = $nextValue->();
		} elsif ($arg eq '-ps' ) {
			$cmd->{'percentSimilar'} = $nextValue->();
		} elsif ($arg eq '-quickMask' ) {
			$cmd->{'quickMask'} = 1;
		} elsif ($arg eq '-p' || $arg eq '-cpu') {
			$cmd->{'cpus'} = $nextValue->();
			print STDERR "\tUsing $cmd->{'cpus'} CPUs\n";
			print STDERR "\t\tWill only work with homer2 (i.e. use -homer2)\n" if ($cmd->{'homer2'} == 0);
		} elsif ($arg eq '-olen' ) {
			$cmd->{'olen'} = $nextValue->();
			print STDERR "\tWill normalize background oligos for oligos of length $cmd->{'olen'} bp\n";
			print STDERR "\t\tWill only work with homer2 (i.e. use -homer2)\n" if ($cmd->{'homer2'} == 0);
		} elsif ($arg eq '-nlen' ) {
			$cmd->{'nlen'} = $nextValue->();
			print STDERR "\tWill normalize background sequences for oligos of length $cmd->{'nlen'} bp\n";
			print STDERR "\t\tWill only work with homer2 (i.e. use -homer2)\n" if ($cmd->{'homer2'} == 0);
		} elsif ($arg eq '-e' ) {
			$cmd->{'expect'} = $nextValue->();
		} elsif ($arg eq '-min' ) {
			$cmd->{'minLen'} = $nextValue->();
		} elsif ($arg eq '-max' ) {
			$cmd->{'maxLen'} = $nextValue->();
		} elsif ($arg eq '-nowarn' ) {
			$cmd->{'nowarn'} = 1; 
			print STDERR "\tWill not stop program on warnings\n";
		} elsif ($arg eq '-rand' ) {
			$cmd->{'rand'} = 1; 
			print STDERR "\tWill randomize sequence labels (i.e. find motifs in random dataset)\n";
		} elsif ($arg eq '-nogo' ) {
			$cmd->{'nogo'} = 1; 
			print STDERR "\tWill Skip Gene Ontology Analysis\n";
		} elsif ($arg eq '-mask' ) {
			$cmd->{'nomask'} = 0; 
			$cmd->{'mask'} = 1;
			print STDERR "\tUsing repeat masked sequences\n";
		} elsif ($arg eq '-nomask' ) {
			# NOTE(review): 'mask' is left at 1 here, unlike the -mask branch
			# which sets both keys -- confirm this is intended.
			$cmd->{'nomask'} = 1; 
			print STDERR "\tUsing non-repeat masked sequences\n";
		} elsif ($arg eq '-noconvert' ) {
			$cmd->{'noconv'} = 1; 
			print STDERR "\tWill not convert ids (Input files better be Unigene...)\n";
		} elsif ($arg eq '-b' ) {
			$cmd->{'alg'} = ' -alg binomial '; 
			print STDERR "\tUsing Binomial Distribution for p-values (instead of hypergeometric)\n";
		} elsif ($arg eq '-mis' ) {
			$cmd->{'mis'} = $nextValue->();
			print STDERR "\tWill search for strings with $cmd->{'mis'} mismatches\n";
		} elsif ($arg eq '-S' ) {
			$cmd->{'S'} = $nextValue->();
			print STDERR "\tWill optimize $cmd->{'S'} putative motifs\n";
		} elsif ($arg eq '-fasta' || $arg eq '-fastaBg') {
			$cmd->{'fasta'} = $nextValue->();
			print STDERR "\tWill use FASTA files for motif finding\n";
			print STDERR "\t\tTarget Sequences = $cmd->{'fg'}\n";
			print STDERR "\t\tBackground Sequences = $cmd->{'fasta'}\n";
		} elsif ($arg eq '-start' ) {
			$cmd->{'start'} = $nextValue->();
			print STDERR "\tNew start is $cmd->{'start'} relative to the TSS\n";
		} elsif ($arg eq '-find' ) {
			$cmd->{'find'} = $nextValue->();
			print STDERR "\tWill find motif(s) in $cmd->{'find'}\n";
		} elsif ($arg eq '-end' ) {
			$cmd->{'end'} = $nextValue->();
			print STDERR "\tNew end is $cmd->{'end'} relative to the TSS\n";
		} elsif ($arg eq '-nocheck' ) {
			$cmd->{'checkFlag'} = 0;
			print STDERR "\tWill not check for similarity between de novo and known motifs\n";
		} elsif ($arg eq '-mknown' ) {
			$cmd->{'mknown'} = $nextValue->();
			print STDERR "\tKnown motif file set to $cmd->{'mknown'} (for known motif enrichment)\n";
		} elsif ($arg eq '-mcheck' ) {
			$cmd->{'mcheck'} = $nextValue->();
			print STDERR "\tKnown motif file set to $cmd->{'mcheck'} (for checking de novo motifs)\n";
		} else {
			print STDERR "!! $arg is not recognized option!\n";
			printCMD();
		}
	}	
	# A trailing "r" on the genome name is stripped and turns masking on.
	if ($cmd->{'genome'} =~ s/r$//) {
		$cmd->{'mask'} = 1;
	}

	# Default motif lengths when -len was not given.
	if (@{$cmd->{'len'}} < 1) {
		push(@{$cmd->{'len'}}, 8);
		push(@{$cmd->{'len'}}, 10);
		push(@{$cmd->{'len'}}, 12);
	}

	return $cmd;

}

# When homer2 is selected, rewrite the statistic setting into its " -stat ..."
# spelling: the binomial choice is translated and an unset value becomes the
# hypergeometric one.  Any other value is left untouched.
if ($cmd->{'homer2'} == 1) {
	my %statFor = (
		' -alg binomial ' => " -stat binomial ",
		''                => " -stat hypergeo ",
	);
	my $current = $cmd->{'alg'};
	$cmd->{'alg'} = $statFor{$current} if (exists($statFor{$current}));
}


# Paths of every file the run may create, all inside the output directory.
# Temporary files carry a random per-run prefix ($tmpID).
my $tmpID = rand();

# ID lists and group files
my $ugFgFile = "$cmd->{'output'}/targetIDs.ug.txt";
my $targetGroupFile = "$cmd->{'output'}/targetIDs.group.txt";
my $ugBgFile = "$cmd->{'output'}/backgroundIDs.ug.txt";
my $ugGroupFile = "$cmd->{'output'}/group.ug.txt";
my $randGroupFile = "$cmd->{'output'}/group.rand.txt";

# per-run temporary files
my $redunFile = "$cmd->{'output'}/$tmpID.redun.tmp";
my $findFile = "$cmd->{'output'}/$tmpID.find.tmp";
my $findFile2 = "$cmd->{'output'}/$tmpID.find2.tmp";
my $findFile3 = "$cmd->{'output'}/$tmpID.find3.tmp";
my $tmpSeq1 = "$cmd->{'output'}/$tmpID.seq1.tmp";
my $tmpSeq2 = "$cmd->{'output'}/$tmpID.seq2.tmp";
my $tmpSeq3 = "$cmd->{'output'}/$tmpID.seq3.tmp";
# (these three have no dot between the prefix and the suffix)
my $tmpRedunDatFile = "$cmd->{'output'}/${tmpID}redun.tmp";
my $tmpCGFreq = "$cmd->{'output'}/${tmpID}cgfreq.tmp";
my $tmpCGBins = "$cmd->{'output'}/${tmpID}cgbins.tmp";

# fixed-name working and result files
my $adjFile = "$cmd->{'output'}/group.adj";
my $seqFile = "$cmd->{'output'}/seq.tsv";
my $noutFile = "$cmd->{'output'}/seq.autonorm.tsv";
my $knownFile = "$cmd->{'output'}/knownResults.html";
my $motifInfoFile = "$cmd->{'output'}/$motifInfoFileName";
my $enhancerSeq = "$cmd->{'output'}/enhancer.sequence.txt";
my $enhancerConvFile = "$cmd->{'output'}/enhancer.conv.txt";
my $enhancerCGBins = "$cmd->{'output'}/enhancer.cgbins";
my $targetFastaDump = "$cmd->{'output'}/target.fa";
my $backgroundFastaDump = "$cmd->{'output'}/background.fa";
my $tmpFile1 = "$cmd->{'output'}/$tmpID.1.tmp";
my $tmpFile2 = "$cmd->{'output'}/$tmpID.2.tmp";
my $tmpFile3 = "$cmd->{'output'}/$tmpID.3.tmp";
my $scrambledFasta = "$cmd->{'output'}/scrambleBg.fasta";


# Stop early when checkFile() reports less than one for the input file, then
# make sure the output directory exists.
my $numLines = checkFile($cmd->{'fg'});
if ($numLines < 1) {
	print STDERR "There is no data in your input file ($cmd->{'fg'})\n";
	exit;
}
qx(mkdir -p "$cmd->{'output'}");

# Promoter set validation.  "fasta"/"FASTA" selects FASTA mode, which needs a
# background: a placeholder when only scanning (-find), otherwise a scrambled
# copy of the input.  Any other name must be a configured promoter set unless
# a FASTA background was supplied.
if ($cmd->{'promoters'} =~ /\A(?:FASTA|fasta)\z/) {
	my $needsBackground = ($cmd->{'fasta'} eq '');
	if ($needsBackground && $cmd->{'find'}) {
		$cmd->{'fasta'} = 'placeholder';
	} elsif ($needsBackground) {
		print STDERR "\n\t!Warning - no background FASTA file specified (Highly recommended)\n";
		print STDERR "\t!Your input sequences will be randomized to serve as a background instead.\n\n";
		qx(scrambleFasta.pl "$cmd->{'fg'}" > "$scrambledFasta");
		$cmd->{'fasta'} = $scrambledFasta;
	}
} elsif (!exists($config->{'PROMOTERS'}->{$cmd->{'promoters'}}) && $cmd->{'fasta'} eq '') {
	print STDERR "\n!!! $cmd->{'promoters'} not found in $homeDir/config.txt\n";
	print STDERR "\tTry typing \"perl $homeDir/configureHomer.pl -list\" to see available promoter sets\n";
	print STDERR "\tIf avaliable, type \"perl $homeDir/configureHomer.pl -install $cmd->{'promoters'}\" to install\n";
	exit;
}

# Copy the configured promoter set's properties into the run state; without a
# configured set the organism is recorded as 'null'.
my $promoterSeqOffset = 0;
if (exists($config->{'PROMOTERS'}->{$cmd->{'promoters'}})) {
	my $set = $config->{'PROMOTERS'}->{$cmd->{'promoters'}};
	$cmd->{'org'} = $set->{'org'};
	$seqDir = $set->{'directory'} . "/";
	$maxTSSDist = $set->{'end'};
	$idtype = $set->{'idtype'};
	$promoterSeqOffset = $set->{'start'};
} else {
	$cmd->{'org'} = 'null';
}

# Custom gene IDs are assumed for GO analysis unless the organism is
# configured and its third parameter is something other than 'custom'.
my $customGOID = 1;
if (exists($config->{'ORGANISMS'}->{$cmd->{'org'}})) {
	my $organism = $config->{'ORGANISMS'}->{$cmd->{'org'}};
	my $usesCustomIDs = exists($organism->{'parameters'})
			&& scalar(@{$organism->{'parameters'}}) > 2
			&& $organism->{'parameters'}->[2] eq 'custom';
	$customGOID = $usesCustomIDs ? 1 : 0;
}
print STDERR "\tUsing custom gene IDs for GO analysis\n" if ($customGOID);

# Enhancer mode: locate the genome, either from the configuration or by
# handing an unconfigured genome argument to HomerConfig::parseCustomGenome.
my $customGenome = "";
if ($cmd->{'enhancers'} ne '') {
	if (exists($config->{'GENOMES'}->{$cmd->{'genome'}})) {
		$genomeDir = $config->{'GENOMES'}->{$cmd->{'genome'}}->{'directory'} . "/";
		$preparsedDir = $genomeDir . "preparsed/";
	} else {
		$customGenome = $cmd->{'genome'};
		($cmd->{'genome'},$genomeDir,$preparsedDir) = HomerConfig::parseCustomGenome($cmd->{'genome'});
	}
}

# Motif collections: values returned by HomerConfig::checkMSet only fill in
# the settings the user did not provide explicitly.
my ($msetCheck, $msetKnown) = HomerConfig::checkMSet($cmd->{'mset'}, $cmd->{'org'});
if ($cmd->{'mcheck'} eq '') {
	$cmd->{'mcheck'} = $msetCheck;
}
if ($cmd->{'mknown'} eq '') {
	$cmd->{'mknown'} = $msetKnown;
}



# Record the command-line arguments in the run's parameter file as a single
# "cmd = arg1 arg2 ..." line.  A failure to write is reported but does not
# stop the run (previously the failed open went unnoticed).
if (open(my $infoFh, '>', $motifInfoFile)) {
	print $infoFh "cmd =" . join('', map { " $_" } @ARGV) . "\n";
	close($infoFh);
} else {
	print STDERR "\t!Warning - could not write $motifInfoFile: $!\n";
}

# Enhancer mode (skipped when the genome argument is 'fasta'/'FASTA'):
# run the enhancer peak file through a chain of HOMER helper programs to
# produce $enhancerSeq, $enhancerCGBins and $enhancerConvFile.
# $tmpSeq1 and $tmpSeq2 alternate as input/output of the steps, so the
# commands must run in exactly this order.
if ($cmd->{'enhancers'} ne '') {
	if ($cmd->{'genome'} ne 'fasta' && $cmd->{'genome'} ne 'FASTA') {
		# Extra flag for "homerTools extract" when masking is enabled.
		my $mflag = "";
		if ($cmd->{'mask'}) {
			$mflag = " -mask ";
		}
		# Peak file -> checked/cleaned position file.
		`bed2pos.pl "$cmd->{'enhancers'}" -check > "$tmpSeq1"`;
		`checkPeakFile.pl "$tmpSeq1"`;
		`cleanUpPeakFile.pl "$tmpSeq1" > "$tmpSeq2"`;
		# Pull the sequences out of the genome directory, then clean them up.
		`homerTools extract "$tmpSeq2" "$genomeDir" $mflag > "$tmpSeq1"`;
		`cleanUpSequences.pl "$tmpSeq1" > "$tmpSeq2"`;
		`removePoorSeq.pl "$tmpSeq2" > "$enhancerSeq"`;
		# The file named after -gc ($tmpSeq2) is the one freq2group.pl reads next.
		`homerTools freq -gc "$tmpSeq2" "$enhancerSeq" > "$tmpSeq1"`;
		# Column handed to freq2group.pl: 2 when normalizing by GC%, else 1.
		my $col = 1;
		$col = 2 if ($cmd->{'gc'} == 1);
		`freq2group.pl "$tmpSeq2" $col > "$enhancerCGBins"`;
		
		# First column of the bins file is the list of enhancer IDs.
		`cut -f1 "$enhancerCGBins" > "$tmpSeq1"`; 	
		# Build the two-column conversion file: the ID column is duplicated
		# when no conversion applies, otherwise convertIDs.pl supplies it.
		if ($cmd->{'noconv'} == 1 || $idtype eq 'null' || $idtype eq 'custom') {
			`duplicateCol.pl "$tmpSeq1" 0 | cut -f1,2 > "$enhancerConvFile"`;
		} else {
			`convertIDs.pl "$tmpSeq1" "$cmd->{'org'}" $idtype no yes | cut -f1,2 > "$enhancerConvFile"`;
		}
		`rm -f "$tmpSeq1" "$tmpSeq2"`;
		# Register the generated files in %toDelete.
		$toDelete{$enhancerSeq} = 1;
		$toDelete{$enhancerConvFile} = 1;
		$toDelete{$enhancerCGBins} = 1;
	}
}

###########################################################
# Group-file input (-g): getPos.pl writes the target list to
# $targetGroupFile, the original file becomes the background, and the input
# count is refreshed for the new target file.
if ($cmd->{'groupFlag'}) {
	my $groupInput = $cmd->{'fg'};
	qx(getPos.pl "$groupInput" > "$targetGroupFile");
	@{$cmd}{'bg', 'fg'} = ($groupInput, $targetGroupFile);
	$numLines = checkFile($targetGroupFile);
}

# Steps 1 and 2 of the run.
#
# Promoter-set mode (no FASTA background, not -reuse):
#   Step1 converts (or copies) the target/background ID lists; with -peaks it
#         then prints the target promoters as a peak file and stops.
#   Step2 cuts the requested window out of the promoter sequences into $seqFile.
# FASTA mode:
#   parses the target FASTA file; with -find it only scans for the given
#   motifs and stops, otherwise it builds $seqFile and $ugGroupFile from the
#   target and background sequences.
if ($cmd->{'reuse'} == 0 && $cmd->{'fasta'} eq '') {
	print STDERR "\n\tProgress: Step1 - Convert input file to $idtype IDs\n";
	if ($cmd->{'noconv'} == 1) {
		print STDERR "\tskipping - file already converted\n";
		`cp "$cmd->{'fg'}" "$ugFgFile"`;
		if ($cmd->{'bg'} ne '') {
			`cp "$cmd->{'bg'}" "$ugBgFile"`;
			$toDelete{$ugBgFile}=1;
		}
	} else {
		if ($idtype eq 'null' || $idtype eq 'custom') {
			`cp "$cmd->{'fg'}" "$ugFgFile"`;
			if ($cmd->{'bg'} ne '') {
				`cp "$cmd->{'bg'}" "$ugBgFile"`;
				$toDelete{$ugBgFile}=1;
			}
		} else {
			#print STDERR "`convertIDs.pl $cmd->{'fg'} $cmd->{'org'} $idtype > $ugFgFile`;\n";
			`convertIDs.pl "$cmd->{'fg'}" "$cmd->{'org'}" $idtype > "$ugFgFile"`;
			if ($cmd->{'bg'} ne '') {
				`convertIDs.pl "$cmd->{'bg'}" "$cmd->{'org'}" $idtype > "$ugBgFile"`;
				$toDelete{$ugBgFile}=1;
			}
		}
	}
	$toDelete{$ugFgFile}=1;

	my $newNumLines = checkFile($ugFgFile);
	# $numLines can be 0 when a group file (-g) yields no targets; treat that
	# as 0% converted instead of dividing by zero.
	my $convPercent = ($numLines > 0) ? $newNumLines/$numLines : 0;
	my $zzz = sprintf("%.1f",$convPercent*100);
	print STDERR "\tPercentage of IDs converted into $idtype: $zzz"."% ($newNumLines out of $numLines)\n";
	if ($convPercent < 0.05) {
		print STDERR "!!!! Homer converted less than 5% of the rows in your file.\n";
		print STDERR "!!!! Check to be sure the input file has valid gene identifiers\n";
		cleanUpAndExit() if ($cmd->{'nowarn'} == 0);
	}

	if ($cmd->{'peaks'} == 1) {
		my $promoterFile = $seqDir . $cmd->{'promoters'} . ".pos";
		# Collect the converted target IDs (first column).
		my %ids = ();
		if (open(my $idFh, '<', $ugFgFile)) {
			while (my $row = <$idFh>) {
				chomp $row;
				$row =~ s/\r//g;
				my @line = split /\t/, $row;
				$ids{$line[0]} =1;
			}
			close($idFh);
		} else {
			print STDERR "!!! Could not open $ugFgFile: $!\n";
		}
		# Print one peak line for every promoter whose ID is a target.
		if (open(my $posFh, '<', $promoterFile)) {
			while (my $row = <$posFh>) {
				chomp $row;
				$row =~ s/\r//g;
				my @line= split /\t/, $row;
				next if (!exists($ids{$line[0]}));
				my $start = $line[2]-$promoterSeqOffset-150;
				my $end = $line[2]-$promoterSeqOffset+50;
				# Column 4 equal to 1: derive the window from column 3 instead.
				if ($line[4] == 1) {
					$start = $line[3]+$promoterSeqOffset-50;
					$end = $line[3]+$promoterSeqOffset+150;
				}
				print "$line[0]\t$line[1]\t$start\t$end\t$line[4]\t$line[5]\n";
			}
			close($posFh);
		} else {
			print STDERR "!!! Could not open $promoterFile: $!\n";
		}
		cleanUpAndExit();
	}

	############################################################
	print STDERR "\n\tProgress: Step2 - prepare sequence files\n";

	# Masked sequences by default, unmasked with -nomask.
	my $startFile = $seqDir . $cmd->{'promoters'} . ".mask";
	if ($cmd->{'nomask'}) {
		$startFile = $seqDir . $cmd->{'promoters'} . ".seq";
	}
	if ($cmd->{'rnaMode'} == 0) {
		my $start = $cmd->{'start'};
		my $end = $cmd->{'end'};
		# Both offsets must lie within +/- the promoter set's limit.
		if ($start < -1*$maxTSSDist || $start > $maxTSSDist) {
			print STDERR "Sequence start = $start is out of range (+/-$maxTSSDist)\n";
			exit;
		} 
		if ($end < -1*$maxTSSDist || $end > $maxTSSDist) {
			print STDERR "Sequence end = $end is out of range (+/-$maxTSSDist)\n";
			exit;
		}
		# The window must be at least as long as the longest motif length.
		if ($end < $start || $end-$start < $cmd->{'maxlen'}) {
			print STDERR "Start and End values are too close!\n"; 
			exit;
		}
		`getPartOfPromoter.pl "$startFile" $start $end $promoterSeqOffset > "$tmpSeq1"`;
		`cleanUpSequences.pl "$tmpSeq1" -min $cmd->{'minLen'} -max $cmd->{'maxLen'} > "$seqFile"`;
		`rm "$tmpSeq1"`;
	} else {
		`cleanUpSequences.pl "$startFile" -min $cmd->{'minLen'} -max $cmd->{'maxLen'} > "$seqFile"`;
	}

	$toDelete{$seqFile}=1;

} elsif ($cmd->{'fasta'} ne '') {

	#############################################################
	# prepare and parse fasta files
	print STDERR "\tParsing FASTA format files...\n";
	`fasta2tab.pl "$cmd->{'fg'}" > "$tmpSeq1"`;
	`cleanUpSequences.pl "$tmpSeq1" -min $cmd->{'minLen'} -max $cmd->{'maxLen'} > "$tmpSeq2"`;
	`cleanUpPeakFile.pl "$tmpSeq2" > "$tmpSeq1"`;
	#`mv "$tmpSeq2" "$tmpSeq1"`;
	if ($cmd->{'find'} ne '') {
		# Scan-only mode: run the motif scan, echo its output to STDOUT, stop.
		my $options = '';

		if ($cmd->{'homer2'}) {
			if ($cmd->{'norevopp'} == 1) {
				$options .= " -strand +";
			}
			`homer2 find -s "$tmpSeq1" $options -m "$cmd->{'find'}" > "$findFile"`;
			print "FASTA ID\tOffset\tSequence\tMotif Name\tStrand\tMotifScore\n";
		} else {
			if ($cmd->{'norevopp'} == 1) {
				$options .= " -norevopp";
			}
			`homer -s "$tmpSeq1" $options -a FIND -m "$cmd->{'find'}" > "$findFile"`;
			print "FASTA ID\tOffset\tSequence\tConservation\tStrand\tMotif Name\tMotifScore\n";
		}

		if (open(my $findFh, '<', $findFile)) {
			while (my $row = <$findFh>) {
				print $row;
			}
			close($findFh);
		} else {
			print STDERR "!!! Could not open $findFile: $!\n";
		}
		`rm "$tmpSeq1" "$findFile"`;
		exit;
	}
	# Background sequences, optionally chopped to the size of the targets.
	`fasta2tab.pl "$cmd->{'fasta'}" > "$tmpSeq2"`;
	if ($cmd->{'chopify'}==1) {
		`chopUpBackground.pl "$tmpSeq1" "$tmpSeq2" > "$tmpSeq3"`;
		`mv "$tmpSeq3" "$tmpSeq2"`;
	}
	`cleanUpSequences.pl "$tmpSeq2" > "$tmpSeq3"`;
	`cleanUpPeakFile.pl "$tmpSeq3" BG > "$tmpSeq2"`;

	# Targets + background form the sequence file; the group file is built
	# from that file and the target list.
	`cat "$tmpSeq1" "$tmpSeq2" > "$seqFile"`;
	`makeBinaryFile.pl "$seqFile" "$tmpSeq1" > "$ugGroupFile"`;
	`rm -f "$tmpSeq1" "$tmpSeq2" "$tmpSeq3"`;

	$toDelete{$seqFile}=1;
	$toDelete{$ugGroupFile}=1;
}

#################################################
# Step 3 (promoter-set mode only): build the group file from the target IDs
# and a background list -- the user's background when one was given,
# otherwise the promoter set's ".base" file.
if ($cmd->{'find'} eq '' && $cmd->{'reuse'} == 0 && $cmd->{'fasta'} eq '') {
	print STDERR "\n\tProgress: Step3 - creating foreground/background file\n";
	if ($cmd->{'g'} == 1) {
		qx(cp "$ugFgFile" "$ugGroupFile");
	} else {
		my $backgroundIDs = ($cmd->{'bg'} ne '')
				? $ugBgFile
				: $seqDir . $cmd->{'promoters'} . '.base';
		qx(makeBinaryFile.pl "$backgroundIDs" "$ugFgFile" > "$ugGroupFile");
	}
	$toDelete{$ugGroupFile}=1;
}
##############################################################
# -rand: replace the group file with the output of randomizeGroupFile.pl.
unless ($cmd->{'rand'} eq '0') {
	print STDERR "Randomizing Group File!!\n";
	qx(randomizeGroupFile.pl "$ugGroupFile" > "$randGroupFile");
	qx(mv "$randGroupFile" "$ugGroupFile");
}

# Steps 4 and 5: derive $redunFile from the group file (redundancy adjustment),
# optionally dump FASTA files and merge enhancer sequences, then write the
# weighted group file $adjFile used by the motif-finding steps.
# Skipped in -find mode and when reusing earlier results.
if ($cmd->{'find'} eq '' && $cmd->{'reuse'} == 0) {
	###########################################################
	print STDERR "\n\tProgress: Step4 - removing redundant promoters\n";
	if ($cmd->{'redundant'} > 1.99) {
		print STDERR "\tskipping...\n";
		`cp "$ugGroupFile" "$redunFile"`;
	} else {
		if ($cmd->{'fasta'} ne '') {
			`cp "$ugGroupFile" "$redunFile"`;
			# need to calculate redundant sequences
			#`findRedundantBLAT.pl "$seqFile" $cmd->{'redundant'} > "$tmpRedunDatFile"`;
			#`adjustRedunGroupFile.pl "$ugGroupFile" "$tmpRedunDatFile" > "$redunFile"`;
			#`rm "$tmpRedunDatFile"`;
		} else {
			my $redunDatFile = $seqDir . $cmd->{'promoters'} . ".redun";
			#print STDERR "`adjustRedunGroupFile.pl $ugGroupFile $redunDatFile > $redunFile`\n";
			`adjustRedunGroupFile.pl "$ugGroupFile" "$redunDatFile" > "$redunFile"`;
		}
	}
	$toDelete{$redunFile}=1;
	if ($cmd->{'dumpFasta'} == 1) {
		print STDERR "\tDumping FASTA files of target and background sequences...\n";
		`getPos.pl "$redunFile" > "$tmpFile1"`;
		`filterListBy.pl "$seqFile" "$tmpFile1" 0 1 > "$tmpFile2"`;
		`tab2fasta.pl "$tmpFile2" > "$targetFastaDump"`;

		`getPos.pl "$redunFile" 1 > "$tmpFile1"`;
		`filterListBy.pl "$seqFile" "$tmpFile1" 0 1 > "$tmpFile2"`;
		`tab2fasta.pl "$tmpFile2" > "$backgroundFastaDump"`;

		`rm "$tmpFile1" "$tmpFile2"`;
	}


	if ($cmd->{'enhancers'} ne '') {
		# Append the enhancer sequences to the sequence file.
		`cat "$seqFile" "$enhancerSeq" > "$tmpSeq1"`;
		`mv "$tmpSeq1" "$seqFile"`;

		# Column 1 -> column 2 of the group file (ID -> group value).
		my %groups = ();
		if (open(my $groupFh, '<', $ugGroupFile)) {
			while (my $row = <$groupFh>) {
				chomp $row;
				$row =~ s/\r//g;
				my @line = split /\t/, $row;
				$groups{$line[0]} = $line[1];
			}
			close $groupFh;
		} else {
			print STDERR "!!! Could not open group file $ugGroupFile: $!\n";
		}

		# With 'enhancersOnly' the redundancy file is rewritten from scratch,
		# otherwise the enhancer entries are appended to it.
		my $redunMode = $cmd->{'enhancersOnly'} ? '>' : '>>';
		if (open(my $redunFh, $redunMode, $redunFile)) {
			if (open(my $convFh, '<', $enhancerConvFile)) {
				while (my $row = <$convFh>) {
					chomp $row;
					$row =~ s/\r//g;
					my @line = split /\t/, $row;
					# Column 1 is looked up in the group file; column 2 is
					# written out with the group value of column 1.
					if (exists($groups{$line[0]})) {
						print $redunFh "$line[1]\t$groups{$line[0]}\n";
					}
				}
				close $convFh;
			} else {
				print STDERR "!!! Could not open enhancer conversion file $enhancerConvFile: $!\n";
			}
			# Close explicitly so the entries are on disk (and write errors
			# are reported) before external programs read $redunFile.
			close($redunFh)
				or print STDERR "!!! Error writing $redunFile: $!\n";
		} else {
			print STDERR "!!! Could not open $redunFile for writing: $!\n";
		}
	}

	############################################################
	print STDERR "\n\tProgress: Step5 - adjusting background sequences for GC/CpG content...\n";
	if ($cmd->{'noweight'} == 1) {
		print STDERR "\tskipping...\n";
		`cp "$redunFile" "$adjFile"`;
	} else {
		if ($cmd->{'rnaMode'} == 0) {
			if ($cmd->{'fasta'} eq '') {
				# Precomputed bin files shipped with the promoter set:
				# ".cgbins" by default, ".gcbins" when 'gc' is set.
				my $weightFile = $seqDir . $cmd->{'promoters'} . ".cgbins";
				if ($cmd->{'gc'} == 1) {
					$weightFile = $seqDir . $cmd->{'promoters'} . ".gcbins";
				}
				if ($cmd->{'enhancers'} ne '') {
					`cat "$weightFile" "$enhancerCGBins" > "$tmpSeq1"`;
					`assignGeneWeights.pl "$redunFile" "$tmpSeq1" > "$adjFile"`;
					`rm "$tmpSeq1"`;
				} else {
					`assignGeneWeights.pl "$redunFile" "$weightFile" > "$adjFile"`;
				}
			} else {
				# FASTA input: compute the frequencies from the sequences
				# themselves and bin on column 1 (or column 2 with 'gc').
				`homerTools freq "$seqFile" -gc "$tmpCGFreq" > /dev/null`;
				my $col = 1;
				if ($cmd->{'gc'} == 1) {
					$col = 2;
				}
				`freq2group.pl "$tmpCGFreq" $col > "$tmpCGBins"`;
				`assignGeneWeights.pl "$redunFile" "$tmpCGBins" > "$adjFile"`;
				`rm "$tmpCGFreq" "$tmpCGBins"`;
			}
		} else {
			`cp "$redunFile" "$adjFile"`;
		}

		if ($cmd->{'nlen'} > 0) {
			print STDERR "\n\tNormalizing lower order oligos using homer2\n";
			my $options = "";
			$options .= " -strand + " if ($cmd->{'norevopp'} == 1);
			$options .= " -nmax $cmd->{'nmax'}";
			$options .= $cmd->{'neutral'};
			#print STDERR "`homer2 norm -g $adjFile -s $seqFile -nlen $cmd->{'nlen'} $options -nout $noutFile > $tmpCGFreq`;\n";
			`homer2 norm -g "$adjFile" -s "$seqFile" -nlen $cmd->{'nlen'} $options -nout "$noutFile" > "$tmpCGFreq"`;
			`mv "$tmpCGFreq" "$adjFile"`;
		}
	}
	$toDelete{$adjFile}=1;
}
# Main analysis branch (no -find motif given): GO enrichment, optional motif
# masking, known-motif enrichment and de novo motif finding.  The else branch
# handles -find mode and prints the motif instances to STDOUT.
if ($cmd->{'find'} eq '') {

	##########################################################
	print STDERR "\n\tProgress: Step6 - Gene Ontology Enrichment Analysis\n";
	if ($cmd->{'nogo'} == 1 || $cmd->{'fasta'} ne '') {
		print STDERR "\tSkipping...\n";
	} else {
		my $options = " -cpu $cmd->{'cpus'}";
		if ($cmd->{'bg'} ne '') {
			$options .= " -bg \"$cmd->{'bg'}\"";
		}
		if ($cmd->{'humanGO'} == 1) {
			$options .= " -human ";
		}
		#if ($cmd->{'noconv'} == 1 || $idtype eq 'null' || $idtype eq 'custom' || $idtype eq '') {
		if ($cmd->{'noconv'} == 1 || $customGOID) {
			$options .= " -customID ";
		}
		if (@{$cmd->{'ontology'}} > 0) {
			$options .= " -ontology";
			foreach (@{$cmd->{'ontology'}}) {
				$options .= " \"$_\"";
			}
		}

		#print STDERR "`findGO.pl $cmd->{'fg'} $cmd->{'org'} $cmd->{'output'} $options`;\n";

		`findGO.pl "$cmd->{'fg'}" "$cmd->{'org'}" "$cmd->{'output'}" $options`;
	}

	####################################################
	if (scalar(@{$cmd->{'motifMask'}}) > 0) {
		print STDERR "Masking given motifs...\n";
		# Concatenate all mask motif files into one temporary motif file.
		my $files = '';
		foreach(@{$cmd->{'motifMask'}}) {
			$files .= " \"$_\"";
		}
		`cat $files > "$tmpSeq1"`;

		if ($cmd->{'homer2'}) {
			my $options = "";
			$options .= " -strand + " if ($cmd->{'norevopp'} == 1);
			`homer2 mask -s "$seqFile" -m "$tmpSeq1" $options > "$tmpSeq2"`;
		} else {
			`homer -s "$seqFile" -m "$tmpSeq1" -a REMOVE > "$tmpSeq2"`;
		}
		`mv "$tmpSeq2" "$seqFile"`;
		`rm "$tmpSeq1"`;
	}

	##########################################################
	print STDERR "\n\tProgress: Step7 - Known motif enrichment\n";
	if ($cmd->{'noknown'} == 1 || $cmd->{'reuse'}==1 ) {
		print STDERR "\tSkipping...\n";
	} else {
		my $floatAction = "GETPVALUE";
		$floatAction = "OPTPVALUE" if ($cmd->{'float'} == 1);
		my $options = $cmd->{'seqlogo'};
		# NOTE(review): this assignment replaces (rather than appends to) the
		# seqlogo option when 'float' is set - confirm that is intended.
		$options = " -optimize" if ($floatAction eq 'OPTPVALUE');
		if ($cmd->{'homer2'}) {
			$options .= " -homer2 -p $cmd->{'cpus'}";
			$options .= " $cmd->{'alg'}";
		}
		`findKnownMotifs.pl -s "$seqFile" -g "$adjFile" -o "$cmd->{'output'}" -pvalue $knownPvalueThresh -m "$cmd->{'mknown'}" $options`;
	}

	##########################################################
	print STDERR "\n\tProgress: Step8 - De novo motif finding (HOMER)\n";
	if ($cmd->{'nomotif'} == 1) {
		print STDERR "\tSkipping...\n";
	} else {
		my $options = " -S $cmd->{'S'} ";
		$options .= $cmd->{'alg'};
		$options .= " -mis $cmd->{'mis'}";
		my $coptions = $cmd->{'alg'};
		my $cpuOptions = "";

		if ($cmd->{'homer2'}) {

			if (scalar(@{$cmd->{'motifOpt'}}) > 0) {
				print STDERR "\tOptimizing given motifs...\n";
				my $files = '';
				foreach(@{$cmd->{'motifOpt'}}) {
					$files .= " \"$_\"";
				}
				`cat $files > "$tmpSeq1"`;
				$options .= " -opt \"$tmpSeq1\" ";
			}

			if ($cmd->{'norevopp'} == 1) {
				$options .= " -strand + ";
				$coptions .= " -strand + ";
			}
			# Kept separate from $options so the forked randomization runs
			# below are started without the -p option.
			$cpuOptions .= " -p $cmd->{'cpus'} ";

			$options .= " -cache $cmd->{'cache'} ";
			$coptions .= " -cache $cmd->{'cache'} ";
			if ($cmd->{'quickMask'} > 0) {
				$options .= " -quickMask";
			}
			if ($cmd->{'expect'} > 0) {
				$options .= " -e $cmd->{'expect'} ";
			}
			$options .= " -minlp $cmd->{'minlp'}  ";
			$options .= " -o $cmd->{'olen'} " if ($cmd->{'olen'} > 0);
			foreach(@{$cmd->{'len'}}) {
				my $len = $_;
				my $outfile = " -o \"$cmd->{'output'}/homerMotifs.motifs$len\" ";
				`homer2 denovo -s "$seqFile" -g "$adjFile" $options $cpuOptions -len $len $outfile`;

				if ($cmd->{'fdr'} > 0) {
					`mkdir -p \"$cmd->{'output'}/randomizations/\"`;
					print STDERR "\tPerforming empirical FDR calculation for length $len (n=$cmd->{'fdr'})\n";

					my $realPvalues = readPvalues("$cmd->{'output'}/homerMotifs.motifs$len");
					my @randPvalues = ();
					my @randFiles = ();
					my $cpus = 0;	# number of forked children not yet reaped
					for (my $i=0;$i<$cmd->{'fdr'};$i++) {
						my $ii = $i+1;
						print STDERR "\t\t$ii of $cmd->{'fdr'}\n";
						my $outputfile = "$cmd->{'output'}/randomizations/homerMotifs.r$i.motifs$len";
						push(@randFiles,$outputfile);

						# One randomization: shuffle the group file, rerun
						# homer2 denovo on it, then remove the shuffled file.
						my $runRandomization = sub {
							my $randGroupFile = "$cmd->{'output'}/randomizations/rand$i.group";
							`randomizeGroupFile.pl "$adjFile" > "$randGroupFile"`;
							my $output = " -o \"$outputfile\" ";
							my $cmdStr = "homer2 denovo -s \"$seqFile\" -g \"$randGroupFile\" $options -len $len";
							$cmdStr .= $output;
							`$cmdStr 2> /dev/null`;
							`rm -f "$randGroupFile"`;
						};

						my $pid = fork();
						if (!defined($pid)) {
							# fork() returns undef on failure; without this check
							# the parent would take the child branch and exit.
							print STDERR "\t\t!!! fork failed ($!) - running randomization $ii without forking\n";
							$runRandomization->();
							next;
						}
						if ($pid == 0) {
							#child process
							$runRandomization->();
							exit(0);
						}
						$cpus++;
						# Throttle: once 'cpus' children are running, wait for one to finish.
						if ($cpus >= $cmd->{'cpus'}) {
							wait();
							$cpus--;
						}
					}
					# Reap every remaining child (wait() returns -1 when none are left).
					my $id = 0;
					while ($id >= 0) {
						$id = wait();
					}
					foreach(@randFiles) {
						my $file = $_;
						my $rPvalues = readPvalues($file);
						foreach(@$rPvalues) {
							push(@randPvalues, $_);
						}
					}
					my $fdrs = Statistics::empiricalFDR2($realPvalues,\@randPvalues, $cmd->{'fdr'});
					addFDR("$cmd->{'output'}/homerMotifs.motifs$len",$fdrs);

				}
			}
		} else {
			if ($cmd->{'noweight'} == 0) {
				$options .= ' -w ';
			}
			if ($cmd->{'norevopp'} == 1) {
				$options .= ' -norevopp';
			}
			$options .= " -o $cmd->{'output'}/homerMotifs ";
			$options .= " -branch $cmd->{'depth'}";
			foreach(@{$cmd->{'len'}}) {
				`homer -s "$seqFile" -g "$adjFile" $options -len $_ -a MOTIFS`;
			}
			$toDelete{".tmp.motifs"}=1;
			$toDelete{".mer.motifs"}=1;
		}


		if ($cmd->{'checkFlag'} == 1) {
			# Merge the per-length motif files.  The directory is passed inside
			# shell double quotes, so spaces need no extra escaping (a backslash
			# before a space stays literal inside double quotes and breaks the path).
			`cat "$cmd->{'output'}/homerMotifs.motifs"* > "$cmd->{'output'}/homerMotifs.all.motifs"`;
			#`homer2 known -s "$seqFile" -g "$adjFile" $coptions -m "$cmd->{'output'}/homerMotifs.all.approx.motifs" -siteReduce $cmd->{'percentSimilar'} -mout "$cmd->{'output'}/homerMotifs.all.motifs" -offset $cmd->{'start'} > /dev/null`;
			my $rnaOpt = "";
			$rnaOpt = " -rna " if ($cmd->{'rnaMode'});
			#print STDERR "`compareMotifs.pl $cmd->{'output'}/homerMotifs.all.motifs $cmd->{'output'}/ -reduceThresh $reduceThresh -matchThresh $matchThresh -known $cmd->{'mcheck'} BITS: $cmd->{'bits'} FACTS: $cmd->{'nofacts'} $rnaOpt `;\n";
			`compareMotifs.pl "$cmd->{'output'}/homerMotifs.all.motifs" "$cmd->{'output'}/" -reduceThresh $reduceThresh -matchThresh $matchThresh -known "$cmd->{'mcheck'}" $cmd->{'bits'} $cmd->{'nofacts'} -cpu $cmd->{'cpus'} $rnaOpt $cmd->{'seqlogo'}`;
		}
	}


	print STDERR "\tJob finished\n\n";

} else {
	# -find mode: locate instances of the given motif(s) and print an
	# annotated table to STDOUT.
	my $options = '';
	`makeBinaryFile.pl "$ugFgFile" "$ugFgFile" > "$findFile"`;
	`mv "$findFile" "$ugFgFile"`;
	if ($cmd->{'homer2'}) {
		$options .= " -strand +" if ($cmd->{'norevopp'} == 1);
		`homer2 find -s "$seqFile" -g "$ugFgFile" $options -m "$cmd->{'find'}" -offset $cmd->{'start'} > "$findFile"`;
		print "GeneID\tPromoterID\tOffset\tSequence\tMotif Name\tStrand\tMotifScore\tUnigene\tRefseq\tEnsembl\tName\tAlias\tOrf\tChr\tDescription\tType\n";
	} else {
		$options .= " -norevopp" if ($cmd->{'norevopp'} == 1);
		`homer -s "$seqFile" -g "$ugFgFile" $options -a FIND -m "$cmd->{'find'}" -offset $cmd->{'start'} > "$findFile"`;
		print "GeneID\tPromoterID\tOffset\tSequence\tConservation\tStrand\tMotif Name\tMotifScore\tUnigene\tRefseq\tEnsembl\tName\tAlias\tOrf\tChr\tDescription\tType\n";
	}
	`convertIDs.pl "$findFile" $cmd->{'org'} gene no yes > "$findFile2"`;
	`addData.pl "$findFile2" "$accDir/$cmd->{'org'}.description" > "$findFile"`;
	if (open(my $findFh, '<', $findFile)) {
		# Re-emit each row tab-separated with carriage returns removed.
		while (my $row = <$findFh>) {
			chomp $row;
			$row =~ s/\r//g;
			my @line = split /\t/, $row;
			print join("\t", @line), "\n";
		}
		close $findFh;
	} else {
		print STDERR "!!! Could not open $findFile: $!\n";
	}
	`rm "$findFile2" "$findFile"`;
}
cleanUpAndExit();
exit;

# Count the lines in a file.
#   $file - path of the file to read
# Returns the number of lines; dies when the file cannot be opened.
sub checkFile {
	my ($file) = @_;
	# Lexical handle + 3-arg open: the name is never parsed for a mode and
	# the handle cannot collide with other users of a global IN.
	open(my $fh, '<', $file) or die "!!!!\nCould not open file $file\n!!!!\n";
	my $c = 0;
	while (my $line = <$fh>) {
		$c++;
	}
	close $fh;
	return $c;
}

# Remove the temporary files registered in %toDelete (unless 'keepFiles' is
# set) and terminate the program.  Never returns.
sub cleanUpAndExit {
	if ($cmd->{'keepFiles'} == 0) {
		# unlink removes the files directly: no shell is spawned per file and
		# names containing quotes or '$' cannot be misread by a shell.
		# Entries that do not exist are skipped silently.
		unlink(keys %toDelete);
	}
	exit;
}
# Collect the 4th tab-separated column of every header line (lines starting
# with '>') in a motif file.
#   $motifFile - path of the motif file
# Returns an array reference with one entry per header line, in file order.
# An unreadable file yields an empty list (plus a message on STDERR).
sub readPvalues {
    my ($motifFile) = @_;
    my @values = ();
    # Lexical handle: does not disturb a global IN that a caller may be using.
    my $fh;
    unless (open($fh, '<', $motifFile)) {
        print STDERR "!!! Could not open motif file $motifFile: $!\n";
        return \@values;
    }
    while (my $row = <$fh>) {
        chomp $row;
        next unless ($row =~ /^>/);
        my @line = split /\t/, $row;
        push(@values, $line[3]);
    }
    close $fh;
    return \@values;
}
# Rewrite a motif file in place, appending ",FDR:<value>" to the 6th
# tab-separated column of every header line (lines starting with '>').
#   $mfile - path of the motif file to annotate
#   $fdrs  - array reference of FDR values, one per header line in file order
# Header lines with fewer than six columns and all non-header lines are
# copied through unchanged.  If the motif file cannot be read, or the
# temporary file cannot be written, the motif file is left untouched.
sub addFDR {
    my ($mfile, $fdrs) = @_;

    my $in;
    unless (open($in, '<', $mfile)) {
        print STDERR "!!! Could not open motif file $mfile: $!\n";
        return;
    }

    # Temporary file next to the target, so the final move stays inside
    # the same directory.
    my $tmp = "$mfile." . rand() . ".tmp";
    my $out;
    unless (open($out, '>', $tmp)) {
        print STDERR "!!! Could not open temporary file $tmp for writing: $!\n";
        close $in;
        return;
    }

    my $index = 0;
    while (my $row = <$in>) {
        if ($row =~ /^>/) {
            my $header = $row;
            chomp $header;
            my @line = split /\t/, $header;
            if (@line > 5) {
                # A missing FDR entry is written as an empty value.
                my $fdr = defined($fdrs->[$index]) ? $fdrs->[$index] : '';
                $line[5] .= ",FDR:$fdr";
            }
            print $out join("\t", @line), "\n";
            $index++;
        } else {
            print $out $row;
        }
    }
    close $in;
    unless (close($out)) {
        print STDERR "!!! Error writing temporary file $tmp: $!\n";
        unlink $tmp;
        return;
    }

    # File::Copy::move renames when possible and falls back to copy+delete.
    require File::Copy;
    unless (File::Copy::move($tmp, $mfile)) {
        print STDERR "!!! Could not replace $mfile with $tmp: $!\n";
        unlink $tmp;
    }
    return;
}

annotatePeaks.pl is from Homer
#!/usr/bin/env perl
use warnings;
use lib "/work/04237/lleblanc/HomerNov2018/.//bin";
my $homeDir = "/work/04237/lleblanc/HomerNov2018/./";


# Copyright 2009 - 2018 Christopher Benner <cbenner@ucsd.edu>
# 
# This file is part of HOMER
#
# HOMER is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# HOMER is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

use POSIX;
use HomerConfig;
use Statistics;


my $config = HomerConfig::loadConfigFile();

# Print the annotatePeaks.pl usage message to STDERR and exit.
# The configured genomes are read from the file-level $config hash and are
# listed in sorted order so the help output is stable between runs.
# Never returns.
sub printCMD {

	print STDERR "\n\tUsage: annotatePeaks.pl <peak file | tss> <genome version>  [additional options...]\n";
	print STDERR "\n\tAvailable Genomes (required argument): (name,org,directory,default promoter set)\n";
	foreach my $name (sort keys %{$config->{'GENOMES'}}) {
		my $info = $config->{'GENOMES'}->{$name};
		print STDERR "\t\t$name\t$info->{'org'}\t$info->{'directory'}"
					. "\t$info->{'promoters'}\n";
	}
	print STDERR "\t\t\t-- or --\n";
	print STDERR "\t\tCustom: provide the path to genome FASTA files (directory or single file)\n";
	print STDERR "\t\tIf no genome is available, specify 'none'.\n";
	print STDERR "\t\tIf using FASTA file or none, may want to specify '-organism <...>'\n";

	print STDERR "\n\tUser defined annotation files (default is UCSC refGene annotation):\n";
	print STDERR "\t\tannotatePeaks.pl accepts GTF (gene transfer formatted) files to annotate positions relative\n";
	print STDERR "\t\tto custom annotations, such as those from de novo transcript discovery or Gencode.\n";
	print STDERR "\t\t-gtf <gtf format file> (Use -gff and -gff3 if appropriate, but GTF is better)\n";
	print STDERR "\t\t-gid (by default the GTF file is processed by transcript_id, use this option for gene_id)\n";
	print STDERR "\t\t-ann <custom homer annotation file> (created by assignGenomeAnnotation, see website)\n";
	print STDERR "\n\tPeak vs. tss/tts/rna mode (works with custom GTF file):\n";
	print STDERR "\t\tIf the first argument is \"tss\" (i.e. annotatePeaks.pl tss hg18 ...) then a TSS centric\n";
	print STDERR "\t\tanalysis will be carried out.  Tag counts and motifs will be found relative to the TSS.\n";
	print STDERR "\t\t(no position file needed) [\"tts\" now works too - e.g. 3' end of gene]\n";
	print STDERR "\t\t[\"rna\" specifies gene bodies, will automatically set \"-size given\"]\n";
	print STDERR "\t\tNOTE: The default TSS peak size is 4000 bp, i.e. +/- 2kb (change with -size option)\n";
	print STDERR "\t\t-list <gene id list> (subset of genes to perform analysis [unigene, gene id, accession,\n";
	print STDERR "\t\t\t probe, etc.], default = all promoters)\n";

	#print STDERR "\t\t-TSS <promoter set> (promoter definitions, default=genome default)\n";
	print STDERR "\t\t-cTSS <promoter position file i.e. peak file> (should be centered on TSS)\n";

	#print STDERR "\n\tAvailable Promoter Sets (use with -TSS): (name,org,directory,genome,masked genome)\n";
	#foreach(keys %{$config->{'PROMOTERS'}}) {
	#	print STDERR "\t\t$_\t$config->{'PROMOTERS'}->{$_}->{'org'}\t$config->{'PROMOTERS'}->{$_}->{'directory'}"
	#				. "\t$config->{'PROMOTERS'}->{$_}->{'genome'}\t$config->{'PROMOTERS'}->{$_}->{'mgenome'}\n";
	#}

	print STDERR "\n\tPrimary Annotation Options:\n";
	print STDERR "\t\t-mask (Masked repeats, can also add 'r' to end of genome name)\n";
	print STDERR "\t\t-m <motif file 1> [motif file 2] ... (list of motifs to find in peaks)\n";
	print STDERR "\t\t\t-mscore (reports the highest log-odds score within the peak)\n";
	print STDERR "\t\t\t-nmotifs (reports the number of motifs per peak)\n";
	print STDERR "\t\t\t-mdist (reports distance to closest motif)\n";
	print STDERR "\t\t\t-mfasta <filename> (reports sites in a fasta file - for building new motifs)\n";
	print STDERR "\t\t\t-fm <motif file 1> [motif file 2] (list of motifs to filter from above)\n";
	print STDERR "\t\t\t-rmrevopp <#> (only count sites found within <#> on both strands once, i.e. palindromic)\n";
	print STDERR "\t\t\t-matrix <prefix> (outputs motif co-occurrence files:\n";
	print STDERR "\t\t\t\tprefix.count.matrix.txt - number of peaks with motif co-occurrence\n";
	print STDERR "\t\t\t\tprefix.ratio.matrix.txt - ratio of observed vs. expected  co-occurrence\n";
	print STDERR "\t\t\t\tprefix.logPvalue.matrix.txt - co-occurrence enrichment\n";
	print STDERR "\t\t\t\tprefix.stats.txt - table of pair-wise motif co-occurrence statistics\n";
	print STDERR "\t\t\t\tadditional options:\n";
	print STDERR "\t\t\t\t-matrixMinDist <#> (minimum distance between motif pairs - to avoid overlap, default: 4)\n";
	print STDERR "\t\t\t\t-matrixMaxDist <#> (maximum distance between motif pairs)\n";
	print STDERR "\t\t\t-mbed <filename> (Output motif positions to a BED file to load at UCSC (or -mpeak))\n";
	print STDERR "\t\t\t-mlogic <filename> (will output stats on common motif orientations)\n";
	print STDERR "\t\t-d <tag directory 1> [tag directory 2] ... (list of experiment directories to show\n";
	print STDERR "\t\t\ttag counts for) NOTE: -dfile <file> where file is a list of directories in first column\n";
	#print STDERR "\t\t-i <input tag directory 1> [input directory 2] ... (list of controls for log2 ratio cal)\n";
	#print STDERR "\t\t\t-pseudo <#> (pseudo count value to add, default: 5)\n";
	print STDERR "\t\t-bedGraph <bedGraph file 1> [bedGraph file 2] ... (read coverage counts from bedGraph files)\n";
	print STDERR "\t\t-wig <wiggle file 1> [wiggle file 2] ... (read coverage counts from wiggle files)\n";
	print STDERR "\t\t-p <peak file> [peak file 2] ... (to find nearest peaks)\n";
	print STDERR "\t\t\t-pdist to report only distance (-pdist2 gives directional distance)\n";
	print STDERR "\t\t\t-pcount to report number of peaks within region\n";
	#print STDERR "\t\t-se <super enhancer peaks> [...] (will report SE overlap and SE rank ordered from peak file)\n";
	print STDERR "\t\t-vcf <VCF file> (annotate peaks with genetic variation information, one col per individual)\n";
	# The two sub-options below were previously printed twice.
	print STDERR "\t\t\t-editDistance (Computes the # bp changes relative to reference)\n";
	print STDERR "\t\t\t-individuals <name1> [name2] ... (restrict analysis to these individuals)\n";
	print STDERR "\t\t-gene <data file> ... (Adds additional data to result based on the closest gene.\n";
	print STDERR "\t\t\tThis is useful for adding gene expression data.  The file must have a header,\n";
	print STDERR "\t\t\tand the first column must be a GeneID, Accession number, etc.  If the peak\n";
	print STDERR "\t\t\tcannot be mapped to data in the file then the entry will be left empty.\n";
	print STDERR "\t\t-go <output directory> (perform GO analysis using genes near peaks)\n";
	print STDERR "\t\t-genomeOntology <output directory> (perform genomeOntology analysis on peaks)\n";
	print STDERR "\t\t\t-gsize <#> (Genome size for genomeOntology analysis, default: 2e9)\n";

	print STDERR "\n\tAnnotation vs. Histogram mode:\n";
	print STDERR "\t\t-hist <bin size in bp> (i.e 1, 2, 5, 10, 20, 50, 100 etc.)\n";
	print STDERR "\t\tThe -hist option can be used to generate histograms of position dependent features relative\n";
	print STDERR "\t\tto the center of peaks.  This is primarily meant to be used with -d and -m options to map\n";
	print STDERR "\t\tdistribution of motifs and ChIP-Seq tags.  For ChIP-Seq peaks for a Transcription factor\n";
	print STDERR "\t\tyou might want to use the -center option (below) to center peaks on the known motif\n";
	print STDERR "\t\t** If using \"-size given\", histogram will be scaled to each region (i.e. 0-100%), with\n";
	print STDERR "\t\tthe -hist parameter being the number of bins to divide each region into.\n";
	print STDERR "\t\t\tHistogram Mode specific Options:\n";
	print STDERR "\t\t\t-nuc (calculated mononucleotide frequencies at each position,\n";
	print STDERR "\t\t\t\tWill report by default if extracting sequence for other purposes like motifs)\n";
	print STDERR "\t\t\t-di (calculated dinucleotide frequencies at each position)\n";
	print STDERR "\t\t\t-histNorm <#> (normalize the total tag count for each region to 1, where <#> is the\n";
	print STDERR "\t\t\t\tminimum tag total per region - use to avoid tag spikes from low coverage\n";
	print STDERR "\t\t\t-ghist (outputs profiles for each gene, for peak shape clustering)\n";
	print STDERR "\t\t\t-rm <#> (remove occurrences of same motif that occur within # bp)\n";

	print STDERR "\n\tPeak Centering: (other options are ignored)\n";
	print STDERR "\t\t-center <motif file> (This will re-center peaks on the specified motif, or remove peak\n";
	print STDERR "\t\t\tif there is no motif in the peak.  ONLY recentering will be performed, and all other\n";
	print STDERR "\t\t\toptions will be ignored.  This will output a new peak file that can then be reanalyzed\n";
	print STDERR "\t\t\tto reveal fine-grain structure in peaks (It is advised to use -size < 200) with this\n";
	print STDERR "\t\t\tto keep peaks from moving too far (-mirror flips the position)\n";
	print STDERR "\t\t-multi (returns genomic positions of all sites instead of just the closest to center)\n";

	print STDERR "\n\tGenome comparisons (need genome & liftOver)\n";
	print STDERR "\t\t-cmpGenome <genome1> [genome2] (Genomes to compare for sequence/motifs)\n";
	print STDERR "\t\t-cmpLiftover <liftover1> [genome2] (Genomes to compare for sequence/motifs)\n";
	#print STDERR "\t\t-revLiftover <liftover1> [genome2] (Genomes to compare for sequence/motifs)\n";
	#
	print STDERR "\n\tNormalization options:\n";
	print STDERR "\t\t-fpkm (normalize read counts to million reads or fragments per kilobase mapped)\n";
	print STDERR "\t\t-raw (do not adjust the tag counts based on total tags sequenced, -noadj works too)\n";
	print STDERR "\t\t-norm <#> (normalize tags to this tag count, default=1e7, 0=average tag count in all directories)\n";
	print STDERR "\t\t-normLength <#> (Fragment length to normalize to for experiments with different lens, def: 100)\n";
	print STDERR "\t\t-log (output tag counts as log2(x+1+rand) values - for scatter plots)\n";
	print STDERR "\t\t-sqrt (output tag counts as sqrt(x+rand) values - for scatter plots)\n";
	print STDERR "\t\t-ratio (process tag values as ratios - i.e. chip-seq, or mCpG/CpG)\n";

	print STDERR "\n\tAdvanced normalization options: (-rlog and -vst require R and DESeq2 to be installed)\n";
	print STDERR "\t\t-rlog (quantile/variance normalization on reported genes using DESeq2 rlog function, needs R)\n";
	print STDERR "\t\t-vst (quantile/variance normalization on reported genes using DESeq2 vst function, needs R)\n";
	#print STDERR "\t\t-quantile (quantile normalization on reported genes)\n";

	print STDERR "\n\tAdvanced Options:\n";
	print STDERR "\t\t-len <#> / -fragLength <#> (Fragment length, default=auto, might want to set to 1 for 5'RNA)\n";
	print STDERR "\t\t-size <#> (Peak size[from center of peak], default=inferred from peak file)\n";
	print STDERR "\t\t\t-size #,# (i.e. -size -10,50 count tags from -10 bp to +50 bp from center)\n";
	print STDERR "\t\t\t-size \"given\" (count tags etc. using the actual regions - for variable length regions)\n";
	print STDERR "\t\t-strand <+|-|both> (Count tags on specific strands relative to peak, default: both)\n";
	#print STDERR "\t\t-local # (size in bp to count tags as a local background\n";
	print STDERR "\t\t-pc <#> (maximum number of tags to count per bp, default=0 [no maximum], -tbp <#> works too)\n";
	#print STDERR "\t\t-cons (Retrieve conservation information for peaks/sites)\n";
	print STDERR "\t\t-CpG (Calculate CpG/GC content)\n";
	print STDERR "\t\t-nfr (report nucleosome free region scores instead of tag counts, also -nfrSize <#>)\n";
	print STDERR "\t\t-norevopp (do not search for motifs on the opposite strand [works with -center too])\n";
	print STDERR "\t\t-gwasCatalog <gwasCatalog file from UCSC> (list overlapping GWAS risk SNPs)\n";
	#print STDERR "\t\t-snp <file> <id> (genotype file)\n";
	print STDERR "\t\t-pdist (only report distance to nearest peak using -p, not peak name)\n";
	print STDERR "\t\t-map <mapping file> (mapping between peak IDs and promoter IDs, overrides closest assignment)\n";
	print STDERR "\t\t-noann, -nogene (skip genome annotation step, skip TSS annotation)\n";
	print STDERR "\t\t-homer1/-homer2 (by default, the new version of homer [-homer2] is used for finding motifs)\n";
	print STDERR "\t\t-cpu <#> (Number of processors to use when possible - only some parts utilize multiple cores)\n";
	print STDERR "\t\t-noblanks (remove peaks/rows with missing data)\n";
	print STDERR "\n";
	exit;


}

# At least a peak file (or mode keyword) and a genome are required;
# otherwise show the usage text, which exits.
printCMD() if (@ARGV < 2);

# Rebuild the command line as a single space-separated string.
my $cmd = join(' ', "annotatePeaks.pl", @ARGV);

print STDERR "\n";
# Temporary files registered for later removal (keyed by file name).
my %toDelete = ();

# NOTE: assigned without 'my', so this is a package global rather than a
# file-scoped lexical like the variables below.
$maxHomer2SeqLength = 1e7;
# Shell redirection fragment that discards stderr; presumably appended to
# R invocations - TODO confirm against the code that uses it.
my $showRstderr = '2> /dev/null';
#$showRstderr = "";

# ---------------------------------------------------------------------
# Default values for the command-line options.  The argument-parsing loop
# further down overwrites these when the matching option is given.
# Several defaults mirror the usage text in printCMD (e.g. -normLength 100,
# -gsize 2e9, -strand both, -len auto, -norm 1e7, -matrixMinDist 4).
# ---------------------------------------------------------------------
my $maxCPUs = 1;
my $seqFlag = 0;
my $consFlag = 0;
my $cpgFlag = 0;
my $skipBlastn = 1;
my $maskFlag = 0;
my $noblanksFlag = 0;

my $advNormMethod = '';
my $annStatFile = '';
my $normLength = 100;
my $gwasCatalog = "";
my $customAnnotationFile = "";
my $newPeakHistogramFlag=1;
my $homer2Flag = 1;
my $nfrFlag = 0;
my $nfrSize = 100;
my $mfastaFile = "";
my $fpkmFlag = 0;
my $gsize=2e9;
# -rm sets this to 1 (or -1 for a negative threshold); -rmrevopp sets it to -1.
my $removeCloseMotifs=0;
my $strandFlag = "both";
my $logFlag = 0;
my $mdistFlag= 0;
my $mlogicFile = '';
my $sqrtFlag = 0;
my $pCountFlag = 0;
my $ratioFlag = 0;
my $snpFile = '';
my $snpID = '';
my $mscoreFlag = 0;
my $nscoreFlag = 0;
my $normCpGFlag = 0;
my $mirrorFlag = 0;
my $fragLength = 'auto';
my $revoppFlag = 1;
my $init2One = 0;
# Peak size; -size may replace it with a number or the string 'given', and
# sets $updateSize.  For "-size a,b" $sizeMove holds floor((a+b)/2).
my $size = 300;
my $sizeMove = 0;
my $updateSize = 0;
my $local = 0;
my $adjustFlag = 1;
my $normValue = 1e7;
my $tssFlag = 0;
my $geneListFile = '';
my $ugFile = '';
# Files given with -gene.
my @geneDataFiles = ();
# Value given with -hist (0 = not set).
my $histBinSize = 0;
my $diFlag = 0;
my $nucFlag = 0;
my $mapFile = "";
my $centerMotif = '';
my $cpromoter = '';
my $histNorm = 0;
my $ghistFlag = 0;
my $goDir = '';
my $genomeOntologyDir = '';
my $pDistFlag = 0;
my $multiFlag = 0;
# -matrix prefix plus the distance window set by -matrixMinDist / -matrixMaxDist.
my $matrixPrefix = '';
my $matrixMinDist = 4;
my $matrixMaxDist = 1e10;
my $mbedFile = '';
my $noAnnFlag = 0;
my $noGeneFlag = 0;
my $mpeak = 0;
my $gtfFile = "";
# Threshold supplied with -rm or -rmrevopp.
my $rmMotifThresh = 10;
my $removeRevoppMotifs = 0;
# -vcf file, the names given with -individuals, and the -editDistance switch.
my $vcfFile = "";
my @individuals = ();
my $editDistanceFlag = 0;
my $gffFlag = '';
my $bestScoreFlag = 0;
my $gidFlag = '';
# Values collected from -cmpGenome, -cmpLiftover and -revLiftover.
my @cmpGenomes = ();
my @cmpLiftover = ();
my @revLiftover = ();

# Positional arguments: the peak file (or a mode keyword such as "rna")
# followed by the genome.
my $peakFile = $ARGV[0];
my $genome = $ARGV[1];
# A trailing 'r' on the genome argument is removed and switches masking on.
# NOTE(review): the strip is applied to every value, including custom genome
# paths that merely end in 'r' - confirm this is intended.
$maskFlag = 1 if ($genome =~ s/r$//);

my $organism = "unknown";
my $promoter = "default";
my $consDir = "";
my $genomeDir = "";
my $genomeParseDir = "";
# -1 = no genome ("none"), 0 = configured genome, 1 = custom genome.
my $customGenome = 0;
if ($genome eq 'none') {
	print STDERR "\tNo genome selected (\"none\") - limits what you can do\n";
	$customGenome = -1;
	$genomeDir = "none";
} elsif (exists($config->{'GENOMES'}->{$genome})) {
	# Configured genome: take directory, organism and promoter set from
	# the configuration entry.
	my $genomeInfo = $config->{'GENOMES'}->{$genome};
	$genomeDir = $genomeInfo->{'directory'};
	$organism = $genomeInfo->{'org'};
	$promoter = $genomeInfo->{'promoters'};
	$consDir = $genomeInfo->{'directory'} . "/conservation/";
} else {
	# Anything else is handed to HomerConfig as a custom genome.
	$customGenome = 1;
	($genome,$genomeDir,$genomeParseDir) = HomerConfig::parseCustomGenome($genome);
}
# "rna" as first argument implies "-size given".
if ($peakFile eq 'rna') {
	$size = 'given';
	$updateSize = 1;
}


print STDERR "\tPeak file = $peakFile\n";
print STDERR "\tGenome = $genome\n";
print STDERR "\tOrganism = $organism\n";


# Lists filled by the multi-value options of the argument-parsing loop.
my @motifFiles = ();
my @tagDirs = ();
my @inputDirs = ();
my $pseudo = 5;
my @wigFiles = ();
my @bedGraphFiles = ();
my @peakFiles = ();
my @seFiles = ();
# Motif file name -> option it was given with ('-m' or '-fm').
my %filterMotifFiles = ();

for (my $i=2;$i<@ARGV;$i++) {
	if ($ARGV[$i] eq '-m' || $ARGV[$i] eq '-fm') {
		my $code = $ARGV[$i];
		$seqFlag =1;
		my $bail = 0;
		print STDERR "\tMotif files:\n";
		while ($ARGV[++$i] !~ /^\-/) {
			push(@motifFiles, $ARGV[$i]);
			$filterMotifFiles{$ARGV[$i]}=$code;
			print STDERR "\t\t$ARGV[$i]\t$code\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-p') {
		print STDERR "\tPeak Files:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@peakFiles, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-se') {
		print STDERR "\tSuper Enhancer Peak Files:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@seFiles, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-bedGraph') {
		print STDERR "\tbedGraph Files:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@bedGraphFiles, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-wig') {
		print STDERR "\tWiggle Files:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@wigFiles, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-cmpGenome') {
		print STDERR "\tGenomes for comparison:\n";
		$seqFlag =1;
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@cmpGenomes, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-cmpLiftover') {
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@cmpLiftover, $ARGV[$i]);
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-revLiftover') {
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@revLiftover, $ARGV[$i]);
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-d') {
		print STDERR "\tTag Directories:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@tagDirs, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-i') {
		print STDERR "\tInput Directories:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@inputDirs, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-individuals') {
		print STDERR "\tVCF file individuals to analyze:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@individuals, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-gene') {
		print STDERR "\tGene Data Files:\n";
		my $bail = 0;
		while ($ARGV[++$i] !~ /^\-/) {
			push(@geneDataFiles, $ARGV[$i]);
			print STDERR "\t\t$ARGV[$i]\n";
			if ($i>=@ARGV-1) {
				$bail = 1;
				last;
			}
		}
		last if ($bail == 1);
		$i--;
	} elsif ($ARGV[$i] eq '-hist') {
		$histBinSize = $ARGV[++$i];
		print STDERR "\t-----------------------------------------------------\n";
		if ($size eq 'given') {
			print STDERR "\tHistogram mode activated (bin size = 1/$histBinSize)\n";
		} else {
			print STDERR "\tHistogram mode activated (bin size = $histBinSize bp)\n";
		}
		#print STDERR "\tHistogram mode activated (bin size = $histBinSize bp)\n";
		print STDERR "\t-----------------------------------------------------\n";
	} elsif ($ARGV[$i] eq '-dfile') {
		open IN, $ARGV[++$i];
		print STDERR "\tAdding Tag Directories:\n";
		while (<IN>) {
			chomp;
			s/\r//g;
			my @line = split /\t/;
			push(@tagDirs, $line[0]);
			print STDERR "\t\t$line[0]\n";
		}
		close IN;
	} elsif ($ARGV[$i] eq '-size') {
		$size = $ARGV[++$i];
		if ($size eq 'given') {
			print STDERR "\tUsing actual sizes of regions\n";
		} elsif ($size =~ /\,/) {
			my @a = split /\,/, $size;
			my $sizeStart= $a[0];
			my $sizeEnd = $a[1];
			if ($sizeEnd < $sizeStart) {
				print STDERR "!!! Size end must be less than the size start range in -size $sizeStart,$sizeEnd\n";
				exit;
			}
			$sizeMove = floor(($sizeStart+$sizeEnd)/2);
			$size = $sizeEnd - $sizeStart;
		}
		$updateSize = 1;
		print STDERR "\tPeak Region set to $size\n";
	} elsif ($ARGV[$i] eq '-matrixMaxDist') {
		$matrixMaxDist = $ARGV[++$i];
		print STDERR "\tWhen producing a motif co-occurence matrix, will only consider co-bound if < $matrixMaxDist bp away\n";
	} elsif ($ARGV[$i] eq '-map') {
		print STDERR "\tWill map peaks to promoters using map file: $mapFile\n";
		$mapFile = $ARGV[++$i];
	} elsif ($ARGV[$i] eq '-matrixMinDist') {
		$matrixMinDist = $ARGV[++$i];
		print STDERR "\tWhen producing a motif co-occurence matrix, will only consider co-bound if > $matrixMinDist bp away\n";
	} elsif ($ARGV[$i] eq '-noblanks' || $ARGV[$i] eq '-noBlanks') {
		print STDERR "\tWill remove rows with data values of '' or 'NA'\n";
		$noblanksFlag = 1;
	} elsif ($ARGV[$i] eq '-matrix') {
		$matrixPrefix = $ARGV[++$i];
		print STDERR "\tWill produce a motif co-occurence of motifs analysis files, prefix: $matrixPrefix\n";
	} elsif ($ARGV[$i] eq '-rmrevopp') {
		$removeCloseMotifs = -1;
		$removeRevoppMotifs = 1;
		$rmMotifThresh = $ARGV[++$i];
	} elsif ($ARGV[$i] eq '-rm') {
		$removeCloseMotifs = 1;
		$rmMotifThresh = $ARGV[++$i];
		if ($rmMotifThresh < 0) {
			$removeCloseMotifs = -1;
			$rmMotifThresh = abs($rmMotifThresh);
		}
	} elsif ($ARGV[$i] eq '-pseudo') {
		$pseudo = $ARGV[++$i];
		print STDERR "\tSetting pseudo count to: $pseudo\n";
	} elsif ($ARGV[$i] eq '-organism') {
		$organism = $ARGV[++$i];
		print STDERR "\tSetting organism to: $organism\n";
	} elsif ($ARGV[$i] eq '-mlogic') {
		$mlogicFile = $ARGV[++$i];
		print STDERR "\tWill outptu motif orientation stats to file: $mlogicFile\n";
	} elsif ($ARGV[$i] eq '-normLength') {
		$normLength = $ARGV[++$i];
		print STDERR "\tSetting normalization length to $normLength (set to 0 to disable)\n";
	} elsif ($ARGV[$i] eq '-vcf') {
		$vcfFile = $ARGV[++$i];
		print STDERR "\tWill get SNP info from VCF file: $vcfFile\n";
	} elsif ($ARGV[$i] eq '-editDistance') {
		$editDistanceFlag = 1;
		print STDERR "\tWill calculate total variation (edit disance) from reference sequence\n";
	} elsif ($ARGV[$i] eq '-gwasCatalog') {
		$gwasCatalog = $ARGV[++$i];
		print STDERR "\tWill annotate transcripts using GWAS catalog (from file: $gwasCatalog)\n";
	} elsif ($ARGV[$i] eq '-mbed') {
		$mbedFile = $ARGV[++$i];
		print STDERR "\tWill produce a motif bed file: $mbedFile\n";
	} elsif ($ARGV[$i] eq '-mpeak') {
		$mbedFile = $ARGV[++$i];
		$mpeak = 1;
		print STDERR "\tWill produce a motif peak file: $mbedFile\n";
	} elsif ($ARGV[$i] eq '-mask') {
		$maskFlag = 1;
		print STDERR "\tWill use repeat-masked sequences\n";
	} elsif ($ARGV[$i] eq '-TSS') {
		$promoter = $ARGV[++$i];
		print STDERR "\tPromoter Set will be $promoter\n";
	} elsif ($ARGV[$i] eq '-annStats') {
		$annStatFile = $ARGV[++$i];
	} elsif ($ARGV[$i] eq '-snp') {
		$snpFile = $ARGV[++$i];
		$snpID = $ARGV[++$i];
		print STDERR "\tWill use $snpFile with ID: $snpID for genotype\n";
	} elsif ($ARGV[$i] eq '-strand') {
		$strandFlag = $ARGV[++$i];
		print STDERR "\tWill count tags on strand: $strandFlag\n";
	} elsif ($ARGV[$i] eq '-mfasta') {
		$mfastaFile = $ARGV[++$i];
		print STDERR "\tWill output motif site sequences to FASTA file: $mfastaFile\n";
	} elsif ($ARGV[$i] eq '-cTSS') {
		$cpromoter = $ARGV[++$i];
		print STDERR "\tCustom promoter set will be $cpromoter\n";
	} elsif ($ARGV[$i] eq '-ann') {
		$customAnnotationFile = $ARGV[++$i];
		print STDERR "\tCustom annotation file: $customAnnotationFile\n";
	} elsif ($ARGV[$i] eq '-cpu') {
		$maxCPUs = $ARGV[++$i];
		print STDERR "\tWill use up to $maxCPUs CPUs in parts that can use them\n";
	} elsif ($ARGV[$i] eq '-pc' || $ARGV[$i] eq '-tbp') {
		$init2One = $ARGV[++$i];
		print STDERR "\tMaximum count per bp will be set to $init2One\n";
	} elsif ($ARGV[$i] eq '-nmotifs') {
		$nscoreFlag = 1;
		print STDERR "\tWill report the number of motifs in each peak\n";
	} elsif ($ARGV[$i] eq '-mscore') {
		$mscoreFlag = 1;
		print STDERR "\tWill report max log-odds score for motif in each peak\n";
	} elsif ($ARGV[$i] eq '-log') {
		$logFlag = 1;
		print STDERR "\tWill output log(1+rand()+x) for tag counts\n";
	} elsif ($ARGV[$i] eq '-sqrt') {
		$sqrtFlag = 1;
		print STDERR "\tWill output sqrt(rand()+x) for tag counts\n";
	} elsif ($ARGV[$i] eq '-normCpG') {
		$normCpGFlag = 1;
		print STDERR "\tTreating tag files like CpG methylation ratios\n";
	} elsif ($ARGV[$i] eq '-gsize') {
		$gsize = $ARGV[++$i];
		print STDERR "\tEffective Genome size set to $gsize\n";
	} elsif ($ARGV[$i] eq '-nfr') {
		$nfrFlag = 1;
		print STDERR "\tReporting tag directory counts as Nucleosome Free Region scores ($nfrSize bp size)\n";
	} elsif ($ARGV[$i] eq '-nfrSize') {
		$nfrSize = $ARGV[++$i];
		print STDERR "\tUsing Nucleosome Free Region size of $nfrSize bp\n";
	} elsif ($ARGV[$i] eq '-ratio' || $ARGV[$i] eq '-mC') {
		$ratioFlag = 1;
		print STDERR "\tTreating tag values like ratios\n";
	} elsif ($ARGV[$i] eq '-pdist2') {
		$pDistFlag = 2;
		print STDERR "\tWill only report distance for nearest peaks\n";
	} elsif ($ARGV[$i] eq '-mdist') {
		$mdistFlag = 1;
		print STDERR "\tReports distance to nearest motif\n";
	} elsif ($ARGV[$i] eq '-pdist') {
		$pDistFlag = 1;
		print STDERR "\tWill only report absolute distance for nearest peaks\n";
	} elsif ($ARGV[$i] eq '-pcount') {
		$pCountFlag = 1;
		print STDERR "\tWill report number of peaks\n";
	} elsif ($ARGV[$i] eq '-ghist') {
		$ghistFlag = 1;
		print STDERR "\tWill create histogram for each gene\n";
	} elsif ($ARGV[$i] eq '-noadj' || $ARGV[$i] eq '-raw') {
		$adjustFlag = 0;
		print STDERR "\tWill NOT normalize tag counts\n";
	} elsif ($ARGV[$i] eq '-rlog') {
		$adjustFlag = 0;
		$advNormMethod = 'rlog';
		print STDERR "\tNormalizing with rlog (R must be installed with package DESeq2)\n";
	} elsif ($ARGV[$i] eq '-vst') {
		$adjustFlag = 0;
		$advNormMethod = 'vst';
		print STDERR "\tNormalizing with vst (R must be installed with package DESeq2)\n";
	} elsif ($ARGV[$i] eq '-mirror') {
		$mirrorFlag =1;
		print STDERR "\tReturning mirrored positions\n";
	} elsif ($ARGV[$i] eq '-list') {
		$geneListFile = $ARGV[++$i];
		print STDERR "\tPromoters for genes found in $geneListFile will be analyzed\n";
	} elsif ($ARGV[$i] eq '-nogene') {
		$noGeneFlag = 1;
		print STDERR "\tWill Skip closest gene annotation\n";
	} elsif ($ARGV[$i] eq '-noann') {
		$noAnnFlag = 1;
		print STDERR "\tWill Skip peak annotation\n";
	} elsif ($ARGV[$i] eq '-bestScore') {
		$bestScoreFlag = 1;
	} elsif ($ARGV[$i] eq '-center') {
		$centerMotif = $ARGV[++$i];
		$seqFlag = 1;
		print STDERR "\tPeaks/Regions will be centered on motif in file $centerMotif\n";
	} elsif ($ARGV[$i] eq '-norm') {
		$normValue = $ARGV[++$i];
		if ($normValue == 0) {
			print STDERR "\tNormalzing tags to the Average Tag totals\n";
		} else {
			print STDERR "\tWill normalize tag counts to $normValue per experiment\n";
		}
	} elsif ($ARGV[$i] eq '-norevopp') {
		$revoppFlag = 0;
		print STDERR "\tWill not search for motifs on the opposite strand\n";
	} elsif ($ARGV[$i] eq '-go') {
		$goDir = $ARGV[++$i];
		print STDERR "\tWill perform Gene Ontology analysis - output to directory = $goDir\n";
	} elsif ($ARGV[$i] eq '-genomeOntology') {
		$genomeOntologyDir = $ARGV[++$i];
		print STDERR "\tWill perform Genome Ontology analysis - output to directory = $genomeOntologyDir\n";
		print STDERR "\t\tWarning - might want to set the genome size with -gsize (currently $gsize)\n";
	} elsif ($ARGV[$i] eq '-multi') {
		$multiFlag = 1;
		print STDERR "\tWill return all motif positions when centering...\n";
	} elsif ($ARGV[$i] eq '-local') {
		$local = $ARGV[++$i];
		print STDERR "\tWill count tags in local backgound in $local bp around peak\n";
	} elsif ($ARGV[$i] eq '-blastn') {
		$skipBlastn = 0;
	} elsif ($ARGV[$i] eq '-cons') {
		$consFlag = 1;
		print STDERR "\tWill extract conservation information\n";
	} elsif ($ARGV[$i] eq '-len' || $ARGV[$i] eq '-fragLength') {
		$fragLength = $ARGV[++$i];
		print STDERR "\tFragment Length set to $fragLength\n";
	} elsif ($ARGV[$i] eq '-di') {
		$seqFlag =1;
		$diFlag = 1;
		print STDERR "\tWill report dinucleotide frequencies\n";
	} elsif ($ARGV[$i] eq '-gtf') {
		$gtfFile = $ARGV[++$i];
		print STDERR "\tCustom annotation GTF file: $gtfFile (using transcript_id)\n";
	} elsif ($ARGV[$i] eq '-gid') {
		$gidFlag = " -gid";
		print STDERR "\tUsing gene_ids for GTF file\n";
	} elsif ($ARGV[$i] eq '-gff') {
		$gtfFile = $ARGV[++$i];
		$gffFlag = ' -gff';
		print STDERR "\tCustom annotation GFF file: $gtfFile (better to get GTF file)\n";
	} elsif ($ARGV[$i] eq '-gff3') {
		$gtfFile = $ARGV[++$i];
		$gffFlag = ' -gff3';
		print STDERR "\tCustom annotation GFF3 file: $gtfFile (better to get GTF file)\n";
	} elsif ($ARGV[$i] eq '-nuc') {
		$seqFlag =1;
		print STDERR "\tWill report nucleotide frequencies\n";
	} elsif ($ARGV[$i] eq '-histNorm') {
		$histNorm = $ARGV[++$i];
		print STDERR "\tWill normalize Tag histograms with minimum total of $histNorm\n";
	} elsif ($ARGV[$i] eq '-homer2') {
		$homer2Flag = 1;
		print STDERR "\tUsing homer2...\n";
	} elsif ($ARGV[$i] eq '-fpkm' || $ARGV[$i] eq '-rpkm') {
		$fpkmFlag = 1;
		$normValue = 1e6;
		print STDERR "\tWill normalized reads to FPKM\n";
	} elsif ($ARGV[$i] eq '-homer1') {
		$homer2Flag = 0;
		print STDERR "\tUsing original homer...\n";
	} elsif ($ARGV[$i] eq '-CpG') {
		$cpgFlag = 1;
		$seqFlag = 1;
		print STDERR "\tWill calculate CpG/GC content\n";
	} else {
		print STDERR "$ARGV[$i] not recognized\n\n";
		printCMD();
	
	}
}

# The DESeq2-based normalizations (-rlog / -vst) need R, so verify that an R
# executable can be found through $PATH before doing any real work.
if ($advNormMethod ne '') {
	my $rPath = `which R 2>/dev/null`;
	# GNU 'which' reports a missing program on STDERR and prints nothing on
	# STDOUT, so empty output has to count as "not found" too.  The textual
	# checks cover implementations that print the complaint on STDOUT.
	if ($rPath !~ /\S/ || $rPath =~ /^which: no R/ || $rPath =~ /^no R in /) {
		print STDERR "!!! Error - could not find R installation in \$PATH variable\n";
		exit;
	}
}


# Maps a nucleotide letter (upper or lower case) to its index: A=0, C=1, G=2, T=3.
my %alpha2index = (
	'A' => 0, 'a' => 0,
	'C' => 1, 'c' => 1,
	'G' => 2, 'g' => 2,
	'T' => 3, 't' => 3,
);


# When $updateSize and $histBinSize are both still 0 (no -size was parsed and
# the histogram bin size is 0), analyze regions at the sizes given in the
# peak file.
unless ($updateSize != 0 || $histBinSize != 0) {
	($size, $updateSize) = ("given", 1);
}

# Cross-genome comparison is switched on as soon as any -cmpGenome or
# -cmpLiftover entry was given; the lists must then pair up one-to-one.
my $cmpGenomeFlag = 0;
if (@cmpGenomes || @cmpLiftover) {
	$cmpGenomeFlag = 1;

	my $numCmpGenomes = scalar(@cmpGenomes);
	my $numCmpLiftover = scalar(@cmpLiftover);
	my $numRevLiftover = scalar(@revLiftover);

	unless ($numCmpGenomes == $numCmpLiftover) {
		print STDERR "!!! Error - Each -cmpGenome genome entry needs a matching -cmpLiftover\n";
		exit;
	}
	# reverse liftover files are optional, but if present there must be one per liftover
	if ($numRevLiftover > 0 && $numRevLiftover != $numCmpLiftover) {
		print STDERR "!!! Error - Each -cmpLiftover should match a -revLiftover\n";
		exit;
	}

	my $blastnNote = "\tWill use blastn to calculate % identities\n";
	if ($skipBlastn) {
		$blastnNote = "\tAdd '-blastn' to command to calculate % identities (can take a while...)\n";
	}
	print STDERR $blastnNote;
}

# Resolve every -cmpGenome entry to a genome name, a sequence directory and a
# masking flag.  Records are keyed by the name exactly as given on the command
# line (including any trailing 'r').
my %cmpGenomeInfo = ();
foreach my $cmpName (@cmpGenomes) {
	my $cgenome = $cmpName;

	# A trailing 'r' requests the repeat-masked version of the genome.
	# NOTE(review): this also strips the final 'r' from a custom genome path
	# that happens to end in 'r' - confirm whether that is intended.
	my $cmaskFlag = 0;
	if ($cgenome =~ s/r$//) {
		$cmaskFlag = 1;
	}

	my $cgenomeDir = "";
	my $ccustomGenome = 0;

	if ($cgenome eq 'none') {
		# a comparison genome must supply sequence, so 'none' is fatal
		print STDERR "\tA -cmpGenome cannot be 'none'\n";
		exit;
	} elsif (!exists($config->{'GENOMES'}->{$cgenome})) {
		# not in the configuration: treat the argument as a custom genome
		$ccustomGenome = 1;
		($cgenome,$cgenomeDir,$cgenomeParseDir) = HomerConfig::parseCustomGenome($cgenome);
		$cgenomeParseDir = '';
	} else {
		$cgenomeDir = $config->{'GENOMES'}->{$cgenome}->{'directory'};
	}
	$cmpGenomeInfo{$cmpName} = {genome=>$cgenome,dir=>$cgenomeDir,custom=>$ccustomGenome,mask=>$cmaskFlag};
}

# Command-line fragment requesting repeat-masked sequence extraction.
my $mflag = $maskFlag ? " -mask " : '';

# ID type of the promoter set: 'refseq' for the default set, the configured
# idtype for a named set, and 'custom' whenever a GTF/GFF file was supplied.
$promoterIDtype = 'refseq';
unless ($promoter eq 'default') {
	unless (exists($config->{'PROMOTERS'}->{$promoter})) {
		print STDERR "!! Promoter Set $promoter not recognized\n";
		exit;
	}
	$promoterIDtype = $config->{'PROMOTERS'}->{$promoter}->{'idtype'};
}
$promoterIDtype = 'custom' if ($gtfFile);


# Half of the -local window width ($local), rounded down.
my $halfLocal = floor($local/2);

# Per-peak records, keyed by the peak ID read from the position file.
my %peaks = ();
my %gene = ();

#tmp files
# Every scratch file shares one random prefix drawn here.  The names have no
# directory component, so the files are created relative to the current
# working directory.
my $rand = rand();
my $tmpfile = $rand . ".tmp";
my $tmpfile2 = $rand . ".2.tmp";
my $tmpfile3 = $rand . ".3.tmp";
my $tmpfile4 = $rand . ".4.tmp";
my $tmpfile5 = $rand . ".5.tmp";
my $tmpfile6 = $rand . ".6.tmp";
my $tmpfile7 = $rand . ".7.tmp";
my $seqFile = $rand . ".seq";
my $seqFaFile = $rand . ".seq.fa";
my $consFile = $rand . ".cons";
my $posFile = $rand . ".pos";
my $gtfTSSFile = $rand . ".gtf.tss";
my $tmpPeakFile = $rand . ".peak";
my $cleanPosFile = $rand . ".clean.pos";

# Keep the peak argument exactly as supplied; $peakFile itself is re-pointed
# at derived or temporary files below.
my $ogPeakFile = $peakFile;

my %geneList = ();
# "Special" mode is used when the peak argument is one of the keywords
# tss / tts / rna, or when a file named <genomeDir>/<genome>.<peakFile> exists.
my $specialFile = $genomeDir . "/" . $genome . "." . $peakFile;
if ($peakFile eq 'tss' || $peakFile eq 'tts' || $peakFile eq 'rna' || -e $specialFile) {

	# NOTE(review): this message is printed for the keywords even when
	# $specialFile does not exist on disk.
	print STDERR "\tFound special file: $specialFile\n";
	my $postfix = $peakFile;

	$tssFlag = 1;
	print STDERR "\n\t*****************\n\t$postfix Mode enabled\n";
	print STDERR "\t*****************\n\n";

	# Choose the source of positions, in order of precedence: custom GTF/GFF,
	# custom promoter file (-cTSS), the genome's own <genome>.<postfix> file,
	# or a configured promoter set.  Unless a size was requested explicitly
	# (or the mode is 'rna'), a default region size is applied.
	if ($gtfFile ne '') {
		`parseGTF.pl "$gtfFile" $peakFile $gffFlag $gidFlag > "$gtfTSSFile"`;
		$size = 4000 if ($updateSize == 0 && $peakFile ne 'rna');
		$peakFile = $gtfTSSFile;
		$toDelete{$gtfTSSFile}=1;
	} elsif ($cpromoter ne '') {
		$size = 4000 if ($updateSize == 0 && $peakFile ne 'rna');
		$peakFile = $cpromoter;
	} elsif ($promoter eq 'default') {
		$size = 4000 if ($updateSize == 0 && $peakFile ne 'rna');
		$peakFile = $genomeDir . "/" . $genome . "." . $postfix;	
		if (-f $peakFile) {
		} else {
			print STDERR "!!! This isn't going to work - can't find $peakFile !!!\n";
			print STDERR "!!! Have you upgraded lately? !!!\n";
			print STDERR "!!! Can't use the \"tss\" option with a custom genome - don't know where the TSS are... !!!\n";
			exit;
		}
	} else {
		# configured promoter set: region size is twice the set's 'end' value
		$size = 2*$config->{'PROMOTERS'}->{$promoter}->{'end'} if ($updateSize == 0 && $peakFile ne 'rna');
		$peakFile = $config->{'PROMOTERS'}->{$promoter}->{'directory'} . "/" . $promoter . ".pos";
	}


	# Optional -list filter: restrict the positions to the IDs found in
	# $geneListFile (plus their converted IDs when conversion is possible).
	if ($geneListFile ne '') {
		# NOTE(review): $cflag is set below but not read anywhere in this block.
		my $cflag = 0;
		if ($promoterIDtype eq 'null' || $promoterIDtype eq 'custom' || $organism eq 'unknown') {
			# no ID conversion possible: use the first column as-is
			`cut -f1 "$geneListFile" > $tmpfile`;
		} else {
			$cflag = 1;
			`convertIDs.pl "$geneListFile" $organism $promoterIDtype no yes yes > "$tmpfile"`;
		}
		# union of the original and converted IDs (first column only)
		`cat "$geneListFile" "$tmpfile" | cut -f1 | sort | uniq > "$tmpfile2"`;
		# NOTE(review): $tmpfile3 is a plain copy of $tmpfile2, so the next
		# two commands leave $tmpfile with the same content as $tmpfile2.
		`cp "$tmpfile2" "$tmpfile3"`;
		`cat "$tmpfile2" "$tmpfile3" | sort | uniq > "$tmpfile"`;
		#print STDERR "`mergeData.pl $tmpfile $peakFile 0 -accVer  | sort | uniq >  $tmpPeakFile`\n";
		# join the ID list against the position file to keep matching rows
		`mergeData.pl "$tmpfile" "$peakFile" 0 -accVer  | sort | uniq >  "$tmpPeakFile"`;
		$peakFile = $tmpPeakFile;
		#$toDelete{$tmpPeakFile} = 1;
		`rm "$tmpfile" "$tmpfile2" "$tmpfile3"`;
	}
} else {
	# Ordinary peak/BED file: convert it to a position file, check it and
	# clean it up, then continue from the cleaned temporary copy.
	`bed2pos.pl "$peakFile" -check -unique > "$cleanPosFile"`;
	`checkPeakFile.pl "$cleanPosFile"`;
	`cleanUpPeakFile.pl "$cleanPosFile" > "$tmpfile"`;
	`mv "$tmpfile" "$cleanPosFile"`;
	$toDelete{$cleanPosFile}=1;
	$peakFile = $cleanPosFile;
}

# Build the working position file ($posFile) from $peakFile: either resized
# with resizePosFile.pl, or copied with whitespace trimmed from every field.
if ($updateSize && $size ne 'given') {
	print STDERR "\tResizing peaks...\n";
	`resizePosFile.pl "$peakFile" $size $sizeMove > "$posFile"`;
} else {
	# A failed open used to go unnoticed and left an empty position file
	# behind; report it and stop instead.
	my $posOut;
	if (!open($posOut, '>', $posFile)) {
		print STDERR "!!! Could not open $posFile for writing: $!\n";
		exit;
	}
	my $peakIn;
	if (!open($peakIn, '<', $peakFile)) {
		print STDERR "!!! Could not open $peakFile for reading: $!\n";
		exit;
	}
	while (my $row = <$peakIn>) {
		chomp $row;
		$row =~ s/\r//g;
		my @fields = split /\t/, $row;
		# strip leading/trailing whitespace from each field
		foreach my $field (@fields) {
			$field =~ s/^\s*//g;
			$field =~ s/\s*$//g;
		}
		print $posOut join("\t", @fields), "\n";
	}
	close $peakIn;
	close $posOut;
}

# the gene-list filtered peak file is only an intermediate; drop it once copied
`rm "$tmpPeakFile"` if ($tmpPeakFile eq $peakFile);


# Pull the genomic sequence for every position when any sequence-based
# analysis was requested.  Requesting sequence also switches on the CpG/GC
# content calculation.
if ($seqFlag) {
	$cpgFlag = 1;
	print STDERR "\tExtracting Sequence...\n";
	unless ($genome ne 'none') {
		print STDERR "!!! Cannot do anything with genomic sequences (i.e. motifs) if genome is set to: $genome !!!\n",
					"!!! Try installing a genome through HOMER or provide a FASTA directory/file\n";
		exit;
	}
	`homerTools extract "$posFile" "$genomeDir" $mflag > "$seqFile"`;
	# (no SNP-specific handling is performed here even when -snp was given)
}


print STDERR "\tReading Positions...\n";

# Running statistics over the region sizes found in the position file, plus
# the peak IDs in file order.
my $avgSize = 0;
my $avgSizeN = 0;
my $maxSize = 0;
my @peakOrder = ();

open IN, $posFile;
while (<IN>) {
	chomp;
	s/\r//g;
	# comment lines carry no peak
	if (/^#/) {
		next;
	}
	my @cols = split /\t/;
	my ($peakID, $chrom, $begin, $finish) = @cols[0..3];
	# skip rows whose start column begins with something other than a digit, '-' or '.'
	next if ($begin =~ /^[^\d\-\.]/);

	my $regionLen = $finish-$begin+1;
	$size = $regionLen if ($updateSize == 0);

	# strand column may be '-'/'1' for minus; anything else counts as plus
	my $strand = ($cols[4] eq '-' || $cols[4] eq '1') ? '-' : '+';
	# optional 6th and 7th columns
	my $score = (@cols > 5) ? $cols[5] : 0;
	my $fdrValue = (@cols > 6) ? $cols[6] : 'NA';

	$avgSize += $regionLen;
	$avgSizeN++;
	if ($regionLen > $maxSize) {
		$maxSize = $regionLen;
	}

	# one comparison record per -cmpGenome entry
	my @genomeCmp = ();
	if ($cmpGenomeFlag) {
		for my $cmpIdx (0 .. $#cmpGenomes) {
			push(@genomeCmp, {map=>0,score=>0,indel=>0,var=>0,pid=>'NA',paln=>0});
		}
	}

	$peaks{$peakID} = {
		tss=>'NA', tssDist=>'NA', tssUG=>'NA', ann=>'NA', fullann=>'NA',
		cons=>'NA', gc=>'NA', cpg=>'NA',
		c=>$chrom, s=>$begin, e=>$finish, d=>$strand, v=>$score, fdr=>$fdrValue,
		size=>$regionLen,
		m=>{}, p=>{}, t=>{},
		centerDist=>1e10, centerDir=>1e10, centerScore=>-10,
		gComp=>\@genomeCmp,
	};
	push(@peakOrder, $peakID);
}
close IN;

# With a fixed size the "average" is that size; otherwise it is the mean of
# the observed region sizes.
my $halfSize = 0;
if ($size eq 'given') {
	$avgSize /= $avgSizeN if ($avgSizeN > 0);
} else {
	$halfSize = floor($size/2);
	$avgSize = $size;
}

# Regions longer than $maxHomer2SeqLength get an additional FASTA copy of the
# extracted sequence (registered for deletion).
if (-e "$seqFile") {
	if ($maxSize > $maxHomer2SeqLength) {
		`tab2fasta.pl "$seqFile" > "$seqFaFile"`;
		$toDelete{$seqFaFile}=1;
	}
}

# placeholder: nothing is done with the SNP file at this point
if ($seqFlag && $snpFile ne '') {
	#open IN, $ARGV[0];
}


# -center mode: scan every region for the motif(s) in $centerMotif, re-center
# each region on a motif instance, print the new positions to STDOUT and exit.
# Nothing after this block runs when -center is used.
if ($centerMotif ne '') {
	print STDERR "\tLooking for motifs to center regions...\n";

	# Offset passed to the scanner's -offset option; 0 when regions keep their given sizes.
	my $offset = -1*$halfSize;
	#my $offset = -1*$halfSize+$sizeMove;
	#print STDERR "halfSize=$halfSize\nsizeMove=$sizeMove\noffset=$offset\n";
	if ($size eq 'given') {
		$offset = 0;
	}

	my $mfile = $centerMotif;
	my $options = '';

	# homer2 reads the tab-delimited sequence file with -s, or the FASTA copy
	# with -i when the longest region exceeds $maxHomer2SeqLength.
	my $seqFile2Use = " -s \"$seqFile\"";
	if ($maxSize > $maxHomer2SeqLength) {
		$seqFile2Use = " -i \"$seqFaFile\"";
	}

	if ($homer2Flag) {
		$options .= " -strand +" if ($revoppFlag == 0);
		$options .= " -p $maxCPUs";
		if ($mscoreFlag) {
			`homer2 find $seqFile2Use -offset $offset -m "$mfile" -mscore $options > "$tmpfile"`;
		} else {
			`homer2 find $seqFile2Use -offset $offset -m "$mfile" $options > "$tmpfile"`;
			#print STDERR "`homer2 find $seqFile2Use -offset $offset -m $mfile $options > $tmpfile`;\n";
		}
	} else {
		# original homer executable (always uses the tab-delimited sequence file)
		$options .= " -norevopp" if ($revoppFlag == 0);
		if ($mscoreFlag) {
			`homer -s "$seqFile" -a GENESCORE -offset $offset -m "$mfile" $options > "$tmpfile"`;
		} else {
			`homer -s "$seqFile" -a FIND -offset $offset -m "$mfile" $options > "$tmpfile"`;
		}
	}
	my $numMultiMotifs = 0;

	# Parse the scanner output, one motif instance per line.
	open IN, $tmpfile;
	while (<IN>) {
		chomp;
		my @line = split /\t/;
		my $hit = "";
		my $pos = "";
		my $seq = "";
		my $con = 0;
		my $dir = 0;
		my $score = 0;
		# The two scanners are read with different column layouts: the score
		# comes from column 5 for homer2 and column 6 for homer, and only
		# homer supplies a value for $con (column 3).
		if ($homer2Flag) {
			$hit = $line[0];
			$pos = $line[1];
			$seq = $line[2];
			$con = 0;
			$dir = $line[4];
			$score = $line[5];
		} else {
			$hit = $line[0];
			$pos = $line[1];
			$seq = $line[2];
			$con = $line[3];
			$dir = $line[4];
			$score = $line[6];
		}
		# Normalize strand to 0 (plus) / 1 (minus); for the original homer
		# the minus-strand position is shifted by the motif length - 1.
		if ($dir eq '+' || $dir eq '0') {
			$dir = 0;
		} else {
			$dir = 1;
			$pos += length($seq)-1 unless ($homer2Flag);
		}
		if ($mirrorFlag==1) {
			$pos *= -1;
		}
		# With given sizes the offset was 0, so shift the position by half the
		# region length to make it relative to the region midpoint.
		if ($size eq 'given') {
			my $L = $peaks{$hit}->{'e'}-$peaks{$hit}->{'s'};
			$pos = $pos-floor($L/2);
		}
				
		#provide information about centering
		# keep the instance with the smallest absolute distance seen so far
		if (abs($pos) < abs($peaks{$hit}->{'centerDist'})) {
			$peaks{$hit}->{'centerDist'} = $pos;
			$peaks{$hit}->{'centerDir'} = $dir;
			$peaks{$hit}->{'centerScore'} = $score;
		}
		# -multi: emit every motif instance as its own shifted region
		if ($multiFlag) {
			my $chr = $peaks{$hit}->{'c'};
			my $start = $peaks{$hit}->{'s'};
			my $end = $peaks{$hit}->{'e'};
			my $v = $peaks{$hit}->{'v'};
			my $dd = $peaks{$hit}->{'d'};
			$dd = 1 if ($dd eq '-');
			$dd = 0 if ($dd eq '+');
			# shift along the region's own strand; a minus-strand motif flips
			# the reported strand
			if ($dd == 0) {
				$start += $pos;
				$end += $pos;
				$dd = 1 if ($dir == 1);
			} else {
				$start -= $pos;
				$end -= $pos;
				$dd = 0 if ($dir == 1);
			}
			print "$hit\t$chr\t$start\t$end\t$dd\t$v\t$score\n";
			$numMultiMotifs++;
		}
	}
	close IN;
	`rm "$tmpfile" "$posFile" "$seqFile"`;
	if ($multiFlag) {
		deleteFiles();
		print STDERR "\tCentered on $numMultiMotifs motifs total\n\n";
		exit;
	}

	# Single-instance mode: print one re-centered region per peak that had a
	# usable motif instance.
	my $total = 0;
	my $goodPeaks = 0;
	my $totalChange = 0;
	foreach(keys %peaks) {
		$total++;
		my $hit = $_;
		# a centerDist above 1e8 is treated as "no motif found"
		# (presumably the untouched initial value - confirm where peaks are created)
		next if ($peaks{$hit}->{'centerDist'} > 1e8);
		if ($size ne 'given') {
			next if (abs($peaks{$hit}->{'centerDist'}) > $size);
		}
		$goodPeaks++;
		my $start = $peaks{$hit}->{'s'};
		my $end = $peaks{$hit}->{'e'};
		my $dir = $peaks{$hit}->{'d'};
		if ($dir eq '+' || $dir eq '0') {
			$dir = 0;
		} else {
			$dir = 1;
		}
		my $chr = $peaks{$hit}->{'c'};
		my $v = $peaks{$hit}->{'v'};

		my $posOffset = $peaks{$hit}->{'centerDist'};
		my $score = $peaks{$hit}->{'centerScore'};
		$totalChange+=abs($posOffset);

		# same shift/strand-flip rule as in -multi mode above
		my $changeDir = $peaks{$hit}->{'centerDir'};
		if ($dir == 0) {
			$start += $posOffset;
			$end += $posOffset;
			$dir = 1 if ($changeDir == 1);
		} else {
			$start -= $posOffset;
			$end -= $posOffset;
			$dir = 0 if ($changeDir == 1);
		}
		print "$hit\t$chr\t$start\t$end\t$dir\t$v\t$score\n";
	}
	# summary statistics go to STDERR so STDOUT stays a clean peak file
	my $avgChange = 'N/A';
	my $status = 'failed';
	if ($goodPeaks > 0) {
		$avgChange = $totalChange/$goodPeaks;
		$status = "successful";
	}
	print STDERR "\nTotal Peaks/Regions:     $total\n";
	print STDERR "Total Peaks re-centered: $goodPeaks\n";
	print STDERR "Avg. Adjustement size:   $avgChange\n";
	print STDERR "\nPeak/Region centering was $status\n"; 
	deleteFiles();
	exit;
}


# -cons: extract conservation for every position and store the per-locus
# average in each peak's 'cons' field.  If the conservation data is not
# installed the whole step is skipped and $consFlag is cleared.
if ($consFlag == 1) {
	# chr1.fa is used as a probe for the presence of conservation data
	if (open(my $probe, '<', "$consDir/chr1.fa")) {
		close $probe;
		print STDERR "\tExtracting Conservation...\n";
		`homerTools extract "$posFile" "$consDir" > "$consFile"`;

		# Only average when the extraction above actually ran; previously
		# this also ran against a $consFile that was never created.
		print STDERR "\tCalculating average conservation...\n";
		`conservationPerLocus.pl "$consFile" > "$tmpfile"`;
		if (open(my $consIn, '<', $tmpfile)) {
			while (my $row = <$consIn>) {
				chomp $row;
				my @line = split /\t/, $row;
				next if (!exists($peaks{$line[0]}));
				$peaks{$line[0]}->{'cons'} = $line[1];
			}
			close $consIn;
		}
		`rm "$tmpfile"`;
	} else {
		print STDERR "Conservation information not present in $consDir (refer to documentation)\n";
		print STDERR "Skipping Conservation\n";
		$consFlag = 0;
	}
}

# Fill in the 'cpg' and 'gc' fields of each peak from homerTools freq output
# (column 0 = peak ID, column 1 = CpG, column 2 = GC).
if ($seqFlag==1 && $cpgFlag == 1) {
	print STDERR "\tCalculating CpG/GC Content of Peaks\n";
	`homerTools freq "$seqFile" -gc "$tmpfile" > /dev/null`;
	open IN, $tmpfile;
	while (<IN>) {
		chomp;
		my @cols = split /\t/;
		my ($peakID, $cpgContent, $gcContent) = @cols[0..2];
		# ignore rows for IDs that are not known peaks
		unless (exists($peaks{$peakID})) {
			next;
		}
		my $peak = $peaks{$peakID};
		$peak->{'cpg'} = $cpgContent;
		$peak->{'gc'} = $gcContent;
	}
	close IN;
	`rm "$tmpfile"`;
}

#find motifs in fragments
# %motifNames records which motif names have been seen; @motifNames keeps
# them in first-seen order.
my %motifNames = ();
my @motifOrder = ();

print STDERR "\t-----------------------\n";

# Offset passed to the motif scanners via -offset; 0 when regions keep their
# given sizes.
my $offset = -1*$halfSize+$sizeMove;
if ($size eq 'given') {
	$offset = 0;
}
my @motifNames = ();

# Optional motif BED / motif peak output.  The MBED handle stays open for the
# motif scan that follows.
if ($mbedFile ne '') {
	open(MBED, '>', $mbedFile) or print STDERR "Could not open file $mbedFile for writing!\n";
	# only the BED flavour gets a track header line
	if ($mpeak==0) {
		print MBED "track name=\"$ogPeakFile motifs\" description=\"$ogPeakFile motifs\""
					. " visibility=\"pack\" useScore=\"1\"\n";
	}
}

# Order the motif files by their filter code; as strings '-fm' sorts ahead of '-m'.
@motifFiles = sort {$filterMotifFiles{$a} cmp $filterMotifFiles{$b}} @motifFiles;
my @copyMotifFiles = @motifFiles;
my %motifMask = ();

# Optional FASTA output of motif site sequences; report a failed open the
# same way as for the BED file instead of failing silently.
if ($mfastaFile ne '') {
	open(MFASTA, '>', $mfastaFile) or print STDERR "Could not open file $mfastaFile for writing!\n";
}

# Scan the extracted sequences with every motif file and attach each motif
# instance to its peak record ($peaks{$hit}->{'m'}->{$mname}).  Instances are
# also written to the motif BED/peak and FASTA outputs when those are enabled.
for (my $i=0;$i<@motifFiles;$i++) {
	if ($i==0) {
		print STDERR "\tLooking for Motifs...\n";
	}
	my $mfile = $motifFiles[$i];


	# First pass over the motif file: register the name (2nd column) of every
	# header line (lines starting with '>') so that motifs without any hit are
	# still known.
	open IN, $mfile;
	while (<IN>) {
		chomp;
		s/\r//g;
		my @line = split /\t/;
		next if (@line < 2);
		if ($line[0] =~ /^>/) {
			my $mname = $line[1];

			if (!exists($motifNames{$mname})) {
				$motifNames{$mname} = 1;
				push(@motifNames, $mname);
			}
		}
	}
	close IN;



	my $options = '';
	# Filter code for this motif file: '-fm' and '-m' get special treatment
	# when the hits are read below.
	my $code = $filterMotifFiles{$mfile};

	if ($homer2Flag) {
		$options .= " -strand +" if ($revoppFlag == 0);
		$options .= " -p $maxCPUs";
		my $seqFile2Use = " -s \"$seqFile\"";
		if ($maxSize > $maxHomer2SeqLength) {
			print STDERR "!!! Unfortunately, HOMER cannot currently scan sequences >1e7 in length)\n";
			print STDERR "!!! Try using scanMotifGenomeWide.pl to scan for motifs in large FASTA files\n";
			exit;
			# NOTE(review): unreachable - the exit above always fires first
			$seqFile2Use = " -i \"$seqFaFile\"";
		}
		if ($mscoreFlag) {
			`homer2 find $seqFile2Use -offset $offset -m "$mfile" -mscore $options > "$tmpfile"`;
		} else {
			#print STDERR "Here(maxSize=$maxSize, $seqFile2Use): `homer2 find $seqFile2Use -offset $offset -m $mfile $options > $tmpfile`;\n";
			`homer2 find $seqFile2Use -offset $offset -m "$mfile" $options > "$tmpfile"`;
		}
	} else {
		# original homer executable
		$options .= " -norevopp" if ($revoppFlag == 0);
		if ($mscoreFlag) {
			`homer -s "$seqFile" -a GENESCORE -offset $offset -m "$mfile" $options > "$tmpfile"`;
		} else {
			`homer -s "$seqFile" -a FIND -offset $offset -m "$mfile" $options > "$tmpfile"`;
		}
	}

	# With -cons, pass the hits through getSiteConservation.pl before reading them.
	if ($consFlag==1) {
		`mv "$tmpfile" "$tmpfile2"`;
		`getSiteConservation.pl "$consFile" "$tmpfile2" $offset > "$tmpfile"`;
		`rm "$tmpfile2"`;
	}

	# Second pass: read the motif instances, one per line.
	open IN, $tmpfile;
	while (<IN>) {
		chomp;
		my @line = split /\t/;
		my $hit = $line[0];
		my $pos = $line[1];
		my $seq = $line[2];
		my $con = $line[3];
		my $dir = $line[4];
		my $mname = $line[5];
		my $score = 0;
		# homer2 puts the motif name in column 3 and the score in column 5 and
		# has no conservation column; its minus-strand positions are moved
		# back by the motif length - 1.
		if ($homer2Flag) {
			$mname = $line[3];
			$score = $line[5];
			$con = 0 ;
			if ($dir eq '-' || $dir eq '1') {
				$pos -= length($seq)-1;
			}
		} else {
			$score = $line[6];
		}
		my $midpoint = $pos + floor(length($seq)/2);

		# FASTA output: minus-strand sites are passed through HomerConfig::revopp
		if ($mfastaFile ne '') {
			my $mfastaSeq = $seq;
			if ($dir eq '-' || $dir eq '1') {
				$mfastaSeq = HomerConfig::revopp($mfastaSeq);
			}
			print MFASTA ">$mname $score $hit $pos $dir\n$mfastaSeq\n";

		}


		# '-fm' motifs are never recorded; their sites only mark positions in
		# %mask.  '-m' motifs are dropped when they touch a marked position.
		# NOTE(review): the range $pos..$pos+length($seq) is inclusive, i.e.
		# one position more than the motif length - confirm this is intended.
		if ($code eq '-fm') {
			if (!exists($mask{$hit})) {
				my %a = ();
				$mask{$hit} = \%a;
			}
			for (my $j=$pos;$j<=$pos+length($seq);$j++) {
				$mask{$hit}->{$j}=1;
			}
			next;
		} elsif ($code eq '-m') {
			my $bad = 0;
			if (exists($mask{$hit})) {
				for (my $j=$pos;$j<=$pos+length($seq);$j++) {
					if (exists($mask{$hit}->{$j})) {
						$bad = 1;
						last;
					}
				}
			}
			if ($bad) {
				next;
			}
		}


		# Translate the region-relative position into genomic coordinates.
		# For minus-strand regions the site is measured back from the region end.
		my $pChr = $peaks{$hit}->{'c'};
		my $pStart = $peaks{$hit}->{'s'};
		my $pDir = $peaks{$hit}->{'d'};
		my $bedStart = $pStart + $pos - $offset-1;
		my $bedEnd = $bedStart + length($seq);
		if ($pDir eq '1' || $pDir eq '-') {
			$bedEnd = $peaks{$hit}->{'e'} - $pos + $offset;
			$bedStart = $bedEnd - length($seq);
		}
		# Genomic strand of the site: '-' when exactly one of the motif strand
		# and the region strand is minus, '+' otherwise.
		my $bedDir = '+';
		if ($dir eq '-' || $dir eq '1') {
			$bedDir = '-';
			if ($pDir eq '-' || $pDir eq '1') {
				$bedDir = '+';
			}
		} else {
			if ($pDir eq '-' || $pDir eq '1') {
				$bedDir = '-';
			}
		}

		if ($mbedFile ne '') {
			if ($mpeak==0) {
				# BED line
				my $ss=$score;
				print MBED "$pChr\t$bedStart\t$bedEnd\t$mname\t$ss\t$bedDir\n";
			} else {
				# peak-file line: start is incremented by one relative to the BED form
				$bedStart += 1;
				print MBED "$mname\t$pChr\t$bedStart\t$bedEnd\t$bedDir\t$score\t$seq\n";
			}
		}
		if (!exists($peaks{$hit}->{'m'}->{$mname})) {
			my @a = ();
			$peaks{$hit}->{'m'}->{$mname} = \@a;
		}

		# one comparison record per -cmpGenome entry
		my @compData = ();
		if ($cmpGenomeFlag) {
			foreach(@cmpGenomes) {
				my $compData = {map=>0,score=>0,indel=>0,var=>0,m=>0};
				push(@compData, $compData);
			}
		}


		# Record for this motif instance; gc/gs/ge/gd hold the genomic
		# chromosome/start/end/strand computed above.
		my $m = {p=>$pos, s=>$seq, c=>$con, d=>$dir, valid=>1,score=>$score,mp=>$midpoint,
					gc=>$pChr,gs=>$bedStart,ge=>$bedEnd,gd=>$bedDir,gComp=>\@compData};
		push(@{$peaks{$hit}->{'m'}->{$mname}}, $m);

		# -rm / -rmrevopp: mark instances of the same motif that lie closer
		# than $rmMotifThresh as invalid.  $removeCloseMotifs == 1 compares
		# same-strand pairs, -1 compares opposite-strand pairs.  All pairs
		# for this peak/motif are re-examined every time an instance is added.
		if ($removeCloseMotifs != 0) {
			my $num = scalar(@{$peaks{$hit}->{'m'}->{$mname}});
			# this $i is local to the loop and hides the motif-file index
			for (my $i=0;$i<$num;$i++) {
				next if ($peaks{$hit}->{'m'}->{$mname}->[$i]->{'valid'}==0);
				my $p1 = $peaks{$hit}->{'m'}->{$mname}->[$i]->{'p'};
				my $d1 = $peaks{$hit}->{'m'}->{$mname}->[$i]->{'d'};
				for (my $j=0;$j<$num;$j++) {
					next if ($j==$i);
					my $p2 = $peaks{$hit}->{'m'}->{$mname}->[$j]->{'p'};
					my $d2 = $peaks{$hit}->{'m'}->{$mname}->[$j]->{'d'};
					if (abs($p1-$p2) < $rmMotifThresh) {
						if (($removeCloseMotifs == 1 && $d2 eq $d1) 
								|| ($removeCloseMotifs == -1 && $d2 ne $d1)) {
							# with -rmrevopp only the second instance of the pair is invalidated
							$peaks{$hit}->{'m'}->{$mname}->[$i]->{'valid'}=0 if ($removeRevoppMotifs == 0);
							$peaks{$hit}->{'m'}->{$mname}->[$j]->{'valid'}=0;
						}
					}
				}
			}
		}

		# register motif names that only show up in the scanner output
		if (!exists($motifNames{$mname})) {
			$motifNames{$mname} = 1;
			push(@motifNames, $mname);
		}


		#provide information about centering
	}
	close IN;
	`rm "$tmpfile"`;
}
# Motif scanning is done: close the optional output files and, if requested,
# write the motif orientation statistics.
close MBED if ($mbedFile ne '');
close MFASTA if ($mfastaFile ne '');
calcMotifLogic(\%peaks,\@motifNames,$strandFlag,$mlogicFile) if ($mlogicFile ne '');


## cmpGenomes section ....................................................................
# For genome comparisons, give every motif a zeroed matrix with one
# [0,0,0,0] row per non-header line of its definition, keyed by motif name
# (2nd column of the '>' header line).
my %motifSubMatrix = ();
if ($cmpGenomeFlag) {
	foreach my $mfile (@motifFiles) {
		open IN, $mfile;
		my $curName = '';
		# rows seen before the first header land in a matrix that is never stored
		my $rows = [];
		while (<IN>) {
			chomp;
			s/\r//g;
			my @cols = split /\t/;
			unless ($cols[0] =~ /^>/) {
				push(@$rows, [0, 0, 0, 0]);
				next;
			}
			# header line: store the finished motif and start a new one
			$motifSubMatrix{$curName} = $rows if ($curName ne '');
			$curName = $cols[1];
			$rows = [];
		}
		close IN;
		# store the last motif of the file
		$motifSubMatrix{$curName} = $rows if ($curName ne '');
	}
}
my @cpeaks = ();
for (my $z=0;$z<@cmpGenomes;$z++) {
	my $liftOver = $cmpLiftover[$z];
	my $revliftOver = "";
	$revliftOver = $revLiftover[$z] if (@revLiftover > $z);
	my $cgenome = $cmpGenomes[$z];
	my $cgenomeDir = $cmpGenomeInfo{$cgenome}->{'dir'};
	my $cmaskFlag = $cmpGenomeInfo{$cgenome}->{'mask'};
	#$cmpGenomeInfo{$cmpGenomes[$z]} = {genome=>$cgenome,dir=>$cgenomeDir,custom=>$ccustomGenome,mask=>$cmaskFlag};
	

	my $extractOption = "";
	if ($cmaskFlag) {
		$extractOption = "-mask";
	}

	print STDERR "\n\tComparing motifs in genome $cgenome\n\n";

	print STDERR "\tLifting over peak positions for $cgenome\n";
	`convertCoordinates.pl "$liftOver" "$posFile" "$tmpfile" -type peaks -p $maxCPUs`;
	print STDERR "\tExtracting sequence for liftover positions in $cgenome\n";
	`homerTools extract "$tmpfile" "$cgenomeDir" $extractOption > "$tmpfile2"`;


	# compare sequences using blastn.....................
	if ($skipBlastn == 0) {
		print STDERR "\tChecking genome peak alignments\n";
		my %seq1 = ();
		my %seq2 = ();
		open IN, $seqFile;
		while (<IN>) {
			chomp;
			my @line = split /\t/;
			$seq1{$line[0]}=$line[1];
		}
		close IN;
		open IN, $tmpfile2;
		while (<IN>) {
			chomp;
			my @line = split /\t/;
			$seq2{$line[0]}=$line[1];
			}
		close IN;
		my $counter = 0;
		foreach(keys %peaks) {
			$counter++;
			if ($counter % 100 ==0) {
				print STDERR "\t\t$counter\n";
			}
			my $hit = $_;
			my $s1 = '';
			my $s2 = '';
			if (!exists($seq1{$hit}) || !exists($seq2{$hit})) {
				$peaks{$hit}->{'gComp'}->[$z]->{'pid'} = "NA";
					$peaks{$hit}->{'gComp'}->[$z]->{'paln'} = 0;
				next;
			}
			my $length = $peaks{$hit}->{'e'} - $peaks{$hit}->{'s'};
			$length = 1 if ($length < 1);
			$s1 = $seq1{$hit};
			$s2 = $seq2{$hit};
			#print STDERR "s1=$s1\ns2=$s2\n";
			open OUT, ">$tmpfile5";
			print OUT ">$hit\n$s1\n";
			close OUT;
			open OUT, ">$tmpfile6";
			print OUT ">$hit\n$s2\n";
			close OUT;
			`blastn -query "$tmpfile5" -subject "$tmpfile6" -outfmt 6 > $tmpfile7`;
			open IN, $tmpfile7;
			while (<IN>) {
				chomp;
				my @line = split /\t/;
				my $pidentity = $line[2];
				my $aln  = $line[3]/$length;
				$aln = 1 if ($aln > 1);
				$peaks{$hit}->{'gComp'}->[$z]->{'pid'} = $pidentity;
				$peaks{$hit}->{'gComp'}->[$z]->{'paln'} = $aln;
				last;
			}
			close IN;
			`rm $tmpfile5 $tmpfile6 $tmpfile7`;
		}
	}

	my %cpeaks = ();
	open IN, $tmpfile;
	while (<IN>) {
		chomp;
		s/\r//g;
		next if (/^#/);
		my @line = split /\t/;
		my $hit = $line[0];
		my $chr = $line[1];
		my $start = $line[2];
		next if ($start =~ /^[^\d\-\.]/);
		my $end = $line[3];
		my $peakSize = $end-$start+1;
		if ($updateSize == 0) {
			$size = $peakSize;
		}
		my $direction = '+';
		if ($line[4] eq '-' || $line[4] eq '1') {
			$direction = '-';
		}
		my $value = 0;
		my $fdr = 'NA';
		if (@line > 5) {
			$value = $line[5];
		}
		my %a = ();
		my %b = ();
		my %c = ();

		$cpeaks{$hit} = {tss=>'NA', tssDist=>'NA',m=>\%a, p=>\%c, cons=>'NA', size=>$peakSize, tssUG=>'NA',
				s=>$start, e=>$end, d=>$direction, v=>$value, c=>$chr, t=>\%b, gc=>'NA',cpg=>'NA',
				centerDist=>1e10,centerDir=>1e10,centerScore=>-10,fdr=>$fdr,ann=>'NA',
				fullann=>'NA'};

		if (!exists($peaks{$hit})) {
			print STDERR "!!! Something is wrong - can't find $hit in original peak file!\n";
			exit;
		}
		#let us know that this peak was 'liftoverable' - should add more stats like # mutations;
		$peaks{$hit}->{'gComp'}->[$z]->{'map'} = 1;
	}
	close IN;



	print STDERR "\tChecking motifs across genomes\n";

	my %cmask = ();
	for (my $i=0;$i<@motifFiles;$i++) {
		my $mfile = $motifFiles[$i];
		#print STDERR "$mfile\n";
		my $options = '';
		my $code = $filterMotifFiles{$mfile};

		if ($homer2Flag) {
			$options .= " -strand +" if ($revoppFlag == 0);
			$options .= " -p $maxCPUs";
			if ($mscoreFlag) {
				`homer2 find -s "$tmpfile2" -offset $offset -m "$mfile" -mscore $options > "$tmpfile3"`;
			} else {
				`homer2 find -s "$tmpfile2" -offset $offset -m "$mfile" $options > "$tmpfile3"`;
			}
		} else {
			$options .= " -norevopp" if ($revoppFlag == 0);
			if ($mscoreFlag) {
				`homer -s "$tmpfile2" -a GENESCORE -offset $offset -m "$mfile" $options > "$tmpfile3"`;
			} else {
				`homer -s "$tmpfile2" -a FIND -offset $offset -m "$mfile" $options > "$tmpfile3"`;
			}
		}

		open IN, $tmpfile3;
		while (<IN>) {
			chomp;
			my @line = split /\t/;
			my $hit = $line[0];
			if (!exists($cpeaks{$hit})) {
				print STDERR "!!!!!!! problem can't find $hit in cpeaks\n";
			}
			my $pos = $line[1];
			my $seq = $line[2];
			my $con = $line[3];
			my $dir = $line[4];
			my $mname = $line[5];
			my $score = 0;
				if ($homer2Flag) {
				$mname = $line[3];
				$score = $line[5];
				$con = 0 ;
				if ($dir eq '-' || $dir eq '1') {
					$pos -= length($seq)-1;
				}
			} else {
				$score = $line[6];
			}
			my $midpoint = $pos + floor(length($seq)/2);
	
			if ($code eq '-fm') {
				if (!exists($cmask{$hit})) {
						my %a = ();
					$cmask{$hit} = \%a;
				}
				for (my $j=$pos;$j<=$pos+length($seq);$j++) {
					$cmask{$hit}->{$j}=1;
				}
				next;
			} elsif ($code eq '-m') {
				my $bad = 0;
				if (exists($cmask{$hit})) {
					for (my $j=$pos;$j<=$pos+length($seq);$j++) {
						if (exists($cmask{$hit}->{$j})) {
							$bad = 1;
							last;
						}
					}
				}
				if ($bad) {
					next;
				}
			}

			my $pChr = $cpeaks{$hit}->{'c'};
			my $pStart = $cpeaks{$hit}->{'s'};
			my $pDir = $cpeaks{$hit}->{'d'};
			my $bedStart = $pStart + $pos - $offset-1;
			my $bedEnd = $bedStart + length($seq);
			if ($pDir eq '1' || $pDir eq '-') {
				$bedEnd = $cpeaks{$hit}->{'e'} - $pos + $offset;
				$bedStart = $bedEnd - length($seq);
			}
			my $bedDir = '+';
			if ($dir eq '-' || $dir eq '1') {
				$bedDir = '-';
				if ($pDir eq '-' || $pDir eq '1') {
					$bedDir = '+';
				}
			} else {
				if ($pDir eq '-' || $pDir eq '1') {
					$bedDir = '-';
				}
			}

			if (!exists($cpeaks{$hit}->{'m'}->{$mname})) {
				my @a = ();
				$cpeaks{$hit}->{'m'}->{$mname} = \@a;
			}		
			my @compData=();
			if ($cmpGenomeFlag) {
				my $compData = {map=>0,score=>0,indel=>0,var=>0,m=>0};
				push(@compData, $compData);
			}


			my $m = {p=>$pos, s=>$seq, c=>$con, d=>$dir, valid=>1,score=>$score,mp=>$midpoint,
					gc=>$pChr,gs=>$bedStart,ge=>$bedEnd,gd=>$bedDir,gComp=>\@compData};
			push(@{$cpeaks{$hit}->{'m'}->{$mname}}, $m);

			if ($removeCloseMotifs != 0) {
				my $num = scalar(@{$cpeaks{$hit}->{'m'}->{$mname}});
				for (my $i=0;$i<$num;$i++) {
					next if ($cpeaks{$hit}->{'m'}->{$mname}->[$i]->{'valid'}==0);
					my $p1 = $cpeaks{$hit}->{'m'}->{$mname}->[$i]->{'p'};
					my $d1 = $cpeaks{$hit}->{'m'}->{$mname}->[$i]->{'d'};
					for (my $j=0;$j<$num;$j++) {
						next if ($j==$i);
						my $p2 = $cpeaks{$hit}->{'m'}->{$mname}->[$j]->{'p'};
						my $d2 = $cpeaks{$hit}->{'m'}->{$mname}->[$j]->{'d'};
						if (abs($p1-$p2) < $rmMotifThresh) {
							if (($removeCloseMotifs == 1 && $d2 eq $d1) 
									|| ($removeCloseMotifs == -1 && $d2 ne $d1)) {
								$cpeaks{$hit}->{'m'}->{$mname}->[$i]->{'valid'}=0 if ($removeRevoppMotifs == 0);
								$cpeaks{$hit}->{'m'}->{$mname}->[$j]->{'valid'}=0;
							}
						}
					}
				}
			}
		}
		close IN;
		`rm "$tmpfile3"`;
	}



	open MBED, ">$tmpfile";
	foreach(keys %peaks) {
		my $hit = $_;
		foreach(keys %{$peaks{$hit}->{'m'}}) {
			my $mname= $_;
			next if (!exists($peaks{$hit}->{'m'}->{$mname}));
			my $i=0;
			foreach(@{$peaks{$hit}->{'m'}->{$mname}}) {
				my $m = $_;
				my $seq = $m->{'s'};
				my $c = $m->{'gc'};
				my $s = $m->{'gs'};
				my $e = $m->{'ge'};
				my $d = $m->{'gd'};
				my $ss = floor($m->{'score'});
				my $name = $hit . "||" . $mname . "||" . $i . "||" . $m->{'p'} . "||" . $m->{'d'} . "||" . $seq;
				print MBED "$c\t$s\t$e\t$name\t$ss\t$d\n";
				$i++;
			}
		}
	}
	close MBED;

	print STDERR "\tLifting over motif positions for $cgenome\n";
	print STDERR "`convertCoordinates.pl $liftOver $tmpfile $tmpfile2 -type bed -p $maxCPUs`;\n";
	`convertCoordinates.pl "$liftOver" "$tmpfile" "$tmpfile2" -type bed -p $maxCPUs`;
#exit;
	print STDERR "\tExtracting sequence for liftover positions in $cgenome\n";
	`homerTools extract "$tmpfile2" "$cgenomeDir" $extractOption > "$tmpfile3"`;

	open IN, $tmpfile3;
	while (<IN>) {
		chomp;
		my @line = split /\t/;
		my $sname = $line[0];
		my $seq = $line[1];

		my @info = split /\|\|/, $sname;
		next if (@info < 4);
		my $hit = $info[0];
		my $mname = $info[1];
		my $mindex = $info[2];
		my $p = $info[3];
		my $strand = $info[4];
		my $ogseq = $info[5];

		if (!exists($peaks{$hit})) {
			print STDERR "!! Problem - couldn't find peak name...\n";
			exit;
		}
		if (!exists($peaks{$hit}->{'m'}->{$mname})) {
			print STDERR "!! Couldn't find $mname in peak $hit\n";
			exit;
		}

		$peaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[$z]->{'map'}=1;
		
		my $rv = $peaks{$hit}->{'m'}->{$mname}->[$mindex]->{'d'};
		if ($rv eq '-') {
			$seq = HomerConfig::revopp($seq);
		}
		#print STDERR ">$hit $rv\n\t$ogseq\n\t$seq\n";
		if ($ogseq eq $seq) {
			$peaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[$z]->{'m'}=1;
		} else {
			$peaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[$z]->{'m'}=0;
			if (length($ogseq) != length($seq)) {
				$peaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[$z]->{'indel'}=1;
			} else {
				my @subs = getSubs($ogseq,$seq);
				foreach(@subs) {
					my $p1 = $_->[0];
					my $n1 = $_->[1];
					next if (!exists($alpha2index{$n1}));
					my $p2 = $alpha2index{$n1};
					$motifSubMatrix{$mname}->[$p1]->[$p2]+=1;
				}
			}
		}
		#my $compData = {map=>0,score=>0,indel=>0,var=>0};
	}
	close IN;


	open MBED, ">$tmpfile";
	foreach(keys %cpeaks) {
		my $hit = $_;
		foreach(keys %{$cpeaks{$hit}->{'m'}}) {
			my $mname= $_;
			next if (!exists($cpeaks{$hit}->{'m'}->{$mname}));
			my $i=0;
			foreach(@{$cpeaks{$hit}->{'m'}->{$mname}}) {
				my $m = $_;
				my $seq = $m->{'s'};
				my $c = $m->{'gc'};
				my $s = $m->{'gs'};
				my $e = $m->{'ge'};
				my $d = $m->{'gd'};
				my $ss = floor($m->{'score'});
				my $name = $hit . "||" . $mname . "||" . $i . "||" . $m->{'p'} . "||" . $m->{'d'} . "||" . $seq;
				print MBED "$c\t$s\t$e\t$name\t$ss\t$d\n";
				$i++;
			}
		}
	}
	close MBED;

	print STDERR "\tLifting over motif positions from $cgenome back to $genome\n";
	print STDERR "`convertCoordinates.pl $revliftOver $tmpfile $tmpfile2 -type bed -p $maxCPUs`;\n";
	`convertCoordinates.pl "$revliftOver" "$tmpfile" "$tmpfile2" -type bed -p $maxCPUs`;
#exit;
	print STDERR "\tExtracting sequence for liftover positions in $cgenome\n";
	`homerTools extract "$tmpfile2" "$genomeDir" $extractOption > "$tmpfile3"`;

	open IN, $tmpfile3;
	while (<IN>) {
		chomp;
		my @line = split /\t/;
		my $sname = $line[0];
		my $seq = $line[1];

		my @info = split /\|\|/, $sname;
		next if (@info < 4);
		my $hit = $info[0];
		my $mname = $info[1];
		my $mindex = $info[2];
		my $p = $info[3];
		my $strand = $info[4];
		my $ogseq = $info[5];

		if (!exists($cpeaks{$hit})) {
			print STDERR "!! Problem - couldn't find peak name...\n";
			exit;
		}
		if (!exists($cpeaks{$hit}->{'m'}->{$mname})) {
			print STDERR "!! Couldn't find $mname in peak $hit\n";
			exit;
		}

		$cpeaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[0]->{'map'}=1;
		
		my $rv = $cpeaks{$hit}->{'m'}->{$mname}->[$mindex]->{'d'};
		if ($rv eq '-') {
			$seq = HomerConfig::revopp($seq);
		}
		#print STDERR ">$hit $rv\n\t$ogseq\n\t$seq\n";
		if ($ogseq eq $seq) {
			$cpeaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[0]->{'m'}=1;
		} else {
			$cpeaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[0]->{'m'}=0;
			if (length($ogseq) != length($seq)) {
				$cpeaks{$hit}->{'m'}->{$mname}->[$mindex]->{'gComp'}->[0]->{'indel'}=1;
			} else {
				my @subs = getSubs($ogseq,$seq);
				foreach(@subs) {
					my $p1 = $_->[0];
					my $n1 = $_->[1];
					next if (!exists($alpha2index{$n1}));
					my $p2 = $alpha2index{$n1};
					$motifSubMatrix{$mname}->[$p1]->[$p2]+=1;
				}
			}
		}
		#my $compData = {map=>0,score=>0,indel=>0,var=>0};
	}
	push(@cpeaks, \%cpeaks);
}

# When genome comparison is enabled, dump the per-motif substitution counts
# accumulated in %motifSubMatrix to "submatrix.motif" in the current directory.
# Each motif gets a ">name<TAB>name<TAB>0" header line followed by one
# tab-separated row of four columns (indices 0..3) per entry of its matrix.
if ($cmpGenomeFlag) {
	open OUT, ">submatrix.motif";
	foreach my $motifName (keys %motifSubMatrix) {
		print OUT ">$motifName\t$motifName\t0\n";
		foreach my $row (@{$motifSubMatrix{$motifName}}) {
			# first column without a leading tab, remaining three with one
			print OUT "$row->[0]";
			foreach my $col (1..3) {
				print OUT "\t$row->[$col]";
			}
			print OUT "\n";
		}
	}
	close OUT;
}

# getSubs($s1, $s2)
#
# Compares two strings position by position over the length of $s1 and
# returns a list of [index, character] pairs, one for every 0-based index
# where the character of $s2 differs from the character of $s1.  The
# character stored in each pair is the one taken from $s2.
# The visible callers only invoke this when both strings have equal length.
sub getSubs {
	my ($ref, $alt) = @_;
	my @diffs = ();
	my $idx = 0;
	while ($idx < length($ref)) {
		my $altChar = substr($alt,$idx,1);
		push(@diffs, [$idx,$altChar]) if (substr($ref,$idx,1) ne $altChar);
		$idx++;
	}
	return @diffs;
}
# End cmpGenomes section.................................


# find nearest Peaks
#
# For every file in @peakFiles:
#   1. run bed2pos.pl on it (output in $tmpfile) and annotateRelativePosition.pl
#      against $posFile (output in $tmpfile2);
#   2. record, for each known peak, the nearest-peak information under
#      $peaks{id}->{'p'}->{file} = {d, id, s, rs, pd};
#   3. run mergePeaks -cobound 1 and overwrite the 's' field with the number
#      of comma-separated entries found in column 8 of "$rand.coBoundBy1.txt".
# Temporary files are removed at the end of each iteration.
foreach my $peakFile (@peakFiles) {
	print STDERR "\tFinding nearby peaks in $peakFile\n";

	`bed2pos.pl "$peakFile" -check -unique > "$tmpfile"`;
	`annotateRelativePosition.pl "$posFile", "$tmpfile", 0 > "$tmpfile2"`;

	open IN, $tmpfile2;
	while (<IN>) {
		chomp;
		my @fields = split /\t/;
		my $hit = $fields[0];
		my $peakID = $fields[1];
		my $dist = $fields[2];
		my $relStrand = $fields[5];
		# overlap count starts at zero; it is filled in from the mergePeaks output below
		my $peakCount = 0;
		next unless (exists($peaks{$hit}));

		# optional 7th column: comma-separated "position=v1|v2|..." entries;
		# the non-'NA' values of each entry are summed per position
		my %peakDist = ();
		if (@fields > 6) {
			foreach my $entry (split /\,/,$fields[6]) {
				next if ($entry eq '');
				my @kv = split /\=/, $entry;
				my $sum = 0;
				foreach my $val (split /\|/,$kv[1]) {
					$sum += $val if ($val ne 'NA');
				}
				$peakDist{$kv[0]} = $sum;
			}
		}

		$peaks{$hit}->{'p'}->{$peakFile} = {d=>$dist,id=>$peakID,s=>$peakCount,rs=>$relStrand,pd=>\%peakDist};
	}
	close IN;


	# both strand settings request strand-aware merging; for '-' the peak
	# file is additionally strand-flipped in place first
	my $mergeOpt = '';
	if ($strandFlag eq '+' || $strandFlag eq '-') {
		$mergeOpt = " -strand ";
	}
	if ($strandFlag eq '-') {
		`adjustPeakFile.pl "$tmpfile" -flipStrand > "$tmpfile2"`;
		`cp "$tmpfile2" "$tmpfile"`;
	}
	`mergePeaks "$posFile" "$tmpfile" -cobound 1 -prefix "$rand" $mergeOpt`;

	open IN, $rand . ".coBoundBy1.txt";
	while (<IN>) {
		next if (/^\#/);
		chomp;
		my @cols = split /\t/;
		next if (@cols < 8);
		my $hit = $cols[0];
		next unless (exists($peaks{$hit}));
		my @overlapIDs = split /\,/,$cols[7];
		$peaks{$hit}->{'p'}->{$peakFile}->{'s'} = scalar(@overlapIDs);
	}
	close IN;
	`rm "$tmpfile" "$tmpfile2" "$rand.coBoundBy1.txt" "$rand.coBoundBy0.txt"`;

}

# find overlap with super enhancers
#
# For every file in @seFiles a hash {peakID => best rank} is built and pushed
# onto @seRanks (one hash per file, in @seFiles order).
#   - The rank of a super-enhancer entry is its 1-based order of appearance
#     among the accepted lines of the bed2pos.pl output.
#   - mergePeaks -cobound 1 is run against $posFile; column 8 of
#     "$rand.coBoundBy1.txt" is read as a comma-separated list of
#     super-enhancer IDs, and the smallest rank among them is kept.
# Peaks with no usable overlap get no entry in the hash.
my @seRanks = ();
for (my $i=0;$i<@seFiles;$i++) {
	my %seRanks = ();
	my $peakFile = $seFiles[$i];
	print STDERR "\tFinding overlap with super enhancers in file: $peakFile\n";

	# assign ranks in file order, skipping comments and lines whose
	# start/end columns contain no digits (e.g. header lines)
	my %rank = ();
	`bed2pos.pl -check -unique "$peakFile" > "$tmpfile"`;
	open IN, $tmpfile;
	my $rank = 1;
	while (<IN>) {
		chomp;
		s/\r//g;
		next if (/^#/);
		my @line = split /\t/;
		next if ($line[2] !~ /\d+/ || $line[3] !~ /\d+/);
		my $pid = $line[0];
		$rank{$pid} = $rank++;
	}
	close IN;

	# prefix is quoted the same way as in the nearest-peak section
	`mergePeaks "$posFile" "$tmpfile" -cobound 1 -prefix "$rand"`;
	open IN, $rand . ".coBoundBy1.txt";
	while (<IN>) {
		chomp;
		s/\r//g;
		next if (/^#/);
		my @line = split /\t/;
		# column index 7 is read below, so at least 8 columns are required
		next if (@line < 8);
		my $peakID = $line[0];
		next if (!exists($peaks{$peakID}));
		my @se = split /\,/, $line[7];
		my $bestRank = 1e9;
		foreach(@se) {
			if (!exists($rank{$_})) {
				print STDERR "Problem with SE overlap???\n";
				# an unknown ID has no rank; skipping it prevents an
				# undefined value from replacing $bestRank
				next;
			}
			my $r = $rank{$_};
			if ($r < $bestRank) {
				$bestRank = $r;
			}
		}
		# 1e9 is the "nothing found" sentinel set above
		if ($bestRank < 9e8) {
			$seRanks{$peakID} = $bestRank;
		}
	}
	close IN;	
	push(@seRanks, \%seRanks);
	`rm "$tmpfile" "$rand.coBoundBy0.txt" "$rand.coBoundBy1.txt"`;
}

# Motif co-occurrence matrix output (active when $matrixPrefix is non-empty).
#
# For every unordered pair of motifs in @motifNames (including a motif with
# itself) this section counts peaks containing each motif, peaks containing
# both within the allowed distance window, derives an expected overlap, an
# observed/expected ratio and a log p-value, and writes:
#   <prefix>.stats.txt            one line per motif pair
#   <prefix>.logPvalue.matrix.txt symmetric matrix of log p-values
#   <prefix>.ratio.matrix.txt     symmetric matrix of observed/expected ratios
#   <prefix>.count.matrix.txt     symmetric matrix of co-occurrence counts
# Statistics::logbinomial / Statistics::ilogbinomial and $avgSize are defined
# outside this section; their exact semantics are not visible here.
if ($matrixPrefix ne '') {

	my @peaks = keys %peaks;
	my $numPeaks = scalar(@peaks);
	open OUT, ">$matrixPrefix.stats.txt";
	# Header labels follow the order of the values printed for each pair
	# below: $n1 (peaks with Motif1) comes before $n2 (peaks with Motif2).
	print OUT "Motif1:Motif2\tTotalPeaks\tPeaks with Motif1\tPeaks with Motif2"
				. "\tPeaks with Both\tExpected Overlap\tObserved/Expected\tlogPvalue\n";

	print STDERR "\tCalculating motif co-occurrence matrix\n";
	my %matrix=();
	my %matrixRatio=();
	my %matrixCount=();
	my %numSites = ();

	for (my $i=0;$i<@motifNames;$i++) {
		for (my $j=$i;$j<@motifNames;$j++) {
			my $n1 = 0;        # peaks containing motif $i
			my $n2 = 0;        # peaks containing motif $j
			my $n = 0;         # peaks containing both within the distance window
			my $nmotifs1 = 0;  # motif $i instances summed over peaks (see counting loop)
			my $nmotifs2 = 0;  # motif $j instances summed over peaks

			my $curMinDist = $matrixMinDist;
			if ($i==$j && $curMinDist < 4) {
				# if checking against self, lets always eliminate double counting
				$curMinDist = 4;
			}
			foreach(@peaks) {
				my $pname = $_;
				my $p1 = 0;
				my $p2 = 0;
				my $nm1 = 0;
				my $nm2 = 0;
				if (exists($peaks{$pname}->{'m'}->{$motifNames[$i]})) {
					$p1=1;
					$n1++;
					$nm1=1;
					# this part counts how many motifs are present in the peak (> than curMinDist apart)
					# NOTE(review): compares stored midpoints ('mp') of consecutive array
					# entries; presumably relies on the instances being position-ordered - confirm
					my $narray = scalar(@{$peaks{$pname}->{'m'}->{$motifNames[$i]}});
					for (my $k=0;$k<$narray-1;$k++) {
						my $pos1 = $peaks{$pname}->{'m'}->{$motifNames[$i]}->[$k]->{'mp'};
						my $z = $k+1;
						while ($z < $narray && abs($pos1 -
									$peaks{$pname}->{'m'}->{$motifNames[$i]}->[$z]->{'mp'}) < $curMinDist) {
							$z++;
						}
						if ($z < $narray) {
							$nm1++;
							$k = $z;
						}
					}
					$nmotifs1 += $nm1;
					#print STDERR "$pname $narray $nm1\n";
				}
				if (exists($peaks{$pname}->{'m'}->{$motifNames[$j]})) {
					$p2=1;
					$n2++;
					$nm2=1;
					# same instance counting as above, for motif $j
					my $narray = scalar(@{$peaks{$pname}->{'m'}->{$motifNames[$j]}});
					for (my $k=0;$k<$narray-1;$k++) {
						my $pos1 = $peaks{$pname}->{'m'}->{$motifNames[$j]}->[$k]->{'mp'};
						my $z = $k+1;
						while ($z < $narray && abs($pos1 -
									$peaks{$pname}->{'m'}->{$motifNames[$j]}->[$z]->{'mp'}) < $curMinDist) {
							$z++;
						}
						if ($z < $narray) {
							$nm2++;
							$k = $z;
						}
					}
					$nmotifs2 += $nm2;
				}
				if ($p1==1 && $p2 == 1) {
					# the peak counts as co-occurring if any pair of instances has
					# midpoints between $curMinDist and $matrixMaxDist apart (inclusive)
					my $hit = 0;
					foreach(@{$peaks{$pname}->{'m'}->{$motifNames[$i]}}) {
						my $m1 = $_;
						foreach(@{$peaks{$pname}->{'m'}->{$motifNames[$j]}}) {
							my $m2 = $_;
							my $d = abs($m1->{'mp'} - $m2->{'mp'});
							if ($d >= $curMinDist && $d <= $matrixMaxDist) {
								$hit = 1;
							}
						}
					}
					if ($hit == 1)  {
						$n++;
					}
				}
			}


			my $expectedOverlap = 0.0;
			my $ratio = 1.0;
			my $logp = 0.0;

			# in theory this is only for when there's no matrixMin/MaxDist, but for now it will do
			if (1) { #$matrixMinDist <= 0 && $matrixMaxDist > 1e9) {
				$expectedOverlap = $n1*$n2/$numPeaks;
				if ($i==$j) {
					my $avgMotifPerPeak = $nmotifs1/$numPeaks;
					#use binomial distribution to approximate how many peaks have 2 or more motifs present
					#in them by chance given the total number of motifs in the whole set of peaks
					my $ptotal = exp(Statistics::logbinomial($avgSize,2,$avgMotifPerPeak/$avgSize,$avgSize*$numPeaks));
					$expectedOverlap = $ptotal*$numPeaks;
				}
			}
			# $expected keeps the unclamped value for the stats file; the clamped
			# value (minimum 0.5) is used for the ratio and p-value
			my $expected = $expectedOverlap;
			if ($expectedOverlap < 0.5) {
				$expectedOverlap = 0.5;
			}
			$ratio = $n/$expectedOverlap;
			my $backRatio = $expectedOverlap/$numPeaks;

			# ratios below 1 are reported with the sign flipped (see the
			# "+ values for divergence" note in the matrix header below)
			if ($ratio >= 1.0) {
				$logp = Statistics::logbinomial($numPeaks, $n, $backRatio, $numPeaks);
			} else {
				$logp = -1*Statistics::ilogbinomial($numPeaks, $n, $backRatio, $numPeaks);
			}


			if (!exists($matrix{$motifNames[$i]})) {
				my %a = ();	
				my %b = ();	
				my %c = ();	
				$matrix{$motifNames[$i]}=\%a;
				$matrixRatio{$motifNames[$i]}=\%b;
				$matrixCount{$motifNames[$i]}=\%c;
			}
			if (!exists($matrix{$motifNames[$j]})) {
				my %a = ();	
				my %b = ();	
				my %c = ();	
				$matrix{$motifNames[$j]}=\%a;
				$matrixRatio{$motifNames[$j]}=\%b;
				$matrixCount{$motifNames[$j]}=\%c;
			}
			# store symmetrically so the full square matrix can be printed below
			$matrix{$motifNames[$i]}->{$motifNames[$j]} = $logp;
			$matrix{$motifNames[$j]}->{$motifNames[$i]} = $logp;
			$matrixRatio{$motifNames[$i]}->{$motifNames[$j]} = $ratio;
			$matrixRatio{$motifNames[$j]}->{$motifNames[$i]} = $ratio;
			$matrixCount{$motifNames[$i]}->{$motifNames[$j]} = $n;
			$matrixCount{$motifNames[$j]}->{$motifNames[$i]} = $n;
			$numSites{$motifNames[$i]} = $n1;
			$numSites{$motifNames[$j]} = $n2;

			print OUT "$motifNames[$i]:$motifNames[$j]\t$numPeaks\t$n1\t$n2\t$n\t$expected\t$ratio\t$logp\n";
		}
	}
	close OUT;


	# write the three square matrices; row and column labels carry the
	# number of peaks containing each motif
	open OUT, ">$matrixPrefix.logPvalue.matrix.txt";
	open OUT2, ">$matrixPrefix.ratio.matrix.txt";
	open OUT3, ">$matrixPrefix.count.matrix.txt";
	print OUT "Motif Name (# sites) [values are natural log, + values for divergence]";
	print OUT2 "Motif Name (# sites) [observed/expected]";
	print OUT3 "Motif Name (# sites) [Overlap Counts]";
	foreach(@motifNames) {
		print OUT "\t$_ ($numSites{$_})";
		print OUT2 "\t$_ ($numSites{$_})";
		print OUT3 "\t$_ ($numSites{$_})";
	}
	print OUT "\n";
	print OUT2 "\n";
	print OUT3 "\n";
	for (my $i=0;$i<@motifNames;$i++) {
		print OUT "$motifNames[$i] ($numSites{$motifNames[$i]})";
		print OUT2 "$motifNames[$i] ($numSites{$motifNames[$i]})";
		print OUT3 "$motifNames[$i] ($numSites{$motifNames[$i]})";
		for (my $j=0;$j<@motifNames;$j++) {
			my $logp = 0;
			$logp = $matrix{$motifNames[$i]}->{$motifNames[$j]};
			print OUT "\t$logp";
			my $logRatio = $matrixRatio{$motifNames[$i]}->{$motifNames[$j]};
			print OUT2 "\t$logRatio";
			my $v = $matrixCount{$motifNames[$i]}->{$motifNames[$j]};
			print OUT3 "\t$v";
		}
		print OUT "\n";
		print OUT2 "\n";
		print OUT3 "\n";
	}

	close OUT;
	close OUT2;
	close OUT3;
}



######################################################################
########  Histogram Mode..................  ##########################
######################################################################
if ($histBinSize > 0) {
	if ($size eq 'given') {
		print STDERR "\tCompiling per % Histograms...\n";
	} else {
		print STDERR "\tCompiling per bp Histograms...\n";
	}

	my %histograms = ();
	my @histogramNames = ();

	print STDERR "\tFinding Tags in Peaks from each directory...\n";
	my %fragLengths = ();
	for (my $i=0;$i<@tagDirs;$i++) {
		my $dir = $tagDirs[$i];
		my ($t, $p, $flen, $slen) = HomerConfig::readTagInfo($dir,$init2One);

		$tagTotals{$dir} = $t;
		if ($fragLength eq 'auto') {
			if ($flen eq 'NA') {
				print STDERR "\tDefault fragment length set to 150\n";
				$flen = 150;
			} elsif ($flen < 42) {
				print STDERR "\t!warning, ChIP-Fragment length for $dir seems short ($flen)\n";
			}
			$fragLengths{$dir} = $flen;
		} else {
			$fragLengths{$dir} = $fragLength;
		}
		$normFactors{$dir} = 1;
	}

	if ($adjustFlag == 1) {
		if ($normValue < 1) {
			my $nn = 0;
			foreach(@tagDirs) {
				$normValue += $tagTotals{$_};
				$nn++;
			}
			$normValue /= $nn if ($nn > 0);
		}
		foreach(@tagDirs) {
			my $total = $tagTotals{$_};
			next if ($total < 1);
			my $ratio = $normValue / $total;
			print STDERR "\tRatio for $_ : $ratio\n";
			my $vv= sprintf("%.2f",$ratio);
			$normFactors{$_} = $vv;
		}
	}

	my %ghistData = ();
	my $emptyRow = '';
	my @allDirs = (@peakFiles, @motifNames, @tagDirs, @bedGraphFiles, @wigFiles);
	if ($ghistFlag == 1) {

		# should add motif, wig, and peak files to this at some point
		print STDERR "\n\tOrder of experiments in output file:\n";
		print "Gene";	
		if ($size eq 'given') {
			my $startBin = 0;
			for (my $i=0;$i<@allDirs;$i++) {
				print STDERR "\t\t$allDirs[$i]\n";
				for (my $j=1;$j<=$histBinSize;$j++) {
					my $v = sprintf("%.1f",$j/$histBinSize*100) . '%';
					print "\t$v";
					if ($i==0) {
						$emptyRow .= "\t0";
					}
				}
			}
			print "\n";
		} else {
			my $startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
			my $endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
			for (my $j=0;$j<@allDirs;$j++) {
				print STDERR "\t\t$allDirs[$j]\n";
				for (my $i=$startBin;$i<=$endBin;$i+=$histBinSize) {
					print "\t$i";
					if ($j==0) {
						$emptyRow .= "\t0";
					}
				}
			}
			print "\n";
		}
	}


	foreach(@peakFiles) {
		my $peakFile = $_;
		my %total = ();
		foreach(@peakOrder) {
		#foreach(keys %peaks) {
			my $peakID = $_;
			my %currentPeak = ();
			if (exists($peaks{$peakID}->{'p'}->{$peakFile})) {
				if ($newPeakHistogramFlag) {
					foreach(keys %{$peaks{$peakID}->{'p'}->{$peakFile}->{'pd'}}) {
						my $d = $_;
						my $v = $peaks{$peakID}->{'p'}->{$peakFile}->{'pd'}->{$d};
						if ($size eq 'given') {
							my $peakWidth = $peaks{$peakID}->{'e'}-$peaks{$peakID}->{'s'};
							$v /= ($peakWidth/$histBinSize) if ($peakWidth > 0);
							$d = $d+($peakWidth/2);
							if ($d >= 0 && $d <= $peakWidth) {
								my $binValue = floor(($d/($peakWidth))*$histBinSize);
								$total{$binValue}+=$v;
								$currentPeak{$binValue}+=$v;
							}
						} else {
							my $binValue = floor($d/$histBinSize+0.5)*$histBinSize;
							$total{$binValue}+=$v;
							$currentPeak{$binValue}+=$v;
						}
					}

				} else {
					my $d = $peaks{$peakID}->{'p'}->{$peakFile}->{'d'};
					if ($size eq 'given') {
						my $peakWidth = $peaks{$peakID}->{'e'}-$peaks{$peakID}->{'s'};
						$d = $d+($peakWidth/2);
						if ($d >= 0 && $d <= $peakWidth) {
							my $binValue = floor(($d/($peakWidth))*$histBinSize);
							$total{$binValue}++;
							$currentPeak{$binValue}++;
						}
					} else {
						my $binValue = floor($d/$histBinSize+0.5)*$histBinSize;
						$total{$binValue}++;
						$currentPeak{$binValue}++;
					}
				}
			}

			if ($ghistFlag == 1) {
				my $startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $incSize = $histBinSize;
				if ($size eq 'given') {
					$startBin = 1;
					$endBin = $histBinSize;
					$incSize = 1;
				}
				my $ghistStr = "";
				for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
					my $v = 0;
					if (exists($currentPeak{$b})) {
						$v = $currentPeak{$b};
					}
					$ghistStr .= "\t$v";
				}
				if (!exists($ghistData{$peakID})) {
					my %c = ();
					$ghistData{$peakID} = \%c;
				}
				$ghistData{$peakID}->{$peakFile} = $ghistStr;
			}
		}
		push(@histogramNames, $peakFile);
		$histograms{$peakFile}=\%total;
	}

	foreach(@motifNames) {
		my $mname = $_;
		my %p5motifs = ();
		my %p3motifs = ();
		my %total = ();
		foreach(@peakOrder) {
		#foreach(keys %peaks) {
			my $peakID = $_;
			my $curPeak = $peaks{$peakID};
			my $peakWidth = $curPeak->{'e'}-$curPeak->{'s'};
			my %peakTotal = ();
			if (exists($curPeak->{'m'}->{$mname})) {
				foreach(@{$curPeak->{'m'}->{$mname}}) {
					my $pos = $_->{'p'};
					my $valid = $_->{'valid'};
					next if ($valid == 0);
					my $dir = $_->{'d'};
					my $seqLen = length($_->{'s'});
					if ($dir eq '+' || $dir eq "0") {
						$dir = 0;
					} else {
						$dir = 1;
						$pos += $seqLen-1;
					}
					my $midPoint = 0;
					if ($size eq 'given') {
						my $v = 1;
						$v /= ($peakWidth/$histBinSize) if ($peakWidth > 0);
						if ($pos >= 0 && $pos <= $peakWidth) {
							my $binValue = floor($pos/$peakWidth*$histBinSize);
							if ($dir == 0) {
								$p5motifs{$binValue}+=$v;
							} else {
								$p3motifs{$binValue}+=$v;
							}
						}
						if ($dir == 0) {
							$midPoint = $pos + floor($seqLen/2);
						} else {
							$midPoint = $pos - floor($seqLen/2);
						}
						if ($midPoint >= 0 && $midPoint <= $peakWidth) {
							my $binValue = floor($midPoint/$peakWidth*$histBinSize);
							$total{$binValue}+=$v;
							$peakTotal{$binValue}+=$v;
						}
					} else {
						my $binValue = floor($pos/$histBinSize+0.5)*$histBinSize;
						if ($dir == 0) {
							$p5motifs{$binValue}++;
							$midPoint = $pos + floor($seqLen/2);
						} else {
							$p3motifs{$binValue}++;
							$midPoint = $pos - floor($seqLen/2);
						}
						my $midBinValue = floor($midPoint/$histBinSize+0.5)*$histBinSize;
						$total{$midBinValue}++;
						$peakTotal{$midBinValue}++;
					}
				}
			}
			if ($ghistFlag == 1) {
				my $startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $incSize = $histBinSize;
				if ($size eq 'given') {
					$startBin = 1;
					$endBin = $histBinSize;
					$incSize = 1;
				}
				my $ghistStr = "";
				for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
					my $v = 0;
					if (exists($peakTotal{$b})) {
						$v = $peakTotal{$b};
					}
					$ghistStr .= "\t$v";
				}
				if (!exists($ghistData{$peakID})) {
					my %c = ();
					$ghistData{$peakID} = \%c;
				}
				$ghistData{$peakID}->{$mname} = $ghistStr;
			}
		}
		my $p5name = $mname . " + sites";
		my $p3name = $mname . " - sites";
		my $totalname = $mname . " total sites";
		push(@histogramNames, $totalname);
		push(@histogramNames, $p5name);
		push(@histogramNames, $p3name);
		$histograms{$totalname}=\%total;
		$histograms{$p5name}=\%p5motifs;
		$histograms{$p3name}=\%p3motifs;

	}

	for (my $i=0;$i<@tagDirs;$i++) {
		my %p5tags = ();
		my %p3tags = ();
		my %coverage = ();
		my %diffMap = ();

		#keeps track of number of tags added for ratio mode
		my %p5tagsN = ();
		my %p3tagsN = ();
		my %coverageN = ();
		my %diffMapN = ();

		my $dir = $tagDirs[$i];
		my $offset = $halfSize*-1+$sizeMove;
		my $sizeRegion = $size;
		my $optStr = " -strand $strandFlag";
		if ($size eq 'given') {
			$optStr .= ' -fixed';
			$sizeRegion = '';
		} else {
			#my $halfRegionSize = floor($sizeRegion/2);
			my $halfRegionSize = floor($sizeRegion/2)+$histBinSize*2+abs($fragLengths{$dir});
			$optStr .= " -offset $offset -start -$halfRegionSize -end $halfRegionSize ";
		}
		
		#print STDERR "`getRelativeTagPositions.pl $posFile,$offset $dir $sizeRegion > $tmpfile`\n";
		#print STDERR "`getPeakTags $posFile $dir -peaktags $optStr > $tmpfile`;\n";
		#`getRelativeTagPositions.pl "$posFile",$offset "$dir" $sizeRegion > "$tmpfile"`;
		#print STDERR "`getPeakTags $posFile $dir -peaktags $optStr > $tmpfile`;\n";
		`getPeakTags "$posFile" "$dir" -peaktags $optStr > "$tmpfile"`;
		#print STDERR "$tmpfile\n";
		#exit;

		my $p5StrandFlag = 1;
		my $p3StrandFlag = 1;
		if ($strandFlag eq '+') {
			$p3StrandFlag = 0;
		} elsif ($strandFlag eq '-') {
			$p5StrandFlag = 0;
		}

		my $normLengthFactor = 1.0;
		if ($normLength > 1e-10) {
			$normLengthFactor = $normLength/$fragLengths{$dir};
		}
		if ($ratioFlag) {
			$normLengthFactor = 1.0;
		}

		print STDERR "\tProcessing tags from $dir\n";
		my $count = 0;
		open IN, $tmpfile;
		while (<IN>) {
			$count++;
			if ($count % 10000 == 0) {
				print STDERR "\t$count\n";
			}
			chomp;
			my @line = split /\t/;
			next if (@line < 2);
			my $peakID = $line[0];

			my $peakWidth = $peaks{$peakID}->{'e'}-$peaks{$peakID}->{'s'};
			my %peakMap = ();
			my %peakMapN = ();
			my @pos = split /\,/,$line[1];
			my $p3total = 0;
			my $p5total = 0;
			my $alltotal = 0;
			if ($histNorm > 0) {
				foreach(@pos) {
					my @pair = split /\=/;
					my $pos = $pair[0];
					my @values = split /\|/, $pair[1];
					if ($init2One > 0) {
						foreach(@values) {
							next if ($_ eq 'NA');
							$_ = $init2One if ($_ > $init2One);
						}
					}
					if ($adjustFlag ==1 && exists($normFactors{$dir})) {
						foreach(@values) {
							next if ($_ eq 'NA');
							$_ *= $normFactors{$dir};
						}
					}
					if ($revoppFlag == 0) {
						$p3total=0;
					}
					if ($values[0] ne 'NA' && $p5StrandFlag) {
						$p5total += $values[0];
						$alltotal += $values[0];
					}
					if ($values[1] ne 'NA' && $p3StrandFlag) {
						$p3total += $values[1];
						$alltotal += $values[1];
					}
				}
				$p5total = $histNorm if ($p5total < $histNorm);
				$p3total = $histNorm if ($p3total < $histNorm);
				$alltotal = $histNorm if ($alltotal < $histNorm);
			} else {
				$p3total = 1;
				$p5total = 1;
				$alltotal = 1;
				if ($size eq 'given') {
					$p3total = $peakWidth/$histBinSize;
					$p5total = $peakWidth/$histBinSize;
					$alltotal = $peakWidth/$histBinSize;
				}
			}
			#print STDERR "$p3total\t$p5total\t$alltotal\n";	

			foreach(@pos) {
				my @pair = split /\=/;
				my $pos = $pair[0];
				my @values = split /\|/, $pair[1];
				if ($init2One > 0) {
					foreach(@values) {
						next if ($_ eq 'NA');
						$_ = $init2One if ($_ > $init2One);
					}
				}
				if ($adjustFlag ==1 && exists($normFactors{$dir})) {
					foreach(@values) {
						next if ($_ eq 'NA');
						$_ *= $normFactors{$dir} if ($ratioFlag == 0);
					}
				}
				if ($revoppFlag == 0) {
					#$p3total=0; ### Why???
				}
				my $binValue = 0;
				if ($size eq 'given') {
					$binValue = floor($pos/$peakWidth*$histBinSize);
				} else {
					$binValue = floor($pos/$histBinSize+0.5)*$histBinSize;
				}

				if ($values[0] ne 'NA' && $p5StrandFlag) {
					if ($ratioFlag == 1) {
						$p5tags{$binValue} += $values[0];
						$p5tagsN{$binValue} ++;
					} else {
						$p5tags{$binValue} += $values[0]/$p5total;
						$p5tagsN{$binValue} += $p5total;
					}

					my $endPos = $pos+$fragLengths{$dir};
					my $startBin = $binValue;
					my $endBin = $binValue;
					if ($size eq 'given') {
						$endBin = floor($endPos/$peakWidth*$histBinSize)+1;
					} else {
						$endBin = floor($endPos/$histBinSize+0.5)*$histBinSize+$histBinSize;
					}
					my $v = $values[0]/$alltotal*$normLengthFactor;
					if ($ratioFlag == 1) {
						$v = $values[0]/$normLengthFactor;
					}
					$diffMap{$startBin} += $v;
					$diffMap{$endBin} -= $v;
					$peakMap{$startBin} += $v;
					$peakMap{$endBin} -= $v;
					$diffMapN{$startBin}++;
					$diffMapN{$endBin}--;
					$peakMapN{$startBin}++;
					$peakMapN{$endBin}--;
				}
				if ($values[1] ne 'NA' && $p3StrandFlag) {
					if ($ratioFlag == 1) {
						$p3tags{$binValue} += $values[1];
						$p3tagsN{$binValue}++;
					} else {
						$p3tags{$binValue} += $values[1]/$p3total;
						$p3tagsN{$binValue}+= $p3total;
					}

					my $endPos = $pos-$fragLengths{$dir};
					my $startBin = $binValue;
					my $endBin = $binValue;
					if ($size eq 'given') {
						$startBin = floor($endPos/$peakWidth*$histBinSize);
						$endBin ++;
					} else {
						$startBin = floor($endPos/$histBinSize+0.5)*$histBinSize;
						$endBin += $histBinSize;
					}
					my $v = $values[1]/$alltotal*$normLengthFactor;
					if ($ratioFlag == 1) {
						$v = $values[1]/$normLengthFactor;
					}
					$diffMap{$startBin} += $v;
					$diffMap{$endBin} -= $v;
					$peakMap{$startBin} += $v;
					$peakMap{$endBin} -= $v;
					$diffMapN{$startBin}++;
					$diffMapN{$endBin}--;
					$peakMapN{$startBin}++;
					$peakMapN{$endBin}--;
				}
			}
			if ($ghistFlag == 1) {
				my %peakCoverage=();
				my @peakSortBins = sort {$a <=> $b} keys %peakMap;
				my $startBin = $sortBins[0];
				my $endBin = $sortBins[@sortBins-1];
				my $value = 0;
				my $N = 0;
				foreach(@peakSortBins) {
					if (exists($peakMap{$_})) {
						$value += $peakMap{$_};
						$N += $peakMapN{$_};
					}
					my $v = $value;
					if ($ratioFlag) {
						$v /= $N if ($N>0);
					}
					$peakCoverage{$_}=$v;
				}
				$startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
				$endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $incSize = $histBinSize;
				if ($size eq 'given') {
					$startBin = 1;
					$endBin = $histBinSize;
					$incSize = 1;
				}

				my $last = 0;
				my $ghistStr = "";
				for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
					my $v = $last;
					if (exists($peakCoverage{$b})) {
						$v = $peakCoverage{$b};
						$last = $v;
					}
					$ghistStr .= "\t$v";
				}
				if (!exists($ghistData{$peakID})) {
					my %c = ();
					$ghistData{$peakID} = \%c;
				}
				$ghistData{$peakID}->{$dir} = $ghistStr;
			}
		}
		close IN;
		`rm "$tmpfile"`;
		#print STDERR "\tDone processing Tags\n";

		if ($ghistFlag == 1) {
			next;
		}

		#build coverage map
		my @sortBins = sort {$a <=> $b} keys %diffMap;
		if (@sortBins > 1) {

			my %tmpCoverage = ();
			my $value = 0;
			my $N = 0;
			foreach(@sortBins) {
				my $b = $_;
				$value += $diffMap{$b};
				$N += $diffMapN{$b};
				my $v = $value;
				if ($ratioFlag == 1) {
					$v /= $N if ($N > 0);
				}
				$tmpCoverage{$b} = $v;
			}
			

			my $startBin = $sortBins[0];
			my $endBin = $sortBins[@sortBins-1];
			my $incSize = $histBinSize;
			my $lastValue = 0;

			if ($size eq 'given') {
				$startBin = 0;
				$endBin = $histBinSize;
				$incSize = 1;
			}
			for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
				if (exists($tmpCoverage{$b})) {
					$lastValue = $tmpCoverage{$b};
				}
				$coverage{$b} = $lastValue;
				if ($ratioFlag != 1) {
					$coverage{$b} *= $histBinSize;
				}
			}
		}
		if ($ratioFlag==1) {
			# && $size ne 'given') {
			foreach(keys %p5tags) {
				$p5tags{$_} /= $p5tagsN{$_};
			}
			foreach(keys %p3tags) {
				$p3tags{$_} /= $p3tagsN{$_};
			}
		}

		my $p5name = "$dir" . " + Tags";
		my $p3name = "$dir" . " - Tags";
		my $coverageName = "$dir" . " Coverage";
		push(@histogramNames, $coverageName);
		push(@histogramNames, $p5name);
		push(@histogramNames, $p3name);
		$histograms{$p5name} = \%p5tags;
		$histograms{$p3name} = \%p3tags;
		$histograms{$coverageName} = \%coverage;
	}

	my @covFiles = ();
	my @typeCovFiles = ();
	foreach(@bedGraphFiles) {
		push(@covFiles, $_);
		push(@typeCovFiles, "-bedGraph");
	}
	foreach(@wigFiles) {
		push(@covFiles, $_);
		push(@typeCovFiles, "-wig");
	}

	for (my $i=0;$i<@covFiles;$i++) {
		my $bedGraphFile = $covFiles[$i];
		my $type = $typeCovFiles[$i];
		my %p5tags = ();
		my %p3tags = ();
		my %coverage = ();
		my %diffMap = ();

		#keeps track of number of tags added for ratio mode
		my %p5tagsN = ();
		my %p3tagsN = ();
		my %coverageN = ();
		my %diffMapN = ();

		my $dir = $tagDirs[$i];
		my $offset = $halfSize*-1+$sizeMove;
		my $sizeRegion = $size;
		my $optStr = " -strand $strandFlag";
		if ($size eq 'given') {
			$optStr .= ' -fixed';
			$sizeRegion = '';
		} else {
			my $halfRegionSize = floor($sizeRegion/2)+$histBinSize*2;
			$optStr .= " -offset $offset -start -$halfRegionSize -end $halfRegionSize ";
		}
		
		`getPeakTags "$posFile" $type "$bedGraphFile" -peaktags $optStr > "$tmpfile"`;
		#print STDERR "`getPeakTags $posFile -bedGraph $bedGraphFile -peaktags $optStr > $tmpfile`;\n";

		my $count = 0;
		open IN, $tmpfile;
		while (<IN>) {
			$count++;
			if ($count % 10000 == 0) {
				print STDERR "\t$count\n";
			}
			chomp;
			s/\r//g;
			my @line = split /\t/;
			my $peakID = $line[0];
			next if (!exists($peaks{$peakID}));
			my $peakWidth = $peaks{$peakID}->{'e'}-$peaks{$peakID}->{'s'};
			my %peakMap = ();
			my @pos = split /\,/,$line[1];
			my $alltotal = 0;
			my $wigRatio = 1;
			if ($size ne 'given') {
				$wigRatio *= $histBinSize;
			}
			my $p5total = 0;
			my $p3total = 0;
			if ($histNorm > 0) {
				foreach(@pos) {
					my @pair = split /\=/;
					my @p = split /\|/, $pair[0];
					my $value = $pair[1];
					$alltotal += $value*($p[1]-$p[0]);
					$p5total += $value;
					$p3total += $value;
				}
				$alltotal = $histNorm if ($alltotal < $histNorm);
				$p5total = $histNorm if ($p5total < $histNorm);
				$p3total = $histNorm if ($p3total < $histNorm);
			} else {
				$alltotal = 1;
				$p5total = 1;
				$p3total = 1;
				if ($size eq 'given') {
					#$p3total = $peakWidth/$histBinSize;
					#$p5total = $peakWidth/$histBinSize;
					#$alltotal = $peakWidth/$histBinSize;
				}
			}
			foreach(@pos) {
				my @pair = split /\=/;
				my @p = split /\|/, $pair[0];
				my $value = $pair[1];

				my $binValueStart = 0;
				my $binValueEnd = 0;
				if ($size eq 'given') {
					$binValueStart = floor($p[0]/$peakWidth*$histBinSize);
					$binValueEnd = floor(($p[1])/$peakWidth*$histBinSize);
				} else {
					$binValueStart = floor($p[0]/$histBinSize+0.5)*$histBinSize;
					$binValueEnd = floor(($p[1])/$histBinSize+0.5)*$histBinSize;
				}

				$p5tags{$binValueStart} += $value/$p5total;
				$p5tagsN{$binValueStart} += $p5total;
				$p3tags{$binValueEnd} += $value/$p3total;
				$p3tagsN{$binValueEnd} += $p3total;
			
				my $startBin = $binValueStart;
				my $endBin = $binValueEnd;

				$diffMap{$startBin} += $value/$alltotal*$wigRatio;
				$diffMap{$endBin} -= $value/$alltotal*$wigRatio;
				$peakMap{$startBin} += $value/$alltotal*$wigRatio;
				$peakMap{$endBin} -= $value/$alltotal*$wigRatio;
				$diffMapN{$startBin}++;
				$diffMapN{$endBin}--;
				$peakMapN{$startBin}++;
				$peakMapN{$endBin}--;
			}
			if ($ghistFlag == 1) {
				my %peakCoverage=();
				my @peakSortBins = sort {$a <=> $b} keys %peakMap;
				my $startBin = $sortBins[0];
				my $endBin = $sortBins[@sortBins-1];
				my $value = 0;
				my $N = 0;
				foreach(@peakSortBins) {
					if (exists($peakMap{$_})) {
						$value += $peakMap{$_};
						$N += $peakMapN{$_};
					}
					my $v = $value;
					if ($ratioFlag) {
						$v /= $N if ($N>0);
					}
					$peakCoverage{$_}=$v/$histBinSize;
				}
				$startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
				$endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
				my $incSize = $histBinSize;
				if ($size eq 'given') {
					$startBin = 1;
					$endBin = $histBinSize;
					$incSize = 1;
				}

				my $last = 0;
				my $ghistStr = "";
				for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
					my $v = $last;
					if (exists($peakCoverage{$b})) {
						$v = $peakCoverage{$b};
						$last = $v;
					}
					$ghistStr .= "\t$v";
				}
				if (!exists($ghistData{$peakID})) {
					my %c = ();
					$ghistData{$peakID} = \%c;
				}
				$ghistData{$peakID}->{$bedGraphFile} = $ghistStr;
			}
		}
		close IN;
		`rm "$tmpfile"`;
		#print STDERR "\tDone processing Tags\n";

		if ($ghistFlag == 1) {
			next;
		}

		#build coverage map
		my @sortBins = sort {$a <=> $b} keys %diffMap;
		if (@sortBins > 1) {

			my %tmpCoverage = ();
			my $value = 0;
			my $N = 0;
			foreach(@sortBins) {
				my $b = $_;
				$value += $diffMap{$b};
				$N += $diffMapN{$b};
				my $v = $value;
				if ($ratioFlag == 1) {
					$v /= $N if ($N > 0);
				}
				$tmpCoverage{$b} = $v;
			}
			

			my $startBin = $sortBins[0];
			my $endBin = $sortBins[@sortBins-1];
			my $incSize = $histBinSize;
			my $lastValue = 0;

			if ($size eq 'given') {
				$startBin = 0;
				$endBin = $histBinSize;
				$incSize = 1;
			}
			for (my $b=$startBin;$b<=$endBin;$b+=$incSize) {
				if (exists($tmpCoverage{$b})) {
					$lastValue = $tmpCoverage{$b};
				}
				$coverage{$b} = $lastValue;
			}
		}
		if ($ratioFlag==1) {
			foreach(keys %p5tags) {
				$p5tags{$_} /= $p5tagsN{$_};
			}
			foreach(keys %p3tags) {
				$p3tags{$_} /= $p3tagsN{$_};
			}
		}

		my $p5name = "$bedGraphFile" . " + Tags";
		my $p3name = "$bedGraphFile" . " - Tags";
		my $coverageName = "$bedGraphFile" . " Coverage";
		push(@histogramNames, $coverageName);
		push(@histogramNames, $p5name);
		push(@histogramNames, $p3name);
		$histograms{$p5name} = \%p5tags;
		$histograms{$p3name} = \%p3tags;
		$histograms{$coverageName} = \%coverage;
	}


	if ($ghistFlag == 1) {
		foreach(@peakOrder) {
			my $peakID = $_;
			next if (!exists($ghistData{$peakID}));
			print "$peakID";
			for (my $i=0;$i<@allDirs;$i++) {
				my $dir = $allDirs[$i];
				if (!exists($ghistData{$peakID}->{$dir})) {
					print $emptyRow;
				} else {
					print $ghistData{$peakID}->{$dir};
				}
			}
			print "\n";
		}
		`rm "$posFile"`;
		deleteFiles();
		exit;
	} else {

		#normalize histograms to per bp numbers
		my $numPeaks = scalar(keys %peaks);
		my $totalBpInBin = $histBinSize * $numPeaks;
		if ($size eq 'given') {
			$totalBpInBin = $numPeaks;
		}
		if ($ratioFlag==1) {
			$totalBpInBin = $histBinSize;
			$totalBpInBin = 1;
		}
		foreach(values %histograms) {
			foreach(values %$_) {
				$_ /= $totalBpInBin;
			}
		}
	}

	if ($seqFlag) {
		my %freqHists = ();
		my %freqCounts = ();
		my @freqNames = ();
		my $limit = 4;
		if ($diFlag == 1) {
			$limit = 24;
		}

		my $option = '';
		if ($size ne 'given') {
			$option = " -maxlen " . ($size+1);
		}
		`homerTools freq "$seqFile" -offset -$halfSize $option > "$tmpfile"`;
		open IN, $tmpfile;
		my $rowCount = 0;
		while (<IN>) {
			$rowCount++;
			chomp;
			my @line = split /\t/;
			if ($rowCount == 1) {
				for (my $i=1;$i<=$limit;$i++) {
					push(@freqNames, $line[$i]);
					my %a = ();
					my %b = ();
					$freqHists{$line[$i]} = \%a;
					$freqCounts{$line[$i]} = \%b;
				}
				next;
			}
			my $pos = $line[0];
			my $binValue = floor($pos/$histBinSize+0.5)*$histBinSize;
			for (my $i=1;$i<=$limit;$i++) {
				$freqHists{$freqNames[$i-1]}->{$binValue}+=$line[$i];
				$freqCounts{$freqNames[$i-1]}->{$binValue}++;
			}
		}
		close IN;
		`rm "$tmpfile"`;
		foreach(@freqNames) {
			my $name = $_;
			foreach(keys %{$freqHists{$name}}) {
				$freqHists{$name}->{$_} /= $freqCounts{$name}->{$_};
			}
			my $endName = "$name frequency";
			push(@histogramNames, $endName);
			$histograms{$endName} = $freqHists{$name};
		}
		`rm "$seqFile"`;
	}

	if ($consFlag) {
		`conservationAverage.pl "$consFile"  -$halfSize > "$tmpfile"`;
		my %cons = ();
		my %consCounts = ();
		open IN, $tmpfile;
		my $rowCount = 0;
		while (<IN>) {
			$rowCount++;
			next if ($rowCount < 2);
			chomp;
			my @line = split /\t/;
			my $pos = $line[0];
			my $binValue = floor($pos/$histBinSize+0.5)*$histBinSize;
			$cons{$binValue} += $line[1];
			$consCounts{$binValue}++;
		}
		close IN;
		`rm "$tmpfile"`;
		foreach(keys %cons) {
			$cons{$_} /= $consCounts{$_};
		}
		push(@histogramNames, "Conservation");
		$histograms{"Conservation"} = \%cons;
		`rm "$consFile"`;
	}
	`rm "$posFile"`;

	print "Distance from Center (cmd=$cmd)";
	foreach(@histogramNames) {
		print "\t$_";
	}
	print "\n";

	my $startBin = -1*floor($halfSize/$histBinSize+0.5)*$histBinSize;
	my $endBin = floor($halfSize/$histBinSize+0.5)*$histBinSize;
	my $incSize = $histBinSize;
	if ($size eq 'given') {
		$startBin = 0;
		$endBin = $histBinSize-1;
		$incSize = 1;
	}
	for (my $i=$startBin;$i<=$endBin;$i+=$incSize) {
		my $v = $i;
		if ($size eq 'given') {
			$v = ($i+1)/$histBinSize;
		}
		print "$v";
		foreach(@histogramNames) {
			if (exists($histograms{$_}->{$i})) {
				print "\t$histograms{$_}->{$i}";
			} else {
				print "\t0";
			}
		}
		print "\n";
	}
	print STDERR "\n";
	deleteFiles();
	exit;
}

######################################################################
########  Annotation mode related code....  ##########################
######################################################################

# Gene/promoter IDs that were assigned as a nearest TSS (value is always 'NA').
# The keys are written out later as the gene list for the GO analysis.
my %nativeTSSid = ();

# find nearest TSS
# Sources are tried in this order:
#   1. a user supplied peak -> gene map file ($mapFile)
#   2. a TSS/promoter position file processed by annotateRelativePosition.pl
#   3. otherwise the step is skipped with a warning
# If $tssFlag or $noGeneFlag is set, every peak is assigned its own ID
# (minus any "-HOMER..." suffix) with a distance of 0.
if ($tssFlag == 0 && $noGeneFlag == 0) {
	print STDERR "\tFinding Closest TSS...\n";
	my $promoterFile = $genomeDir . "/" . $genome . ".tss";

	if ($gtfFile ne '') {
		# build the TSS file from the custom GTF/GFF annotation instead
		print STDERR "\tProcessing custom annotation file...\n";
		`parseGTF.pl "$gtfFile" tss $gffFlag $gidFlag > "$gtfTSSFile"`;
		$promoterFile = $gtfTSSFile;
		# NOTE(review): presumably consumed by deleteFiles() for cleanup - confirm
		$toDelete{$gtfTSSFile}=1;
	}

	my $promoterOffset = -2000;
	if ($promoter ne 'default') {
		# a configured promoter set overrides both the file and the offset
		$promoterFile = $config->{'PROMOTERS'}->{$promoter}->{'directory'} . "/$promoter.pos";
		$promoterOffset = $config->{'PROMOTERS'}->{$promoter}->{'start'};
	}
	if ($cpromoter ne '') {
		# a custom promoter file is passed without any offset
		$promoterFile = $cpromoter;
		#$promoterOffset = -2000;
		$promoterOffset = "";
	}

	my $skipPromoterFile = 0;
	if ($mapFile ne '') {
		# map file columns as read here: 0=peak ID, 1=gene, 2=distance (optional)
		open IN, $mapFile;
		while (<IN>) {
			chomp;
			s/\r//g;
			my @line= split /\t/;
			if (exists($peaks{$line[0]})) {
				my $dist = 0;
				my $gene = $line[1];
				if (@line > 2) {
					if ($line[2] eq 'interchromosomal') {
						# treat interchromosomal links as very far away
						$line[2] = 1e9;
					}
					$dist = $line[2];
				}
				$gene =~ s/\-HOMER.*$//;
				if (exists($peaks{$line[0]}->{'tss'}) && $peaks{$line[0]}->{'tss'} ne 'NA') {
					# peak already has a gene: keep whichever entry has the smaller distance
					if ($dist < $peaks{$line[0]}->{'tssDist'}) {
						$peaks{$line[0]}->{'tss'} = $gene;
						$peaks{$line[0]}->{'tssDist'} = $dist;
					}
				} else {
					$peaks{$line[0]}->{'tss'} = $gene;
					$peaks{$line[0]}->{'tssDist'} = $dist;
				}
			}
		}
		close IN;

	} elsif (-f $promoterFile) {
		`annotateRelativePosition.pl "$posFile", "$promoterFile",$promoterOffset  0 > "$tmpfile"`;
		#`cp "$tmpfile" check.tsv`;
		# output columns as read here: 0=peak ID, 1=gene, 2=distance,
		# 3=gene direction flag, 5=relative strand
		open IN, $tmpfile;
		while (<IN>) {
			chomp;
			my @line = split /\t/;
			my $hit = $line[0];
			my $dist = $line[2];
			my $gene = $line[1];
			my $relStrand = $line[5];
			$gene =~ s/\-HOMER.*$//;
			my $geneDirection = $line[3];
			# the sign of the distance is flipped for direction flag 0 and
			# flipped (again) when the peak itself is on the '-' strand
			if ($geneDirection == 0) {
				$dist *= -1;
				#print STDERR "$dist\n";
			}
			if ($peaks{$hit}->{'d'} eq '-') {
				$dist *= -1;
			}
			$peaks{$hit}->{'tss'} = $gene;
			$peaks{$hit}->{'tssDist'} = $dist;
			$peaks{$hit}->{'rstrand'} = $relStrand;
			$nativeTSSid{$gene}='NA';
		}
		close IN;
		`rm "$tmpfile"`;
	} else {
		# Assign to the flag declared above.  The original re-declared it with
		# "my" here, which created a new variable that vanished at the end of
		# this branch and left the outer flag at 0.
		$skipPromoterFile = 1;
		print STDERR "\tSkipping TSS assignment (can't find file for genome $genome)\n";
		print STDERR "\t\tCan't find promoterFile $promoterFile\n";
		#print STDERR "!!!! Can't find TSS file for $genome ($promoterFile) !!!\n";
	}

} else {
	foreach(keys %peaks) {
		$peaks{$_}->{'tss'} = $_;
		$peaks{$_}->{'tss'} =~ s/\-HOMER.*$//;
		$peaks{$_}->{'tssDist'} = 0;
		$nativeTSSid{$_} = 'NA';
	}
}


# Annotate each peak with assignGenomeAnnotation.  Two passes are made:
#   - a "basic" annotation stored in $peaks{id}->{'ann'}
#   - a "full" (detailed) annotation stored in $peaks{id}->{'fullann'}
# When $annStatFile is set, the -stats output of both passes is concatenated
# into that file.
if ($noAnnFlag == 0 && $noGeneFlag == 0) {
	my $annotationFile = $genomeDir . "/" . $genome . ".basic.annotation";

	if ($customAnnotationFile ne '') {
		$annotationFile = $customAnnotationFile;
	} elsif ($gtfFile ne '') {
		# build a prioritized annotation file from the custom GTF/GFF
		`parseGTF.pl "$gtfFile" ann $gffFlag $gidFlag > "$tmpfile"`;
		`assignGenomeAnnotation "$tmpfile" "$tmpfile" -prioritize "$tmpfile2"`;
		$annotationFile = $tmpfile2;
	} elsif (!-f $annotationFile) {
		# no ".basic.annotation" file: fall back to the plain ".annotation" file
		$annotationFile = $genomeDir . "/" . $genome . ".annotation";
	}

	if (-f $annotationFile) {
		my $opt = '';
		if ($annStatFile ne '') {
			$opt = " -stats \"$tmpfile3\"";
		}
		`assignGenomeAnnotation "$posFile" "$annotationFile" -ann "$tmpfile" $opt`;
		#print STDERR "`assignGenomeAnnotation $posFile $annotationFile -ann $tmpfile`\n";
		open IN, $tmpfile;
		while (<IN>) {
			chomp;
			s/\r//g;
			my @line = split /\t/;
			next if (!exists($peaks{$line[0]}));
			# strip trailing "-<number>" and "-HOMER<number>" suffixes from the label
			$line[2] =~ s/\-+\d+$//;
			$line[2] =~ s/\-HOMER\d+$//;
			$peaks{$line[0]}->{'ann'} = $line[2];
		}
		# close before removing the file (the handle was previously left open)
		close IN;
		`rm "$tmpfile"`;
	} else {
		print STDERR "\tCould not find basic annotation file ($annotationFile)\n";
	}

	# second pass: detailed annotation, with the same ".annotation" fallback
	$annotationFile = $genomeDir . "/" . $genome . ".full.annotation";
	if (!-f $annotationFile) {
		$annotationFile = $genomeDir . "/" . $genome . ".annotation";
	}

	if (-f $annotationFile) {
		print STDERR "\tNOTE: If this part takes more than 2 minutes, there is a good chance\n";
		print STDERR "\t\tyour machine ran out of memory: consider hitting ctrl+C and rerunning\n";
		print STDERR "\t\tthe command with \"-noann\"\n";
		if ($annStatFile eq '') {
			print STDERR "\tTo capture annotation stats in a file, use \"-annStats <filename>\" next time\n";
		}
		my $opt = '';
		if ($annStatFile ne '') {
			$opt = " -stats \"$tmpfile4\"";
		}
		`assignGenomeAnnotation "$posFile" "$annotationFile" -ann "$tmpfile" $opt`;
		open IN, $tmpfile;
		while (<IN>) {
			chomp;
			s/\r//g;
			my @line = split /\t/;
			next if (!exists($peaks{$line[0]}));
			$line[2] =~ s/\-+\d+$//;
			$line[2] =~ s/\-HOMER\d+$//;
			$peaks{$line[0]}->{'fullann'} = $line[2];
		}
		# close before the temp files are removed below
		close IN;
	} else {
		# with a custom GTF the missing genome-level file is expected, so stay quiet
		if ($gtfFile eq '') {
			print STDERR "\tCould not find full/detailed annotation file ($annotationFile)\n";
		}
	}
	if ($annStatFile ne '') {
		`cat "$tmpfile3" "$tmpfile4" > "$annStatFile"`;
	}
	
	`rm -f "$tmpfile" "$tmpfile2" "$tmpfile3" "$tmpfile4"`;
}



# Gene Ontology: dump every gene ID collected in %nativeTSSid (one per line)
# to the temp file and hand it to findGO.pl, then remove the temp file.
if ($goDir ne '' && $noGeneFlag == 0 && $organism ne 'unknown') {
	print STDERR "\tPerforming Gene Ontology Analysis...\n";
	open OUT, ">$tmpfile";
	print OUT "$_\n" foreach (keys %nativeTSSid);
	close OUT;
	`findGO.pl "$tmpfile" $organism "$goDir"`;
	`rm "$tmpfile"`;
}

# Genome Ontology: run GenomeOntology.pl directly on the peak position file.
if ($genomeOntologyDir ne '') {
	print STDERR "\t--------------------------------------\n"
			. "\tPerforming Genome Ontology Analysis...\n";
	`GenomeOntology.pl "$posFile" $genome "$genomeOntologyDir"`;
}



# Individuals named on the first line of the getPeakTags -vcf output.
my @allIndividuals = ();
my %snps = ();

# Pull per-peak genetic variation out of the VCF file with getPeakTags and
# attach it to each peak as $peaks{id}->{'snps'}: an array ref of the
# comma-separated values in edit-distance mode, the raw string otherwise.
if ($vcfFile ne "") {
	print STDERR "\tExtracting genetic variation information from VCF file...\n";

	# region to scan: the given peak coordinates, or +/- $halfSize
	my $vcfOpt = ($size eq 'given') ? " -fixed" : " -start -$halfSize -end $halfSize";
	if (scalar(@individuals) > 1) {
		$vcfOpt .= join(" ", " -individuals", @individuals);
	}
	$vcfOpt .= " -peaksnps" unless ($editDistanceFlag);

	#print STDERR "`getPeakTags $posFile $dir $cppOpt > $tmpfile`\n";
	`getPeakTags "$posFile" -vcf "$vcfFile" $vcfOpt > "$tmpfile"`;
	
	open IN, $tmpfile;
	my $rowNum = 0;
	while (<IN>) {
		chomp;
		my @fields = split /\t/;
		if (++$rowNum == 1) {
			# first line: second column lists the individuals; stop if it is empty
			last if ($fields[1] eq '');
			@allIndividuals = split /\,/,$fields[1];
			next;
		}
		my $peakID = $fields[0];
		next unless (exists($peaks{$peakID}));
		next if (@fields < 2);
		next if ($fields[1] eq '');
		$peaks{$peakID}->{'snps'} = $editDistanceFlag
				? [split(/\,/,$fields[1])]
				: $fields[1];
	}
	close IN;
	`rm "$tmpfile"`;
}
	

print STDERR "\tCounting Tags in Peaks from each directory...\n";
my @newDirs = ();
my %tagTotals = ();
my %normFactors = ();
my @allDirs = ();
my %dirTypes = ();

# Gather every data source into @allDirs (tag directories first, then input
# directories, bedGraph files and wiggle files) and record its kind in %dirTypes.
foreach my $source (
		["tagDir",   \@tagDirs],
		["inputDir", \@inputDirs],
		["bedGraph", \@bedGraphFiles],
		["wiggle",   \@wigFiles]) {
	my ($kind, $members) = @$source;
	foreach my $member (@$members) {
		push(@allDirs, $member);
		$dirTypes{$member} = $kind;
	}
}

# Count tags in every peak for each data source by running getPeakTags in
# forked child processes, at most $maxCPUs at a time.  Each job writes to its
# own temp file ("$tmpfile.cpu<i>", stored in @files at the same index as the
# source in @allDirs).
my $cpus = 0;
my @files = ();
for (my $i=0;$i<@allDirs;$i++) {
	my $dir = $allDirs[$i];
	my $type = $dirTypes{$dir};

	# only tag/input directories are passed to readTagInfo; other sources keep a total of 0
	my $tagTotal = 0;
	if ($type eq 'tagDir' || $type eq 'inputDir') {
		($tagTotal) = HomerConfig::readTagInfo($dir,$init2One);
	}
	
	$tagTotals{$dir} = $tagTotal;
	$normFactors{$dir} = 1;
	
	# assemble the getPeakTags command line options
	my $cppOpt = "";
	$cppOpt .= " -strand $strandFlag ";

	if ($ratioFlag==1) {
		$cppOpt .= " -ratio";
	}
	if ($fragLength eq 'auto') {
		$cppOpt .= " -tagAdjust auto";
	} else {
		my $dirHalfFragLength = floor($fragLength/2);
		$cppOpt .= " -tagAdjust $dirHalfFragLength";
	}
	$cppOpt .= " -tbp $init2One";

	if ($nfrFlag) {
		$cppOpt .= " -nfr -nfrSize $nfrSize";
	}

	if ($size eq 'given') {
		$cppOpt .= " -fixed";
	} else {
		$cppOpt .= " -start -$halfSize -end $halfSize";
	}

	my $inputFile = "\"$dir\"";
	if ($type eq 'bedGraph') {
		$inputFile = "-bedGraph $inputFile";
	} elsif ($type eq 'wiggle') {
		$inputFile = "-wig $inputFile";
	}

	my $file = $tmpfile . ".cpu$i";
	push(@files, $file);

	my $pid = fork();
	if (!defined($pid)) {
		# fork() returns undef on failure.  Since undef == 0 is true, the
		# parent would otherwise run the child branch below and exit.
		# Run this job in the foreground instead.
		print STDERR "\t!!! Warning: fork() failed ($!) - running getPeakTags for $dir without forking\n";
		`getPeakTags "$posFile" $inputFile $cppOpt > "$file"`;
		next;
	}
	$cpus++;
	if ($pid == 0) {
		# child: run the job and terminate
		#print STDERR "`getPeakTags $posFile $inputFile $cppOpt > $tmpfile`;\n";
		`getPeakTags "$posFile" $inputFile $cppOpt > "$file"`;
		exit;
	}
	# parent: once the limit is reached, block until one child finishes
	if ($cpus >= $maxCPUs) {
		wait();
		$cpus--;
	}
}
# reap all remaining children; wait() returns -1 when none are left
my $id = 0;
while ($id >= 0) {
	$id = wait();
}
	
# Read the per-source getPeakTags results (one temp file per entry of
# @allDirs, listed in @files) into $peaks{id}->{'t'}->{source}.  When a local
# background size is requested ($local > 0 with a fixed $size), a second count
# over +/- $halfLocal is stored under the key "<source>-x<factor>".
# @newDirs receives every column name in output order.
for (my $i=0;$i<@allDirs;$i++) {

	# lexical copies; these were previously assigned as package globals
	my $dir = $allDirs[$i];
	my $type = $dirTypes{$dir};

	my $tagTotal = 0;
	if ($type eq 'tagDir' || $type eq 'inputDir') {
		($tagTotal) = HomerConfig::readTagInfo($dir,$init2One);
	}
	my $inputFile = "\"$dir\"";
	if ($type eq 'bedGraph') {
		$inputFile = "-bedGraph $inputFile";
	} elsif ($type eq 'wiggle') {
		$inputFile = "-wig $inputFile";
	}

	# columns as read here: 0=peak ID, 1=value
	open IN, $files[$i];
	while (<IN>) {
		chomp;
		my @line = split /\t/;
		my $hit = $line[0];
		my $v = $line[1];
		$peaks{$hit}->{'t'}->{$dir} = $v;
	}
	close IN;
	`rm "$files[$i]"`;
	push(@newDirs, $dir);

	if ($local > 0 && $size ne 'given') {
		my $factor = sprintf("%.1f", $local/$size);
		my $dirBack = $dir . "-x$factor";

		# Build the option string from scratch on every iteration.  The old
		# code appended to a $preOpt that is not in scope in this loop, so it
		# silently used an empty package variable: the strand, ratio,
		# tagAdjust and tbp options were lost and the -start/-end pair was
		# appended again for every source.
		my $preOpt = " -strand $strandFlag ";
		if ($ratioFlag==1) {
			$preOpt .= " -ratio";
		}
		if ($fragLength eq 'auto') {
			$preOpt .= " -tagAdjust auto";
		} else {
			$preOpt .= " -tagAdjust " . floor($fragLength/2);
		}
		$preOpt .= " -tbp $init2One";
		$preOpt .= " -start -$halfLocal -end $halfLocal";
		`getPeakTags "$posFile" $inputFile $preOpt > "$tmpfile"`;

		open IN, $tmpfile;
		while (<IN>) {
			chomp;
			my @line = split /\t/;
			my $hit = $line[0];
			my $v = $line[1];
			$peaks{$hit}->{'t'}->{$dirBack} = $v;
		}
		close IN;
		`rm "$tmpfile"`;
		# the background column inherits the total and type of its source
		$tagTotals{$dirBack} = $tagTotal;
		push(@newDirs, $dirBack);
		$dirTypes{$dirBack} = $type;
	}
}

@allDirs = @newDirs;

# Scale raw tag counts so that every tag/input directory has the same
# effective total ($normValue).  Skipped in ratio mode, when adjustment is
# turned off, or in NFR mode.  Optional transforms are applied afterwards:
# FPKM (per 1000 bp of peak length), log2 and square root.
if ($ratioFlag == 0 && $adjustFlag == 1 && $nfrFlag == 0) {
	if ($normValue < 1) {
		# no usable normalization total: use the mean tag total of the tag/input directories
		my $nn = 0;
		foreach my $d (@allDirs) {
			next if ($dirTypes{$d} ne 'tagDir' && $dirTypes{$d} ne 'inputDir');
			$normValue += $tagTotals{$d};
			$nn++;
		}
		$normValue /= $nn if ($nn > 0);
	}
	my @peaks = keys %peaks;
	foreach my $dir (@allDirs) {
		next if ($dirTypes{$dir} ne 'tagDir' && $dirTypes{$dir} ne 'inputDir');
		my $total = $tagTotals{$dir};
		next if ($total < 1);
		my $ratio = $normValue / $total;
		print STDERR "\tRatio for $dir : $ratio\n";
		# the factor reported in the output header is rounded to 2 decimals
		$normFactors{$dir} = sprintf("%.2f",$ratio);
		foreach my $peakID (@peaks) {
			next if (!exists($peaks{$peakID}->{'t'}->{$dir}));
			my $peakLength = $peaks{$peakID}->{'e'} - $peaks{$peakID}->{'s'};
			my $v = $peaks{$peakID}->{'t'}->{$dir} * $ratio;
			#print STDERR "dir=$dir v=$v\n";
			if ($fpkmFlag) {
				$v *= 1000.0/$peakLength;
			}
			if ($logFlag) {
				# jitter: pick a random point between the transformed value
				# and the transformed value of one more (scaled) tag
				my $v1 = log(1+$v)/log(2);
				my $v2 = log(1+$v+1*$ratio)/log(2);
				$v = $v1 + ($v2-$v1)*rand();
			} 
			if ($sqrtFlag) {
				# Same jitter scheme as the log branch.  A leftover
				# "$v = sqrt($v+rand()*$ratio);" used to run first, which
				# made the result the square root of a square root.
				my $v1 = sqrt($v);
				my $v2 = sqrt($v+$ratio);
				$v = $v1 + ($v2-$v1)*rand();
			}
			$peaks{$peakID}->{'t'}->{$dir} = sprintf("%.2f",$v);
		}
	}
}

# input directories are not reported as their own columns: drop them from @allDirs
@newDirs = grep { $dirTypes{$_} ne 'inputDir' } @allDirs;
@allDirs = @newDirs;


my %ug2gene = ();
my $convFile = $homeDir . "/data/accession/$organism" . "2gene.tsv";
my @geneData = ();
my @geneDataHeader = ();
my %acc2gene = ();


print STDERR "\tOrganism: $organism\n";

# Load the accession -> gene ID table and the per-gene description table for
# the organism.  Skipped when gene annotation is off or the organism is unknown.
if ($noGeneFlag == 0 && $organism ne 'unknown') {

	print STDERR "\tLoading Gene Informaiton...\n";

	# conversion table: column 0 -> column 1, first occurrence of each key wins
	open IN, $convFile;
	while (<IN>) {
		chomp;
		my @cols = split /\t/;
		$acc2gene{$cols[0]} = $cols[1] unless (exists($acc2gene{$cols[0]}));
	}
	close IN;

	# description table: one record per gene ID in %gene
	my $descriptionFile = $homeDir . "/data/accession/$organism.description";
	open IN, $descriptionFile;
	while (<IN>) {
		chomp;
		s/\r//g;
		my @cols = split /\t/;
		my $gid = $cols[0];
		$gene{$gid} = {} if (!exists($gene{$gid}));
		my $record = $gene{$gid};
		# column 6 of the file is not stored
		@{$record}{'gid','ug','refseq','ensembl','name','alias','chr','desc'}
				= @cols[0,1,2,3,4,5,7,8];
		# the gene type column is optional
		$record->{'ttype'} = (@cols > 9) ? $cols[9] : "NA";
	}
	close IN;
}

# Load each user supplied gene data file.  For every file, one header array
# ref is pushed onto @geneDataHeader and one lookup hash ref onto @geneData.
# Nothing is loaded when $noGeneFlag is set.
foreach my $dataFile (@geneDataFiles) {
	last if ($noGeneFlag);

	# known organism: run the file through convertIDs.pl; otherwise use it as is
	if ($organism eq 'unknown') {
		`cp "$dataFile" "$tmpfile"`;
	} else {
		#print STDERR "`convertIDs.pl $geneDataFiles[$j] $organism $promoterIDtype yes yes yes > $tmpfile`;\n";
		`convertIDs.pl "$dataFile" $organism $promoterIDtype yes yes yes > "$tmpfile"`;
	}

	my @currentHeader = ();
	my %geneData = ();
	my $rowNum = 0;

	open IN, $tmpfile;
	while (<IN>) {
		chomp;
		s/\r//g;
		my @fields = split /\t/;
		if (++$rowNum == 1) {
			# header row: keep every column name except the first
			@currentHeader = @fields[1..$#fields];
			next;
		}
		# the first column is removed from the row; the next column becomes
		# element 0 of the stored row and is used as a second lookup key
		my $id = shift @fields;
		my $oid = $fields[0];
		# pad short rows with empty strings up to the header width
		push(@fields, "") while (scalar(@fields) < scalar(@currentHeader));

		# first row seen for a key wins; both keys share the same row array
		$geneData{$oid} = \@fields unless (exists($geneData{$oid}));
		if (!exists($geneData{$id}) && $id ne '') {
			$geneData{$id} = \@fields;
		}
	}
	close IN;

	push(@geneDataHeader, \@currentHeader);
	push(@geneData, \%geneData);
	`rm "$tmpfile"`;
}

# Overlap peaks with GWAS catalog entries.  Every catalog line is written to
# a temporary peak file under a synthetic ID ("gwas1", "gwas2", ...), merged
# against the peak file with mergePeaks, and each overlapping "snp|study"
# label is appended (comma separated) to $peaks{id}->{'gwas'}.
if ($gwasCatalog ne '') {
	print STDERR "\tChecking for GWAS risk SNP overlap\n";
	open IN, $gwasCatalog;
	open OUT, ">$tmpfile";
	my %gwas = ();
	my $idCount = 1;
	while (<IN>) {
		chomp;
		s/\r//g;
		my @fields = split /\t/;
		my $id = 'gwas' . $idCount++;
		# catalog columns used: 1=chr, 2=start, 3=end, 4=SNP, 10=study
		my ($chr, $start, $end, $snp, $study) = @fields[1,2,3,4,10];
		my $strand = "+";
		$gwas{$id} = {c=>$chr,s=>$start,e=>$end,d=>$strand,snp=>"$snp|$study"};
		print OUT "$id\t$chr\t$start\t$end\t$strand\n";
	}
	close OUT;
	close IN;
	#print STDERR "`mergePeaks -d given $posFile $tmpfile > $tmpfile2 2> /dev/null`;\n";
	`mergePeaks -d given "$posFile" "$tmpfile" > "$tmpfile2" 2> /dev/null`;

	# merged rows with at least 10 columns carry the comma separated peak IDs
	# in column 8 and the synthetic GWAS IDs in column 9
	open IN, $tmpfile2;
	while (<IN>) {
		chomp;
		s/\r//g;
		my @fields = split /\t/;
		next if (@fields < 10);
		my @mergedPeakIDs = split /\,/, $fields[8];
		my @mergedSnpIDs = split /\,/, $fields[9];

		foreach my $pid (@mergedPeakIDs) {
			next unless (exists($peaks{$pid}));

			foreach my $snpID (@mergedSnpIDs) {
				next unless (exists($gwas{$snpID}));

				my $label = $gwas{$snpID}->{'snp'};
				if (exists($peaks{$pid}->{'gwas'})) {
					$peaks{$pid}->{'gwas'} .= "," . $label;
				} else {
					$peaks{$pid}->{'gwas'}= $label;
				}
			}
		}
	}
	close IN;

	`rm -f "$tmpfile" "$tmpfile2"`;

}

# advanced normalization, requires R and DESeq2
# The rounded tag counts of all peaks are written to a temporary table, a
# generated R script applies rlog() or vst() from DESeq2, and the transformed
# values are read back into $peaks{id}->{'t'}->{source}.  Samples are named
# S0, S1, ... in the order of @allDirs, each in its own treatment group.
if ($advNormMethod ne '') {

	my $RdataFile = $rand . ".R.data.in.txt";
	my $RgroupFile = $rand . ".R.groups.in.txt";
	my $RscriptFile = $rand . ".R.script.R";
	my $RoutputFile = $rand . ".R.data.out.txt";
	open RDATA, ">$RdataFile";
	open RGROUP, ">$RgroupFile";
	open RSCRIPT, ">$RscriptFile";

	print RDATA "GeneIDs";
	print RGROUP "Experiment\tTreatment\n";
	for (my $i=0;$i<@allDirs;$i++) {
		print RDATA "\tS$i";
		print RGROUP "S$i\tT$i\n";
	}
	print RDATA "\n";

	# one row per peak; counts are rounded to integers, missing values become 0
	foreach my $hit (keys %peaks) {
		print RDATA "$hit";
		foreach my $D (@allDirs) {
			my $v = 0;
			if (exists($peaks{$hit}->{'t'}->{$D})) {
				$v = floor($peaks{$hit}->{'t'}->{$D}+0.5);
			}
			print RDATA "\t$v";
		}
		print RDATA "\n";
	}
	close RDATA;
	close RGROUP;

	print RSCRIPT "##### Temporary R script generated by analyzeRepeats.pl (HOMER)\n";
	print RSCRIPT "library(DESeq2)\n";
	print RSCRIPT "countData <- read.delim(\"$RdataFile\")\n";
	print RSCRIPT "colData <- read.delim(\"$RgroupFile\")\n\n";
	print RSCRIPT "dds <- DESeqDataSetFromMatrix(countData, colData,design=~Treatment,tidy=TRUE)\n";
	if ($advNormMethod eq 'rlog') {
		print RSCRIPT "norm <- rlog(dds,blind=TRUE)\n";
	} elsif ($advNormMethod eq 'vst') {
		print RSCRIPT "norm <- vst(dds,blind=TRUE)\n";
	}
	print RSCRIPT "norm_matrix <- assay(norm)\n";
	print RSCRIPT "norm_df <- data.frame(Gene=rownames(norm_matrix), norm_matrix)\n";
	print RSCRIPT "write.table(norm_df, \"$RoutputFile\", row.names = FALSE,sep=\"\\t\")\n";

	print STDERR "\n\tPerforming variance stabalization ($advNormMethod)...\n\n";
	my $nsamples = scalar(@allDirs);
	if ($nsamples > 20 && $advNormMethod eq 'rlog') {
		print STDERR "\t!!! Warning, number of samples = $nsamples (>20). rlog transformation may take\n";
		print STDERR "\t    a long time.  Consider using -vst option instead if you are in a hurry.\n";
	}
	`R --no-save < "$RscriptFile" $showRstderr`;

	if (open IN, $RoutputFile) {
		my $z = 0;
		while (<IN>) {
			$z++;
			# skip the header row written by write.table
			next if ($z < 2);
			chomp;
			s/\r//g;
			my @line = split /\t/;
			my $hit = shift @line;
			# remove the quotes found around the ID in the R output
			$hit =~ s/\"//g;
			if (!exists($peaks{$hit})) {
				print STDERR "! Can't recognize $hit\n";
				next;
			}
			# columns follow the order of @allDirs; only existing values are replaced
			for (my $i=0;$i<@allDirs;$i++) {
				my $D = $allDirs[$i];
				if (exists($peaks{$hit}->{'t'}->{$D})) {
					$peaks{$hit}->{'t'}->{$D} = $line[$i];
				}
			}
		}
		close IN;
	} else {
		# Two separate statements.  The first print used to end in "." rather
		# than ";", so the second print ran first and its return value (1)
		# was appended to the first message.
		print STDERR "!!!Error: Something probably went wrong with running rlog with R/DESeq2\n";
		print STDERR "\tEither there are too few samples (2 is not enough), or something is wrong with the R/DEseq2 install\n";
	}
	`rm -f "$RdataFile" "$RgroupFile" "$RscriptFile" "$RoutputFile"`;
}

# Print the header line of the annotation table to STDOUT.
# Columns are written in a fixed order and each optional group is only emitted
# when its flag/list is set.  The per-peak row loop further down appends its
# values in this same order, so any change here has to be mirrored there.

print STDERR "\tOutputing Annotation File...\n";
# Fixed leading columns; the first header cell also records the command line ($cmd).
print "PeakID (cmd=$cmd)\tChr\tStart\tEnd\tStrand";
if ($tssFlag == 0) {
	print "\tPeak Score";
} else {
	#print "\tCAGE/EST Tag Count";
	print "\tNot Used";
}
print "\tFocus Ratio/Region Size";

# Twelve gene-annotation columns (skipped when $noGeneFlag is set).
if ($noGeneFlag == 0) {
	print "\tAnnotation\tDetailed Annotation"
			. "\tDistance to TSS\tNearest PromoterID\tEntrez ID\tNearest Unigene\tNearest Refseq"
			. "\tNearest Ensembl\tGene Name\tGene Alias\tGene Description\tGene Type";
}

if ($gwasCatalog ne '') {
	print "\tOverlapping GWAS Risk SNPs";
}

if ($cpgFlag==1) {
	print "\tCpG%\tGC%";
}
if ($consFlag) {
	print "\tConservation Average (0=low to 1=high conservation)";
}

# Two columns per comparison genome.
if ($cmpGenomeFlag) {
	for (my $i=0;$i<@cmpGenomes;$i++) {
		print "\t% of peak aligned in $cmpGenomes[$i]";
		print "\t% alignment in $cmpGenomes[$i]";
	}
}

# Either one edit-distance column per individual, or a single SNP column whose
# header lists all individuals separated by commas.
if (@allIndividuals > 0) {
	if ($editDistanceFlag) {
		foreach(@allIndividuals) {
			print "\t$_";
			print " Edit Distance from ref";
		}
	} else {
		# NOTE(review): the "(" opened here is never closed in the header text --
		# confirm whether a trailing ")" was intended.
		print "\tSNPs (";
		my $c = 0;
		foreach(@allIndividuals) {
			print "," if ($c>0);
			$c++;
			print "$_";
		}
	}
}

# One column per tag directory / bedGraph / wiggle source, labelled by its type.
foreach(@allDirs) { 
	if ($dirTypes{$_} eq 'tagDir') {
		if ($fpkmFlag) {
			print "\t$_ FPKM";
		} else {
			print "\t$_ Tag Count in $size bp ($tagTotals{$_} Total, normalization factor = $normFactors{$_}, effective total = $normValue)";
		}
	} elsif ($dirTypes{$_} eq 'bedGraph') {
		print "\t$_ bedGraph avg over $size bp";
	} elsif ($dirTypes{$_} eq 'wiggle') {
		print "\t$_ wig avg over $size bp";
	}
}
# Side effect (prints nothing to STDOUT): when input directories were given,
# overwrite each tag-directory value in %peaks with
# log2((value + pseudo) / (input value + pseudo)).
# Input i is paired with tag directories i, i+ninput, i+2*ninput, ...
if (@inputDirs > 0) {
	print STDERR "\tNormalizing tag counts by input (pseudo=$pseudo)\n";
	my $ninput = scalar(@inputDirs);
	foreach(keys %peaks) {
		my $hit = $_;
		for (my $i=0;$i<$ninput;$i++) {
			my $input = $inputDirs[$i];
			my $iv = $peaks{$hit}->{'t'}->{$input};
			for (my $j=$i;$j<@tagDirs;$j+=$ninput) { 
				my $dir = $tagDirs[$j];
				my $v = $peaks{$hit}->{'t'}->{$dir};
				my $vv = log(($v+$pseudo)/($iv+$pseudo))/log(2.0);
				#print STDERR "$dir/$input $v $iv $pseudo = $vv\n";
				$peaks{$hit}->{'t'}->{$dir} = $vv;
			}
		}
	}
	
}
foreach(@peakFiles) {
	print "\t$_ Distance to nearest Peak, Peak ID";
}
foreach(@seFiles) {
	print "\t$_ Super Enhancer Rank";
}
# Per motif: one main column, then one "Found in" column per comparison genome,
# then (when reverse liftover data exist) a motif column plus a "Found in
# $genome" column per comparison genome.
foreach(@motifNames) {
	if ($mscoreFlag==1) {
		print "\t$_ Best Motif log-odds Score";
	} else {
		print "\t$_ Distance From Peak(sequence,strand,conservation)";
	}
	if ($cmpGenomeFlag) {
		for (my $i=0;$i<@cmpGenomes;$i++) {
			print "\tFound in $cmpGenomes[$i]";
		}
	}
	# NOTE(review): the row loop only writes these columns when $cmpGenomeFlag is
	# also set; here they are not nested under that flag -- confirm the two
	# always go together.
	if (@revLiftover > 0) {
		for (my $i=0;$i<@cmpGenomes;$i++) {
			if ($mscoreFlag==1) {
				print "\t$_ Best Motif log-odds Score in $cmpGenomes[$i]";
			} else {
				print "\t$_ Distance From Peak(sequence,strand,conservation) in $cmpGenomes[$i]";
			}
			print "\tFound in $genome";
		}
	}
}
# Extra columns taken from the headers of user-supplied gene data files.
foreach(@geneDataHeader) {
	foreach(@$_) {
		print "\t$_";
	}
}
print "\n";

# Choose the row order: peaks are sorted by descending score ('v').
# A numeric sort is used only if every score consists solely of the characters
# digit, '.', '+', '-' and 'e' and is not a lone '.'; otherwise all scores are
# compared as strings.
my $sortFlag = 'num';
foreach my $id (keys %peaks) {
	my $score = $peaks{$id}->{'v'};
	my $looksNumeric = ($score =~ /^[\-\d\.\+e]+$/) && ($score ne '.');
	$sortFlag = 'str' unless ($looksNumeric);
}
my @hits = ();
if ($sortFlag ne 'num') {
	@hits = sort {$peaks{$b}->{'v'} cmp $peaks{$a}->{'v'}} keys %peaks;
} else {
	@hits = sort {$peaks{$b}->{'v'} <=> $peaks{$a}->{'v'}} keys %peaks;
}


my $totalRows = 0;
my $numBadRows = 0;


# Build and print one tab-delimited row per peak, in the order stored in @hits.
# Values are appended in the same order as the header columns printed above.
# $numBlanks counts missing values in selected columns (CpG/GC, conservation,
# tag counts, peak-file distances, gene data).  When $noblanksFlag is set, rows
# with any such missing value are dropped and tallied in $numBadRows.
foreach(@hits) {
	my $numBlanks =0;
	my $rowStr = '';

	my $hit = $_;
	my $chr = $peaks{$hit}->{'c'};
	my $start = $peaks{$hit}->{'s'};
	my $end = $peaks{$hit}->{'e'};
	my $dir = $peaks{$hit}->{'d'};
	my $v = $peaks{$hit}->{'v'};
	my $fdr = $peaks{$hit}->{'fdr'};
	my $ann = $peaks{$hit}->{'ann'};
	my $fullann = $peaks{$hit}->{'fullann'};
	my $cons = 'NA';
	my $peakLength = $end - $start;
	$peakLength = 1 if ($peakLength < 1);
	# The stored conservation value is divided by 9.0 before printing (the header
	# advertises a 0-to-1 scale).
	if ($peaks{$hit}->{'cons'} ne 'NA') {
		$cons = sprintf("%.2f",$peaks{$hit}->{'cons'}/9.0);
	}
	my $tssID = $peaks{$hit}->{'tss'};
	my $dist = $peaks{$hit}->{'tssDist'};


	# Seven fixed leading columns.
	$rowStr .= "$hit\t$chr\t$start\t$end\t$dir\t$v\t$fdr";
	if ($noGeneFlag == 0) {
		$rowStr .= "\t$ann\t$fullann\t$dist";
		my $ogID = $tssID;
		# If the promoter ID is unknown, retry with a trailing ".<digits>" suffix removed.
		my $tssAltID = $tssID;
		$tssAltID =~ s/\.\d+?$//;
		if (!exists($acc2gene{$tssID}) && exists($acc2gene{$tssAltID})) {
			$tssID = $tssAltID;
		}
		if (exists($acc2gene{$tssID}) && $acc2gene{$tssID} ne 'NA' && exists($gene{$acc2gene{$tssID}})) {
			my $gid = $acc2gene{$tssID};
			my $ugid = $gene{$gid}->{'ug'};
			my $refseq = $gene{$gid}->{'refseq'};
			my $embl = $gene{$gid}->{'ensembl'};
			my $gname = $gene{$gid}->{'name'};
			my $alias = $gene{$gid}->{'alias'};
			my $desc = $gene{$gid}->{'desc'};
			my $ttype = $gene{$gid}->{'ttype'};
			$rowStr .= "\t$ogID\t$gid\t$ugid\t$refseq\t$embl\t$gname\t$alias\t$desc\t$ttype";
		} else {
			# Unknown gene: keep the original promoter ID and leave the other eight columns empty.
			$rowStr .= "\t$ogID\t\t\t\t\t\t\t\t";
		}
	}
	if ($gwasCatalog ne '') {
		if (exists($peaks{$hit}->{'gwas'})) {
			$rowStr .= "\t$peaks{$hit}->{'gwas'}";
		} else {
			$rowStr .= "\t";
		}
	}
	if ($cpgFlag==1) {
		$rowStr .= "\t$peaks{$hit}->{'cpg'}\t$peaks{$hit}->{'gc'}";
		$numBlanks++ if ($peaks{$hit}->{'cpg'} eq 'NA' || $peaks{$hit}->{'cpg'} eq '');
		$numBlanks++ if ($peaks{$hit}->{'gc'} eq 'NA' || $peaks{$hit}->{'gc'} eq '');
	}
	if ($consFlag==1) {
		$rowStr .= "\t$cons";
		$numBlanks++ if ($cons eq 'NA' || $cons eq '');
	}
	# Two values per comparison genome; which stored fields are used depends on $skipBlastn.
	if ($cmpGenomeFlag) {
		for (my $i=0;$i<@cmpGenomes;$i++) {
			my $v1 = "NA";
			my $v2 = $peaks{$hit}->{'gComp'}->[$i]->{'map'};
			if ($skipBlastn == 0) {
				$v1 = $peaks{$hit}->{'gComp'}->[$i]->{'pid'};
				$v2 = $peaks{$hit}->{'gComp'}->[$i]->{'paln'};
			}
			$rowStr .= "\t$v1\t$v2";
		}
	}
	if (@allIndividuals > 0) {
		if ($editDistanceFlag) {
			# One edit-distance value per individual; 0 when nothing is stored.
			for (my $i=0;$i<@allIndividuals;$i++) {
				my $v = 0;
				if (exists($peaks{$hit}->{'snps'})) {
					if (scalar(@{$peaks{$hit}->{'snps'}}) > $i) {
						$v = $peaks{$hit}->{'snps'}->[$i];
					}
				}
				$rowStr .= "\t$v";
			}
		} else {
			$rowStr .= "\t";
			if (exists($peaks{$hit}->{'snps'})) {
				$rowStr .= $peaks{$hit}->{'snps'};
			}
		}
	}
	foreach(@allDirs){ 
		my $D = $_;
		if (exists($peaks{$hit}->{'t'}->{$D})) {
			my $v = $peaks{$hit}->{'t'}->{$D};
			$rowStr .= "\t$v";
		} else {
			$rowStr .= "\t";
			$numBlanks++;
		}
	}
	# Peak-file columns: the value reported depends on $pCountFlag / $pDistFlag.
	foreach(@peakFiles) {
		my $str = '';
		if (exists($peaks{$hit}->{'p'}->{$_})) {
			if ($pCountFlag==1) {
				$str = $peaks{$hit}->{'p'}->{$_}->{'s'};
			} elsif ($pDistFlag==1) {
				$str = abs($peaks{$hit}->{'p'}->{$_}->{'d'});
			} elsif ($pDistFlag==2) {
				$str = $peaks{$hit}->{'p'}->{$_}->{'d'};
			} else {
				$str = $peaks{$hit}->{'p'}->{$_}->{'d'} . "," . $peaks{$hit}->{'p'}->{$_}->{'id'};
			}
		} else {
			$numBlanks++;
		}
		$rowStr .= "\t$str";
	}
	for (my $j=0;$j<@seFiles;$j++) {
		my $str = '';
		if (exists($seRanks[$j]->{$hit})) {
			$str = $seRanks[$j]->{$hit};
		}
		$rowStr .= "\t$str";
	}
	foreach(@motifNames) {
		my $str = '';
		my $motif = $_;
		my $nmotifs = 0;
		my $minDist2Motif = '';
		if (exists($peaks{$hit}->{'m'})) {
			if (exists($peaks{$hit}->{'m'}->{$motif})) {
				my @pos = sort {$a->{'p'} <=> $b->{'p'}} @{$peaks{$hit}->{'m'}->{$motif}};	
				my $c = 0;
				foreach(@pos) {
					# With $mscoreFlag only the score of the first hit (after sorting by
					# position) is reported.
					if ($mscoreFlag) {
						my $score = $_->{'score'};
						$str .= $score;
						last;
					} 
					next if ($removeCloseMotifs && $_->{'valid'}==0);
					$nmotifs++;
					$str .= ',' if ($c > 0);
					$c++;
					my $con = sprintf("%.2f",$_->{'c'}/9.0);
					$str .= $_->{'p'} . "(" . $_->{'s'} . "," . $_->{'d'} . "," . $con . ")";
					# Track the hit position with the smallest absolute offset.
					if ($minDist2Motif eq '') {
						$minDist2Motif = $_->{'p'};
					} elsif (abs($_->{'p'}) < abs($minDist2Motif)) {
						$minDist2Motif = $_->{'p'};
					}
				}
			}
		}

		# Optional output modes replace the hit list with a count or the closest offset.
		if ($nscoreFlag == 1) {
			$str = $nmotifs;
		} 
		if ($mdistFlag == 1) {
			$str = $minDist2Motif;
		}
		$rowStr .= "\t$str";
		if ($cmpGenomeFlag) {
			# Per comparison genome: mean of the per-hit 'm' values for this motif.
			for (my $i=0;$i<@cmpGenomes;$i++) {
				my $str = '';
				if (exists($peaks{$hit}->{'m'})) {
					if (exists($peaks{$hit}->{'m'}->{$motif})) {
						my $total = 0;
						my $match = 0;

						foreach(@{$peaks{$hit}->{'m'}->{$motif}}) {
							$match += $_->{'gComp'}->[$i]->{'m'};
							$total++;
						}
						if ($total < 1) {
							$match = -1;
						} else {
							$match /= $total;
						}
						$str = $match;
					}
				}
				$rowStr .= "\t$str";
			}

			# Same two outputs again, taken from the comparison-genome peaks (@cpeaks).
			if (@revLiftover > 0) {
				for (my $i=0;$i<@cmpGenomes;$i++) {
					my $str = '';
					my $nmotifs = 0;
					my $minDist2Motif = '';
					if (exists($cpeaks[$i]->{$hit}->{'m'})) {
						if (exists($cpeaks[$i]->{$hit}->{'m'}->{$motif})) {
							my @pos = sort {$a->{'p'} <=> $b->{'p'}} @{$cpeaks[$i]->{$hit}->{'m'}->{$motif}};	
							my $c = 0;
							foreach(@pos) {
								if ($mscoreFlag) {
									my $score = $_->{'score'};
									$str .= $score;
									last;
								} 
								next if ($removeCloseMotifs && $_->{'valid'}==0);
								$nmotifs++;
								$str .= ',' if ($c > 0);
								$c++;
								#my $con = sprintf("%.2f",$_->{'c'}/9.0);
								my $con = 0;
								$str .= $_->{'p'} . "(" . $_->{'s'} . "," . $_->{'d'} . "," . $con . ")";
								if ($minDist2Motif eq '') {
									$minDist2Motif = $_->{'p'};
								} elsif (abs($_->{'p'}) < abs($minDist2Motif)) {
									$minDist2Motif = $_->{'p'};
								}
							}
						}
					}
		
					if ($nscoreFlag == 1) {
						$str = $nmotifs;
					} 
					if ($mdistFlag == 1) {
						$str = $minDist2Motif;
					}
					$rowStr .= "\t$str";
					$str = '';
					if (exists($cpeaks[$i]->{$hit}->{'m'})) {
						if (exists($cpeaks[$i]->{$hit}->{'m'}->{$motif})) {
							my $total = 0;
							my $match = 0;
	
							foreach(@{$cpeaks[$i]->{$hit}->{'m'}->{$motif}}) {
								$match += $_->{'gComp'}->[0]->{'m'};
								$total++;
							}
							if ($total < 1) {
								$match = -1;
							} else {
								$match /= $total;
							}
							$str = $match;
						}
					}
					$rowStr .= "\t$str";
				}
			}
		}
	}

	# User-supplied gene data, keyed by the (possibly suffix-stripped) promoter ID.
	# When absent, pad with one empty cell per header column.
	for (my $i=0;$i<@geneData;$i++) {
		if (exists($geneData[$i]->{$tssID})) {
			foreach(@{$geneData[$i]->{$tssID}}) {
				$rowStr .= "\t$_";
			}
		} else {
			$numBlanks++;
			foreach(@{$geneDataHeader[$i]}) {
				$rowStr .= "\t";
			}
		}
	}
			
	$rowStr .= "\n";


	$totalRows++;
	# Count the dropped row BEFORE skipping it; the counter used to sit after
	# `next` and was never reached, so the summary always reported 0 removed rows.
	if ($noblanksFlag == 1 && $numBlanks > 0) {
		$numBadRows++;
		next;
	}

	print $rowStr;
		
}
if ($noblanksFlag == 1) {
	print STDERR "\tRemoved $numBadRows with missing data (out of $totalRows)\n";
}

# Remove the temporary intermediate files of this run.
# unlink() is used instead of shelling out to `rm`, so no shell is spawned and
# file names containing quotes or `$` cannot be mangled by shell quoting.
unlink($posFile);
unlink($seqFile) if ($seqFlag);
unlink($consFile) if ($consFlag);
print STDERR "\tDone annotating peaks file\n\n";
deleteFiles();

# Delete every temporary file registered as a key of %toDelete.
# Files are removed with unlink() rather than through a shell `rm`, so names
# containing `$`, quotes or backticks are deleted correctly.  Files that are
# already gone are silently ignored.
sub deleteFiles {
	foreach my $file (keys %toDelete) {
		unlink($file);
	}
}

# Summarise the combination and order of motifs ("CRM code") found in each peak
# and write a report to $mlogicFile.
#
# Arguments:
#   $peaks      - hash ref: peak ID => record with 'c','s','e','d' fields and
#                 'm' => { motif name => [ hits ] }; each hit is read for its
#                 'p' (position), 'd' (strand) and 'valid' fields
#   $motifNames - array ref of motif names; motif i is encoded as chr(65+i) on
#                 the + strand and chr(97+i) on the - strand
#   $strand     - when 'both', a code and its reverse-opposite are treated as
#                 the same arrangement (the string-wise smaller one is kept)
#   $mlogicFile - path of the report file to create
#
# The report has three sections: the motif/code legend, a table of codes with
# substring-based and exact peak counts, and the code assigned to every peak.
# If the report file cannot be opened, a message is printed to STDERR and
# nothing is written.
sub calcMotifLogic {

	my ($peaks,$motifNames,$strand,$mlogicFile) = @_;

	my @codes = ();
	my @rcodes = ();
	for (my $i=0;$i<@$motifNames;$i++) {
		my $c = chr(65+$i);
		my $rc = chr(97+$i);
		print STDERR "\tCodes: $motifNames->[$i] $i $c $rc\n";
		push(@codes,$c);
		push(@rcodes,$rc);
	}

	my %codeCounts = ();
	my $assStr = "";
	my %pCodes = ();
	my $peakTotal = 0;

	foreach my $peakID (keys %$peaks) {
		$peakTotal++;
		my @hits = ();
		for (my $i=0;$i<@$motifNames;$i++) {
			# Read the name through the reference that was passed in; the previous
			# code indexed a file-level @motifNames array and only worked when the
			# caller happened to pass that same array.
			my $mname = $motifNames->[$i];
			next if (!exists($peaks->{$peakID}->{'m'}->{$mname}));
			foreach my $site (@{$peaks->{$peakID}->{'m'}->{$mname}}) {
				next if ($site->{'valid'}==0);
				my $d = $site->{'d'};
				# Strand may be given as '-' or '1' for the reverse strand.
				my $c = ($d eq '-' || $d eq '1') ? $rcodes[$i] : $codes[$i];
				push(@hits, {p=>$site->{'p'},d=>$d,c=>$c});
			}
		}
		# Concatenate the code letters in positional order.
		@hits = sort {$a->{'p'} <=> $b->{'p'} } @hits;
		my $codeStr = join('', map { $_->{'c'} } @hits);

		my $usedStr = $codeStr;
		if ($strand eq 'both') {
			my $rv = revOppCode($codeStr);
			$usedStr = $rv if (($codeStr cmp $rv) > 0);
		}

		$codeCounts{$usedStr}++;
		$pCodes{$peakID}= $usedStr;
		$assStr .= "$peakID\t$peaks->{$peakID}->{'c'}\t$peaks->{$peakID}->{'s'}\t$peaks->{$peakID}->{'e'}\t$peaks->{$peakID}->{'d'}\t$usedStr\n";
	}

	# For every observed code, count the peaks whose code contains it (or its
	# reverse-opposite) as a substring.
	my %codeSubs = ();
	foreach my $code (keys %codeCounts) {
		my $rcode = revOppCode($code);	
		my $total = 0;
		foreach my $c (values %pCodes) {
			$total++ if (index($c,$code) > -1 || index($c,$rcode) > -1);
		}
		$codeSubs{$code}=$total;
	}


	my $out;
	if (!open($out, '>', $mlogicFile)) {
		print STDERR "!!! Could not open $mlogicFile for writing: $!\n";
		return;
	}
	
	print $out "Motif Name\t+ strand code\t- strand code\n";
	for (my $i=0;$i<@codes;$i++) {
		print $out "$motifNames->[$i]\t$codes[$i]\t$rcodes[$i]\n";
	}
	print $out "\n\n";

	print $out "CRM codes\t# of Peaks\t% of Peaks\t# of Peaks (exact)\t% of Peaks (exact)\n";
	my @crm = sort {$codeSubs{$b} <=> $codeSubs{$a}} keys %codeCounts;
	foreach my $code (@crm) {
		# Peaks without any valid motif hit have the empty code, shown as 'None'.
		my $label = ($code eq '') ? 'None' : $code;
		print $out "$label";
		print $out "\t$codeSubs{$code}";
		print $out "\t" . sprintf("%.3lf",$codeSubs{$code}/$peakTotal);
		print $out "\t$codeCounts{$code}";
		print $out "\t" . sprintf("%.3lf",$codeCounts{$code}/$peakTotal);
		print $out "\n";
	}


	print $out "\n\n";
	print $out "PeakID\tchr\tstart\tend\tstrand\tCRM code\n";
	print $out $assStr;
	close $out;
	

}
# Return the "reverse-opposite" of a motif code string: the characters are
# reversed and every character code below 95 is raised by 32 while every other
# one is lowered by 32 (for ASCII letters this swaps upper and lower case).
sub revOppCode {
	my ($code) = @_;
	my @flipped = map {
		my $n = ord($_);
		chr($n < 95 ? $n + 32 : $n - 32);
	} split(//, scalar reverse($code));
	return join('', @flipped);
}

map_peak_to_gene_features_v3.pl

#!/usr/bin/perl

###############  slight modification of v2; adding NM_0000 ID in map outfile

use strict;



# Main program: map every peak that passes the score threshold to the gene
# features (ups / exon / intron / ds) it falls into and write one line per
# (peak, gene) pair to the output file.
#
# Output columns: chr, peak start, peak end, peak position, peak score,
# gene name, feature, exon/intron index, distance, accession, strand.

# All six positional arguments are required (the old check only caught the
# case of no arguments at all).
if(scalar(@ARGV) < 6){
   print "Usage: perl <prog> <peak file> <peak threshold> <refFlat file> <upstream distance> <downstream distance> <output file>\n";
   exit(1);
}


my $file_pk = $ARGV[0];
my $peak_threshold = $ARGV[1];
my $file_refflat = $ARGV[2];
my $ups = $ARGV[3];
my $ds = $ARGV[4];
my $output_file = $ARGV[5];

my %hash_peaks = ();
my %hash_gene_data = ();

read_seq_peaks($file_pk, $peak_threshold, \%hash_peaks, 3);
read_refflat_by_chr_all($file_refflat, \%hash_gene_data);

open(my $out_fh, ">", $output_file) || die("Err opening $output_file\n");

foreach my $chr (sort keys %hash_peaks)
{
   my @arr_peaks = sort { $a->{peakpos} <=> $b->{peakpos} } @{ $hash_peaks{$chr} };
   my %hash_peak_feature = ();   # peak key -> list of matched targets
   my %hash_peak_record = ();    # peak key -> peak record, used when printing

   ##### Map peaks to genes on the plus strand first, then on the minus strand #####
   foreach my $strand ("plus", "minus")
   {
      # Genes must be ordered by their LEFT-most coordinate, because the inner
      # loop stops at the first gene lying entirely to the right of the peak.
      # Minus-strand records hold the right-hand coordinate in tx_start and the
      # left-hand one in tx_end, so they are ordered by tx_end.  (Ordering them
      # by tx_start could miss a long gene that contains a shorter one.)
      my $left_key = ($strand eq "plus") ? "tx_start" : "tx_end";
      my @arr_genes = ();
      if(exists($hash_gene_data{$chr}) && defined($hash_gene_data{$chr}->{$strand})){
         @arr_genes = sort { $a->{$left_key} <=> $b->{$left_key} } @{ $hash_gene_data{$chr}->{$strand} };
      }

      # $start is the first gene index worth testing; it is only advanced until
      # the first peak has been assigned to a gene ($start_flg cleared).
      my $start = 0;
      my $start_flg = 1;
      for(my $peakctr = 0; $peakctr <= $#arr_peaks; $peakctr ++)
      {
         my $pk = $arr_peaks[$peakctr];
         my $peak = join("_", $chr, $pk->{start}, $pk->{end}, $pk->{peakpos}, $pk->{w_score});
         $hash_peak_record{$peak} = $pk;

         for(my $genectr = $start; $genectr <= $#arr_genes; $genectr ++)
         {
            my ($feature, $dist, $index) = assign_peak_to_gene_feature($pk->{peakpos}, $arr_genes[$genectr], $ups, $ds);

            #### Peak is left of this gene: move on to the next peak. Record genectr if no peak has been assigned yet ###
            if($feature eq "-1"){
               if($start_flg){ $start = $genectr;  }
               last;
            }
            #### Peak is right of this gene: try the next gene ###
            if($feature eq "1"){ next;  }

            push @{ $hash_peak_feature{$peak} }, {
               gene    => $arr_genes[$genectr],
               feature => $feature,
               dist    => $dist,
               index   => $index,
            };
            if($start_flg){ $start = $genectr;  $start_flg = 0;  }
         }
      }
   }

   # Print from the stored peak record instead of splitting the key on "_",
   # so chromosome names that contain underscores (e.g. chr1_random) stay intact.
   foreach my $peak (sort keys %hash_peak_feature){
      my $pk = $hash_peak_record{$peak};
      foreach my $target (@{ $hash_peak_feature{$peak} }){
         print $out_fh "$chr\t$pk->{start}\t$pk->{end}\t$pk->{peakpos}\t$pk->{w_score}\t$target->{gene}->{gene_name}\t";
         print $out_fh "$target->{feature}\t$target->{index}\t$target->{dist}\t";
         print $out_fh "$target->{gene}->{acc_name}\t$target->{gene}->{strand}\n";
      }
   }
}
close($out_fh) || die("Err closing $output_file\n");

# Read a tab-delimited refFlat-style gene table and group the gene records by
# chromosome (and by strand unless $no_strand is set).
#
# Columns read: gene name, accession, chromosome, strand, tx start, tx end,
# cds start, cds end, exon count, exon starts (comma list), exon ends (comma list).
#
# Arguments:
#   $file_refflat - path of the gene table
#   $ref_hash     - hash ref that is filled in place:
#                     $ref_hash->{chr}->{plus|minus} = [ gene, ... ]
#                   or, with $no_strand, $ref_hash->{chr} = [ gene, ... ]
#   $no_strand    - optional; when true every gene is treated as + strand
# Returns $ref_hash.  Dies if the file cannot be opened.
#
# For genes not on the + strand, start/end pairs are swapped and the exon list
# is reversed, so tx_start and exons[0] refer to the right-hand end of the gene.
#
# Line endings are stripped before splitting (otherwise the last column kept
# its newline), and blank or '#' comment lines are skipped instead of being
# turned into empty gene records.
sub read_refflat_by_chr_all{
    my($file_refflat, $ref_hash, $no_strand) = @_;

    open(my $ref_fh, "<", $file_refflat) || die("Err opening $file_refflat\n");
    while(my $line = <$ref_fh>)
     {
         $line =~ s/[\r\n]+$//;
         next if $line =~ /^geneName/; #hdr
         next if $line =~ /^\s*$/;     # blank line
         next if $line =~ /^#/;        # comment line
         my @arr = split(/\t/,$line);
         my $gene_name = $arr[0];
         my $acc_name = $arr[1];
         my $chr = $arr[2];
         my $strand = $arr[3];
         my $tx_start = $arr[4];
         my $tx_end = $arr[5];
         my $cds_start = $arr[6];
         my $cds_end = $arr[7];
         my $num_exons = $arr[8];
         my $utr5_start; my $utr5_end; my $utr3_start; my $utr3_end;
         if($strand eq "+" || $no_strand){
            $utr5_start = $tx_start;  $utr5_end = $cds_start - 1;
            $utr3_start = $cds_end + 1;  $utr3_end = $tx_end;
         } else {
            # Swap so that "start" is the right-hand coordinate of the gene.
            ($tx_start, $tx_end) = ($tx_end, $tx_start);
            ($cds_start, $cds_end) = ($cds_end, $cds_start);

            $utr5_start = $tx_start;  $utr5_end = $cds_start + 1;
            $utr3_start = $cds_end - 1;  $utr3_end = $tx_end;
         }
         my $gene;
         $gene->{gene_name} = $gene_name;
         $gene->{acc_name} = $acc_name;
         $gene->{strand} = $strand;
         $gene->{tx_start} = $tx_start;
         $gene->{tx_end} = $tx_end;
         $gene->{cds_start} = $cds_start;
         $gene->{cds_end} = $cds_end;
         $gene->{utr5_start} = $utr5_start;
         $gene->{utr5_end} = $utr5_end;
         $gene->{utr3_start} = $utr3_start;
         $gene->{utr3_end} = $utr3_end;
         $gene->{num_exons} = $num_exons;

         my @arr_exon = ();

         my @arr_exon_start = split(/\,/,$arr[9]);
         my @arr_exon_end = split(/\,/,$arr[10]);

         if($strand eq "+" || $no_strand){
            for(my $exonctr = 0; $exonctr < $num_exons; $exonctr ++)
             {
               $arr_exon[$exonctr]->{start} = $arr_exon_start[$exonctr];
               $arr_exon[$exonctr]->{end} = $arr_exon_end[$exonctr];
             }

         }else{
            # Reverse the exon order and swap each exon's start/end.
            for(my $exonctr = 0; $exonctr < $num_exons; $exonctr ++)
             {
               $arr_exon[$num_exons - $exonctr - 1]->{start} = $arr_exon_end[$exonctr];
               $arr_exon[$num_exons - $exonctr - 1]->{end} = $arr_exon_start[$exonctr];
             }
         }

         @{ $gene->{exons} } = @arr_exon;

         if ($no_strand) {
            push @{ $ref_hash->{$chr} }, $gene;
         } else {
            if($gene->{strand} eq "+"){
               push @{ $ref_hash->{$chr}->{plus} }, $gene;
            }else{
               push @{ $ref_hash->{$chr}->{minus} }, $gene;
            }
         }
     }
    close($ref_fh);
    return $ref_hash;
}

# Open a file for reading and return the file handle.
#   *.gz   files are read through `gzip -dc`
#   *.zip  files are read through `unzip -qc`
#   "-"    reads from STDIN
#   anything else is opened as a plain file
# Dies if the file does not exist or cannot be opened.
#
# Compared with the earlier version:
#   - the extension test is anchored (/\.gz$/), so names that merely contain
#     "gz" or "zip" somewhere are no longer sent through a decompressor;
#   - external commands are started with the list form of open, so the file
#     name is never interpreted by a shell;
#   - a fresh lexical handle is returned on every call, so two files can be
#     open at the same time (the shared FILE handle was clobbered before).
sub open_file_return_handle{
    my($filename) = @_;
    my $filehandle;
    if ($filename =~ /\.gz$/) {
       die("File does not exist: '$filename'") unless -e $filename;
       open($filehandle, "-|", "gzip", "-dc", $filename) || die("Err opening $filename");
    }
    elsif ($filename =~ /\.zip$/) {
       die("File does not exist: '$filename'") unless -e $filename;
       open($filehandle, "-|", "unzip", "-qc", $filename) || die("Err opening $filename");
    }
    elsif ($filename eq "-") {
        $filehandle = \*STDIN;
    }
    else {
        open($filehandle, "<", $filename) || die("Err opening $filename");
    }
    return $filehandle;
}

# Load peaks from a tab-delimited peak file into $ref_hash, grouped by chromosome.
#
# Arguments:
#   $file_pk        - peak file (opened through open_file_return_handle)
#   $peak_threshold - rows whose filter column is missing or below this value are skipped
#   $ref_hash       - hash ref filled in place: $ref_hash->{chr} = [ peak, ... ]
#   $filterCol      - 0-based column used for thresholding (default 3)
# Returns the number of peaks kept.
#
# The first line of the file is always discarded as a header.  Columns read:
# 0 chr, 1 start, 2 end, 3 w_score, 4 unw_score, 6 peakpos, 9 ID,
# 10 sum_pzscore, 11 sum_nreads.
sub read_seq_peaks{
    my($file_pk, $peak_threshold, $ref_hash, $filterCol) = @_;
    if(!defined($filterCol)){ $filterCol = 3; }

    my $FH = &open_file_return_handle($file_pk);
    my $header = <$FH>;      # header line, not used

    my $n_kept = 0;
    while(defined(my $line = <$FH>)) {
        $line =~ s/\n//;
        $line =~ s/\r//;
        my @col = split(/\t/, $line);

        my $score = $col[$filterCol];
        next if(!defined($score) || $score < $peak_threshold);

        $n_kept++;
        my ($start, $end) = @col[1,2];
        my %peak = (
            start       => $start,
            end         => $end,
            unw_score   => $col[4],
            w_score     => $col[3],
            center      => ($start+$end)/2,
            peakpos     => $col[6],
            sum_pzscore => $col[10],
            sum_nreads  => $col[11],
            data        => $line,
            ID          => $col[9],
        );
        push @{ $ref_hash->{$col[0]} }, \%peak;
    }
    close($FH);
    return $n_kept;
}

# Classify a peak position relative to one gene record.
#
# Arguments:
#   $pos  - peak position
#   $gene - gene record with strand, tx_start, tx_end, num_exons and exons
#           (for "-" strand records tx_start is the right-hand coordinate and
#           the exon list runs from right to left)
#   $ups  - size of the upstream window
#   $ds   - size of the downstream window
#
# Returns:
#   -1                          peak lies left of the gene window
#    1                          peak lies right of the gene window
#   ("ups",    distance, "")    peak is in the upstream window (distance to tx_start)
#   ("ds",     distance, "")    peak is in the downstream window (distance to tx_end)
#   ("exon",   distance, n)     peak is in exon n   (distance to tx_start)
#   ("intron", distance, n)     peak is in intron n (distance to tx_start)
# Prints an error and exits with status 1 if the peak is inside the gene window
# but matches no feature.
sub assign_peak_to_gene_feature{
    my($pos, $gene, $ups,  $ds) = @_;

    my $tss = $gene->{tx_start};
    my $tes = $gene->{tx_end};

    if($gene->{strand} eq "+")
    {
       # Outside the window [tx_start - ups, tx_end + ds]?
       return -1 if( ($tss - $ups) > $pos );
       return 1  if( $pos > ($tes + $ds) );

       # Inside the window: flanks first, then the gene body.
       if( $pos <= $tss ){ return ("ups", abs($tss-$pos), ""); }
       if( $tes <= $pos ){ return ("ds", abs($tes-$pos), ""); }

       my $exons = $gene->{exons};
       my $i = 0;
       while( $i < $gene->{num_exons}-1 ){
          my $this_exon = $exons->[$i];
          if( $this_exon->{start} <= $pos && $pos <= $this_exon->{end} ){
             return ("exon", abs($tss-$pos), $i+1);
          }
          if( $this_exon->{end} < $pos && $pos < $exons->[$i+1]->{start} ){
             return ("intron", abs($tss-$pos), $i+1);
          }
          $i++;
       }
       # The last exon is not covered by the loop.
       if( $exons->[$i]->{start} <= $pos && $pos <= $exons->[$i]->{end} ){
          return ("exon", abs($tss-$pos), $i+1);
       }

       print "Err....Unable to assign peak $pos to gene $gene->{gene_name} features\n";  exit(1);
    }
    if($gene->{strand} eq "-")
    {
       # Outside the window [tx_end - ds, tx_start + ups]?
       return -1 if( ($tes - $ds) > $pos );
       return 1  if( $pos > ($tss + $ups) );

       if( $tss <= $pos ){ return ("ups", abs($tss-$pos), ""); }
       if( $pos <= $tes ){ return ("ds", abs($tes-$pos), ""); }

       # Exon coordinates run from high to low here, so the comparisons are mirrored.
       my $exons = $gene->{exons};
       my $i = 0;
       while( $i < $gene->{num_exons}-1 ){
          my $this_exon = $exons->[$i];
          if( $this_exon->{start} >= $pos && $pos >= $this_exon->{end} ){
             return ("exon", abs($tss-$pos), $i+1);
          }
          if( $this_exon->{end} > $pos && $pos > $exons->[$i+1]->{start} ){
             return ("intron", abs($tss-$pos), $i+1);
          }
          $i++;
       }
       if( $exons->[$i]->{start} >= $pos && $pos >= $exons->[$i]->{end} ){
          return ("exon", abs($tss-$pos), $i+1);
       }

       print "Err....Unable to assign peak $pos to gene $gene->{gene_name} features\n";  exit(1);
   }
}

gen-peak-feature-counts.pl

#!/usr/bin/perl

###### Script to analyze the output of the map_peak_to_gene_feature.pl script  #####
###### Output is a count of the number of peaks that are core, ups, intron, exon, ds or intergenic only ####
###### A peak is assigned to one and only one feature based on the following hierarchy: ####
###### core > ups > intron > exon > ds > intergenic #######
###### The user can change what qualifies as intergenic based on the parameter $ups ####
###### However, the upper limit of $ups and $ds is set by the map_peak_to_gene_feature_v2.pl script #####


use strict;


# Main program: assign every peak above the threshold to exactly one feature
# class and print the number and percentage of peaks per class to STDOUT.
# With <print_peak_flag> set, the peaks of each class are also written to
# "<file.map>.<class>" track files.

# The first seven arguments are required; <print_peak_flag> is optional.
# (The old check only caught the case of no arguments at all.)
if(scalar(@ARGV) < 7){
    print "Usage: ./gen-peak-feature-counts.pl <file.pk> <pk_threshold> <file.map> <ups> <core_ups> <core_ds> <ds> <print_peak_flag> > <output file>\n";
	exit(1);
}

my $file_peak = $ARGV[0];
my $pk_threshold = $ARGV[1];
my $file_map = $ARGV[2];
my $ups = $ARGV[3];
my $core_ups = $ARGV[4];
my $core_ds = $ARGV[5];
my $ds = $ARGV[6];
my $print_peak_flag = $ARGV[7];

# Feature classes in ascending priority; a class's index is its position + 1,
# and a peak ends up in the highest-priority class it was mapped to.
my @feature_names = ("intergenic", "ds", "exon", "intron", "ups", "core");
my %hash_feature_index = ();
for(my $ctr = 0; $ctr <= $#feature_names; $ctr ++){
    $hash_feature_index{$feature_names[$ctr]} = $ctr + 1;
}

my %hash_peak_feature = ();   # peak key -> feature index
my %hash_peak_fields = ();    # peak key -> [chr, start, end, peakpos, score]
my @arr_feature_count = (0) x (scalar(@feature_names) + 1);
my $total_num_pk = 0;

###### Initialize all peaks as intergenic ####
open(my $pk_fh, "<", $file_peak) || die("Err opening $file_peak\n");
my $pk_header = <$pk_fh>;     # header line, not used
while(my $line = <$pk_fh>)
{
    $line =~ s/\n//; $line =~ s/\r//;
    my @arr = split(/\t/,$line);
    if($#arr < 6){ print "Incorrect peak file format\n"; exit(1);  }
    if($arr[0] eq "" || $arr[3] eq "" || $arr[6] eq ""){ print "Incorrect peak file format\n";  exit(1);  }

    if($arr[3] < $pk_threshold){ next;  }
    my @fields = @arr[0,1,2,6,3];
    my $peak = join("-", @fields);
    $hash_peak_feature{$peak} = 1;
    $hash_peak_fields{$peak} = \@fields;
}
close($pk_fh);

###### Upgrade each peak to the highest-priority feature found in the map file ####
open(my $map_fh, "<", $file_map) || die("Err opening $file_map\n");
while(my $line = <$map_fh>)
{
	$line =~ s/\r//; $line =~ s/\n//;
	my @arr = split(/\t/,$line);

	if($#arr < 8){ print "Incorrect map file format: 9 cols expected\n"; exit(1);  }

	my $peak = join("-", @arr[0,1,2,3,4]);
	# Ignore map entries for peaks that did not pass the threshold.
	if(!exists($hash_peak_feature{$peak})){ next;  }

	my $feature = $arr[6];
	my $dist = $arr[8];
	if($feature eq "ups"){
		if($dist <= $core_ups){ $feature = "core"; }
		elsif($dist <= $ups){ $feature = "ups"; }
		else{ $feature = "intergenic";  }
	}
	# Exon/intron hits close to the transcription start count as core.
	if($feature eq "exon" || $feature eq "intron"){ if($dist <= $core_ds){ $feature = "core"; }  }
	if($feature eq "ds"){
	    if($dist > $ds){ $feature = "intergenic"; }
	}
	my $feature_index = $hash_feature_index{$feature};
	if(!defined($feature_index)){ next;  }    # unknown feature label
	if($hash_peak_feature{$peak} < $feature_index){ $hash_peak_feature{$peak} = $feature_index;  }
}
close($map_fh);

###### Optional per-class track files, indexed by feature index ####
my @arr_track_fh = ();
if($print_peak_flag){
    for(my $ctr = 1; $ctr <= scalar(@feature_names); $ctr ++){
        my $name = $feature_names[$ctr-1];
        my $file = $file_map.".".$name;
        open(my $fh, ">", $file) || die("Err opening $file\n");
        print $fh "track name=\"$name peaks\"\n";
        $arr_track_fh[$ctr] = $fh;
    }
}

foreach my $peak (sort keys %hash_peak_feature)
{
	my $feature_index = $hash_peak_feature{$peak};
   	$arr_feature_count[$feature_index]++;
   	$total_num_pk ++;

   	if($print_peak_flag){
   	    # Use the stored fields rather than splitting the key on "-", which
   	    # breaks when a value itself contains "-" (e.g. a score such as 1e-5).
   	    my ($chr, $start, $end, $peakpos, $score) = @{ $hash_peak_fields{$peak} };
   	    print { $arr_track_fh[$feature_index] } "$chr\t$start\t$end\t$score\t$score\t+\t$peakpos\t$peakpos\n";
   	}
}
if($print_peak_flag){
    for(my $ctr = 1; $ctr <= scalar(@feature_names); $ctr ++){
        close($arr_track_fh[$ctr]);
    }
}

print "threshold\t$pk_threshold\t$pk_threshold\n";
print "total\t$total_num_pk\t100\n";
# Always report all six classes; classes without peaks are printed with a
# count of 0 (they used to be printed with an empty count or not at all).
for(my $ctr = 1; $ctr <= scalar(@feature_names); $ctr ++)
{
  my $count = $arr_feature_count[$ctr];
  my $percent = ($total_num_pk > 0) ? 100*($count/$total_num_pk) : 0;
  printf ("%s\t%d\t%0.2f\n", $feature_names[$ctr-1], $count, $percent);
}


