# Python source code used for core analyses. The intention of this code is to give other researchers a protocol for developing similar tools within their own coding environments.
# Code here takes advantage of BCBio / Biopython:
# https://github.com/chapmanb/bcbb/tree/master/gff
# http://biopython.org

# Function that computes the ratio between datasets at every ORF position as well as the pause score. 
# Inputs: 
# counts - Dictionary, keyed by chromosome, of ribosome profiling density at each position in WT strain
# counts1 - Dictionary, keyed by chromosome, of ribosome profiling density at each position in mutant strain
# GFFlist - GFF formatted annotation for genome
# utrtable - Table of 3'UTR annotations - only used to eliminate overlapping genes
# utrtable2 - Table of 5'UTR annotations - only used to eliminate overlapping genes
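# shift - Amount by which the position of the read is shifted with respect to sequence (passed on to seqtools.givegene; typically 0, the function signature has no default). Reported genomic positions have this shift removed again.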
# halfwindow - Controls window size when comparing reads at a position, allowing for multiple reads to be combined.
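# ratiothresh - Minimal ratio of KO to WT reads (reads1/reads) at a position to be counted in total.
# countsthresh - List of 2 giving minimal density (in rpm), summed over the window, at each position in WT and KO required to be counted. A negative WT value is used, with its sign flipped, as a pseudocount for windows without WT reads.
# pausethresh - List of 2 giving minimal pause level in WT and KO to be counted.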
# avgthresh - Threshold of minimal reads required in the gene (list of 2 for WT and KO).


def findpauseratio(counts,counts1,GFFlist,utrtable,utrtable2,shift,halfwindow,ratiothresh,countsthresh,pausethresh,avgthresh):
	"""Compare two ribosome profiling datasets at every position of every usable gene.

	For each feature accepted by seqtools.givegene, reads are summed over a
	window of 2*halfwindow+1 positions in both datasets. For every window the
	pause score of each dataset (window reads divided by window size times the
	average density of the gene) and the ratio reads1/reads are computed.
	Positions that pass all thresholds are collected.

	Inputs:
	counts - Dictionary, keyed by chromosome, of density at each position in WT strain.
	counts1 - Same for the mutant strain.
	GFFlist - Annotation keyed by chromosome; each value provides a .features list.
	utrtable, utrtable2 - 3'UTR and 5'UTR tables, only passed on to seqtools.givegene.
	shift - Shift passed on to seqtools.givegene; it is removed again from the
		genomic position that is reported.
	halfwindow - Number of positions added on each side of a position.
	ratiothresh - Minimal reads1/reads ratio for a position to be reported.
	countsthresh - List of 2, minimal window reads in counts and counts1. A
		negative first entry is used (sign flipped) as pseudocount for windows
		without reads in counts.
	pausethresh - List of 2, minimal pause score in counts and counts1.
	avgthresh - List of 2, minimal average density of the gene in counts and counts1.

	Returns:
	List of rows. The first row holds the column headers; every further row is
	[feature id, alias, chrom, feat_num, genomic position, mRNA position,
	reads, reads1, pause, pause1, ratio, gene sequence].

	Raises:
	SystemExit (status 1) if seqtools.givegene returns a gene of length 0.
	"""
	missedcountsthresh=0
	missedratiothresh=0
	missedpausethresh=0
	missedavgthresh=0
	illegalgenes=0
	positionstotal=0
	# The first row carries the column headers of the hit list.
	hits=[["headers","alias","chrom","feat_num","chrompos","mrnaposition","reads","reads1","pause","pause1","ratioreads","seq"]]
	windowsize=2*halfwindow+1

	for chrom in GFFlist:
		print(chrom)
		for feat_num,feature in enumerate(GFFlist[chrom].features):
			# Import sequence and counts of the gene as lists.
			gg=seqtools.givegene(chrom,feat_num,[GFFlist,utrtable2,utrtable],counts,[0,0,shift],2)
			genecounts=gg[0]
			genesequence=gg[1]

			# Get rid of dubious genes, nongenes, genes with overlap of others (signalled by -1 / -2).
			if (genesequence==-1 or genecounts==-1 or genesequence==-2 or genecounts==-2):
				illegalgenes+=1
				continue

			# NOTE(review): the result for counts1 is not checked for -1 / -2; presumably
			# rejection depends on the annotation only - confirm against seqtools.givegene.
			gg1=seqtools.givegene(chrom,feat_num,[GFFlist,utrtable2,utrtable],counts1,[0,0,shift],2)
			genecounts1=gg1[0]

			genelength=len(genecounts)
			if genelength==0:
				print("Error, gene length is 0 for gene "+feature.id)
				# Fatal error: leave with a non-zero status instead of the
				# interactive exit() helper, which reports success (status 0).
				raise SystemExit(1)

			# Average density of the gene. float() forces true division so that
			# integer counts are not truncated under Python 2.
			avglevel=sum(genecounts)/float(genelength)
			avglevel1=sum(genecounts1)/float(len(genecounts1))
			if avglevel<avgthresh[0] or avglevel1<avgthresh[1]:
				missedavgthresh+=1
				continue
			# With an average of 0 the pause score is undefined (division by zero).
			pausedefined=(avglevel!=0 and avglevel1!=0)

			# "Alias" is tried first, then "Name" (E. coli genome), then "external_name" (S. pombe genome).
			alias="NA"
			for key in ("Alias","Name","external_name"):
				if key in feature.qualifiers:
					alias=feature.qualifiers[key][0]
					break

			for i in range(halfwindow,genelength-halfwindow):
				# Reads in the window around position i.
				reads=sum(genecounts[i-halfwindow:i+halfwindow+1])
				reads1=sum(genecounts1[i-halfwindow:i+halfwindow+1])
				# A negative threshold doubles as pseudocount for empty windows.
				if reads==0 and countsthresh[0]<0:
					reads=-countsthresh[0]
				if reads<countsthresh[0] or reads1<countsthresh[1]:
					missedcountsthresh+=1
					continue

				# Pause score: window reads relative to the expectation from the gene average.
				if not pausedefined:
					# No defined pause score, so the position cannot meet the pause threshold.
					missedpausethresh+=1
					continue
				pause=reads/(windowsize*avglevel)
				pause1=reads1/(windowsize*avglevel1)
				if pause<pausethresh[0] or pause1<pausethresh[1]:
					missedpausethresh+=1
					continue

				# Ratio of the second dataset over the first; -1 marks an undefined ratio.
				if reads==0:
					ratio=-1
				else:
					ratio=reads1/float(reads)
				if ratio<ratiothresh:
					missedratiothresh+=1
					continue

				positionstotal+=1

				# Report the genomic position without the shift.
				genomicpos=seqtools.convertmrnatogenomic(i,chrom,feat_num,GFFlist)
				if feature.strand==1:
					genomicpos-=shift
				else:
					genomicpos+=shift

				hits.append([feature.id,alias,chrom,feat_num,genomicpos,i,reads,reads1,pause,pause1,ratio,genesequence])

	print("Positions below threshold for counts, ratio, pause, median = "+str(missedcountsthresh)+", "+str(missedratiothresh)+", "+str(missedpausethresh)+", "+str(missedavgthresh))
	print("Genes dropped by givegene (overlap, undesirable features, etc.) = "+str(illegalgenes))
	print("positions included="+str(positionstotal))
	return hits






# This function will take a list with gene names and positions and a window over which to average and then output the average as a binary file. Each position is equally weighted in the average.

# Inputs: 
# genelist - Path to a csv file that is parsed with listavg.readindict into one row per gene name. Entries 1, 2 and 4 of each row give the chromosome, the feature number and the position of interest within the mRNA.
# readcounts - Dictionary, keyed by chromosome, of ribosome profiling density at each position
# GFFlist - GFF formatted annotation for genome
# riboshift - Amount by which the position of the read is shifted with respect to sequence (passed on to seqtools.givegene).
# seqwin - List of 2 controlling window size (5' and 3') around position of interest to be averaged.
# outfilestring - Base of output file name.
# thresh - Threshold of minimal reads required in the region of interest to be included in average.

def makeposavgSHARE(genelist,GFFlist,seqwin,readcounts,outfilestring,riboshift,thresh):
	"""Average the read profile around a list of positions and write it to a binary file.

	For every gene in genelist the reads in the window from seqwin[0] positions
	before to seqwin[1] positions after the position of interest are extracted.
	Each window is normalised by its own sum, so every position of interest has
	the same weight, and the normalised windows are averaged.

	Inputs:
	genelist - Path of the csv file, parsed with listavg.readindict. Entries 1,
		2 and 4 of each row are read as chromosome, feature number and mRNA position.
	GFFlist - Annotation, passed on to seqtools.givegene.
	seqwin - List of 2 giving the window size before and after the position.
	readcounts - Read density, passed on to seqtools.givegene.
	outfilestring - Name of the output file.
	riboshift - Shift passed on to seqtools.givegene.
	thresh - The summed reads of a window must exceed this value to be included.

	Returns:
	[count, zerocount, 0]: number of windows averaged, number of windows at or
	below thresh, and a third entry that is always 0. No file is written when
	count is 0.
	"""
	# Never incremented; kept so the shape of the return value is unchanged.
	zeroORFcount=0
	windowlength=seqwin[0]+seqwin[1]
	count=0
	zerocount=0
	windows=[]

	# "with" closes the csv handle even if parsing fails.
	with open(genelist) as f_csv:
		pausedict=listavg.readindict(f_csv)
		rows=[pausedict[genename] for genename in pausedict if genename!="headers"]

	for row in rows:
		chrom=row[1]
		feat_num=int(row[2])
		mrnaposition=int(row[4])

		# Import the counts of the gene as a list; -1 / -2 signal a rejected gene.
		gg=seqtools.givegene(chrom,feat_num,GFFlist,readcounts,[0,0,riboshift],1)
		genecounts=gg[0]
		if genecounts==-1 or genecounts==-2:
			continue

		# Skip positions whose window would reach beyond either end of the gene.
		if mrnaposition<seqwin[0] or (len(genecounts)-mrnaposition)<seqwin[1]:
			continue

		loccounts=genecounts[mrnaposition-seqwin[0]:mrnaposition+seqwin[1]]
		if sum(loccounts)>thresh:
			count+=1
			windows.append(loccounts)
		else:
			zerocount+=1

	if count==0:
		print("No genes to average.")
		return [count,zerocount,0]

	# Make average. Each window is divided by its own sum (equal weighting).
	avgcounts=[0.0]*windowlength
	for loccounts in windows:
		genesum=sum(loccounts)
		if genesum!=0:
			total=float(genesum)
			for i in range(windowlength):
				avgcounts[i]+=loccounts[i]/total
	avgcounts=[value/count for value in avgcounts]

	# One native-format float ("f") per window position.
	with open(outfilestring,"wb") as f2:
		for value in avgcounts:
			f2.write(struct.pack("f",float(value)))
	return [count,zerocount,zeroORFcount]
	
	
	
	
	
	
	
	
	
	
	