##################################
##	16/10/5 by Sohyun	##
##################################

## Input: Ensembl Chicken annotation
INFILE = "Ensembl_Chicken_mRNA.gtf"
OUTFILE = "Chicken_Intron.gtf"

###### parsing using Class ##############
class cGTF:
    """One record (line) of a tab-separated GTF annotation file.

    The attributes hold placeholder values until ``parse_gtf_line`` is
    called, after which they mirror the nine tab-separated columns of the
    parsed line.
    """

    def __init__(self):
        self.sChrNum = "NULL"
        self.sSource = "NULL"
        self.sFeature = "NULL"
        self.nChr_start = 0
        self.nChr_end = 0
        # Older spelling of the end coordinate.  It is kept as an alias and
        # always mirrors ``nChr_end`` so that any reader of either name sees
        # the same value.
        self.nChr_End = 0
        self.sScore = "NULL"
        self.sStrand = "NULL"
        self.nFrame = "NULL"
        self.sAttribute = "NULL"

    def parse_gtf_line(self, sReadLine):
        """Fill this record from one GTF line.

        sReadLine -- a single line with at least nine tab-separated columns.
                     The line is not stripped, so ``sAttribute`` keeps any
                     trailing newline that the line carried.

        Raises IndexError when the line has fewer than nine columns and
        ValueError when column 4 or 5 is not an integer.  Nothing on the
        record is modified when either error is raised.
        """
        fields = sReadLine.split("\t")

        # Read everything that can fail before touching the instance, so a
        # bad line never leaves the record half updated.
        sAttribute = fields[8]
        nStart = int(fields[3])
        nEnd = int(fields[4])

        self.sChrNum = fields[0]
        self.sSource = fields[1]
        self.sFeature = fields[2]
        self.nChr_start = nStart
        self.nChr_end = nEnd
        self.nChr_End = nEnd
        self.sScore = fields[5]
        self.sStrand = fields[6]
        self.nFrame = fields[7]
        self.sAttribute = sAttribute

#########################################


def Bring_exon_position(gtf_lines=None):
    """Collect the distinct exon spans of every gene in a GTF file.

    gtf_lines -- optional iterable of GTF lines.  When omitted, the
                 module-level name ``infile`` is read, which is how the
                 function was originally driven.

    Returns a dict whose keys are "chromosome<TAB>strand<TAB>gene_id" and
    whose values are lists of "start_end" strings, one per distinct exon
    span, ordered numerically by start and then end.

    Blank lines and lines starting with "#" are skipped.  The number of
    distinct gene ids seen on exon lines is printed.

    Raises ValueError when an exon line carries no gene_id attribute.
    """
    if gtf_lines is None:
        gtf_lines = infile

    exon_spans = {}  # key -> set of (start, end) integer pairs
    gene_ids = set()

    for nLine, sLine in enumerate(gtf_lines, 1):
        if not sLine.strip() or sLine.startswith("#"):
            continue

        cReadGTF = cGTF()
        cReadGTF.parse_gtf_line(sLine)
        if cReadGTF.sFeature != "exon":
            continue

        # Look the gene id up afresh for every line so that a value from a
        # previous line can never leak into this one.
        GeneID = None
        for sAttribute in cReadGTF.sAttribute.strip().split(";"):
            sName, _, sValue = sAttribute.strip().partition(" ")
            if sName == "gene_id":
                GeneID = sValue.strip().strip('"')
                break
        if GeneID is None:
            raise ValueError("exon on line %d has no gene_id attribute" % nLine)

        gene_ids.add(GeneID)
        sKey = cReadGTF.sChrNum + "\t" + cReadGTF.sStrand + "\t" + GeneID
        exon_spans.setdefault(sKey, set()).add(
            (cReadGTF.nChr_start, cReadGTF.nChr_end))

    # Sort once per gene, on the integer coordinates: sorting the formatted
    # strings would put "1000_2000" before "200_300".
    Dic_Exon = {}
    for sKey in exon_spans:
        Dic_Exon[sKey] = ["%d_%d" % span for span in sorted(exon_spans[sKey])]

    print("sGene_total : " + str(len(gene_ids)))
    return Dic_Exon


def _parse_exon_span(sExon_position):
    """Turn a "start_end" string into a (start, end) tuple of integers."""
    sFields = sExon_position.strip().split("_")
    return int(sFields[0]), int(sFields[1])


def Sum_Exon_position_for_AS(Dic_Exon):
    """Run one merging pass over the exon spans of every gene.

    Dic_Exon -- dict mapping a gene key to a list of "start_end" strings.
                Every span is assumed to satisfy start <= end.

    Each exon is compared with every exon of the same gene (itself
    included).  Two exons overlap when they share at least one coordinate;
    exons that merely abut (end + 1 == next start) do not.  An overlapping
    pair yields the union of the two spans, a non-overlapping pair yields
    the exon itself, and the longest candidate replaces the exon.  When
    several candidates share the longest length, the one produced by the
    last compared exon is kept.

    A single pass only joins an exon with one partner, so a chain of
    overlapping exons needs repeated calls until the result stops changing.

    Returns a new dict with the same keys; each value lists the distinct
    replacement spans in the order they were first produced.  The number of
    overlapping comparisons (self comparisons included) is printed.
    """
    sExon_dic_SumExon = {}
    Overlap_num = 0

    for sKeys in Dic_Exon.keys():
        # Parse every span once instead of once per pairwise comparison.
        spans = [_parse_exon_span(sExon) for sExon in Dic_Exon[sKeys]]
        merged_list = sExon_dic_SumExon.setdefault(sKeys, [])

        for nExon_start, nExon_End in spans:
            best_span = None
            best_length = None

            for nStart_compared, nEnd_compared in spans:
                if nExon_start <= nEnd_compared and nStart_compared <= nExon_End:
                    candidate = (min(nExon_start, nStart_compared),
                                 max(nExon_End, nEnd_compared))
                    Overlap_num += 1
                else:
                    candidate = (nExon_start, nExon_End)

                nLength = candidate[1] - candidate[0]
                # ">=" so that a later candidate of equal length wins.
                if best_length is None or nLength >= best_length:
                    best_span = candidate
                    best_length = nLength

            sBest = "%d_%d" % best_span
            if sBest not in merged_list:
                merged_list.append(sBest)

    print(Overlap_num)
    return sExon_dic_SumExon



def Make_intron_line(Dic_ExonStart, Dic_ExonEnd):
    """Derive intron coordinates from the gaps between consecutive exons.

    Dic_ExonStart -- dict mapping "chromosome<TAB>strand<TAB>gene_id" to a
                     list of exon start positions.
    Dic_ExonEnd   -- dict with the same keys mapping to the matching exon
                     end positions; entry i of both lists describes one exon.

    The lists are used in the order given: exon i + 1 is expected to lie
    after exon i.  This function does not sort them.

    For every pair of neighbouring exons the intron runs from end + 1 of
    the first to start - 1 of the second.  Pairs that leave no room for an
    intron (the exons abut or overlap) are reported on stdout and skipped.
    Keys whose two lists differ in length are reported and skipped.

    Returns (Dic_Intron_StartEnd, Intron_list):
      Dic_Intron_StartEnd -- gene key -> list of "start<TAB>end" strings;
                             keys with exactly one exon are left out
      Intron_list         -- all of those strings in one flat list

    Raises KeyError when a key of Dic_ExonStart is missing from Dic_ExonEnd.
    """
    multi_exon_genes = set()
    single_exon_genes = set()

    Dic_Intron_StartEnd = {}
    Intron_list = []

    for sKeys in Dic_ExonStart.keys():
        exon_starts = Dic_ExonStart[sKeys]
        exon_ends = Dic_ExonEnd[sKeys]

        if len(exon_starts) != len(exon_ends):
            print("ERROR: %d exon starts but %d exon ends for %r"
                  % (len(exon_starts), len(exon_ends), sKeys))
            continue

        sGeneID = sKeys.strip().split("\t")[2]

        if len(exon_starts) == 1:
            single_exon_genes.add(sGeneID)
            continue

        multi_exon_genes.add(sGeneID)
        gene_introns = Dic_Intron_StartEnd.setdefault(sKeys, [])

        for nNumber in range(len(exon_starts) - 1):
            nIntron_start = int(exon_ends[nNumber]) + 1
            nIntron_end = int(exon_starts[nNumber + 1]) - 1
            sIntron_position = str(nIntron_start) + "\t" + str(nIntron_end)

            if nIntron_end < nIntron_start:
                # No room between the two exons: report the would-be
                # coordinates together with the exon lists, then skip.
                print(sIntron_position)
                print(exon_starts)
                print(exon_ends)
                continue

            gene_introns.append(sIntron_position)
            Intron_list.append(sIntron_position)

    print("Two or more exons : " + str(len(multi_exon_genes)))
    print("Only one exon : " + str(len(single_exon_genes)))
    return Dic_Intron_StartEnd, Intron_list


		
def Make_intronID(Intron_list):
    """Assign a "SOHYUNI<n>" identifier to every distinct intron position.

    Intron_list -- list of hashable intron position values; duplicates are
                   allowed and share one identifier.

    Returns a dict mapping position -> identifier.  Identifiers are numbered
    from 1 in order of first appearance in Intron_list, so the same input
    always yields the same numbering.

    NOTE(review): the positions produced elsewhere in this file are
    "start<TAB>end" strings without a chromosome, so introns with identical
    coordinates on different chromosomes would share an identifier here.
    Confirm whether that is intended.
    """
    IntronID_dic = {}  # Key: position / Value: IntronID
    for sIntron_position in Intron_list:
        if sIntron_position not in IntronID_dic:
            IntronID_dic[sIntron_position] = "SOHYUNI" + str(len(IntronID_dic) + 1)
    return IntronID_dic

def Writing_Intron(Dic_Intron_StartEnd, IntronID_dic):
    """Write every intron as one GTF-style line to the file named by OUTFILE.

    Dic_Intron_StartEnd -- dict mapping "chromosome<TAB>strand<TAB>..." keys
                           to lists of "start<TAB>end" position strings.
    IntronID_dic        -- dict mapping each position string to its intron
                           identifier.

    Each line carries the chromosome, the literal source "ensembl", the
    feature "intron", the position, "." for score and frame, the strand, and
    the attributes gene_id ("<id>_I"), intron_number (the zero-based index
    of the intron within its gene's list) and intron_id.

    Raises KeyError when a position has no entry in IntronID_dic.
    """
    # The "with" block guarantees the file is flushed and closed even when a
    # lookup fails part-way through.
    with open(OUTFILE, "w") as outfile:
        for sKeys in Dic_Intron_StartEnd.keys():
            sKey_list = sKeys.strip().split("\t")
            sChr = sKey_list[0]
            sStrand = sKey_list[1]

            for nIntron_number, sIntronPosition in enumerate(Dic_Intron_StartEnd[sKeys]):
                sIntronID = IntronID_dic[sIntronPosition]
                outfile.write(
                    sChr + "\tensembl\tintron\t"
                    + sIntronPosition + "\t"
                    + ".\t" + sStrand + "\t" + ".\t"
                    + 'gene_id "' + sIntronID + '_I"; '
                    + 'intron_number "' + str(nIntron_number) + '"; '
                    + 'intron_id "' + sIntronID + '"; '
                    + "\n")


def Make_Exon_StartandEnd(Dic_Exon):
    """Split "start_end" exon strings into parallel start and end lists.

    NOTE(review): the pipeline calls this function but no definition exists
    in this file.  This version is reconstructed from how Make_intron_line
    uses the result (two dicts with the same keys, integer positions, entry
    i of both lists describing one exon, exons in ascending order).  Confirm
    it matches the original intent.

    Dic_Exon -- dict mapping a gene key to a list of "start_end" strings.

    Returns (Dic_ExonStart, Dic_ExonEnd).
    """
    Dic_ExonStart = {}
    Dic_ExonEnd = {}
    for sKeys in Dic_Exon:
        spans = []
        for sExon_position in Dic_Exon[sKeys]:
            sStart, sEnd = sExon_position.strip().split("_")
            spans.append((int(sStart), int(sEnd)))
        # Numeric order, so that neighbouring list entries are neighbouring exons.
        spans.sort()
        Dic_ExonStart[sKeys] = [span[0] for span in spans]
        Dic_ExonEnd[sKeys] = [span[1] for span in spans]
    return Dic_ExonStart, Dic_ExonEnd


def main():
    """Run the whole pipeline: read exons, merge overlaps, write introns."""
    # Bring_exon_position() reads the module-level name ``infile``, so the
    # handle has to be published as a global rather than kept local.
    global infile
    infile = open(INFILE, "r")
    try:
        Dic_Exon = Bring_exon_position()
    finally:
        infile.close()

    # One merging pass joins an exon with a single partner only, so keep
    # going until a pass changes nothing.  A fixed number of rounds can
    # leave long chains of overlapping exons partly merged.
    nRound = 0
    while True:
        nRound += 1
        print("round " + str(nRound))
        sExon_dic_SumExon = Sum_Exon_position_for_AS(Dic_Exon)
        if sExon_dic_SumExon == Dic_Exon:
            break
        Dic_Exon = sExon_dic_SumExon

    Dic_ExonStart, Dic_ExonEnd = Make_Exon_StartandEnd(sExon_dic_SumExon)
    Dic_Intron_StartEnd, Intron_list = Make_intron_line(Dic_ExonStart, Dic_ExonEnd)

    IntronID_dic = Make_intronID(Intron_list)

    Writing_Intron(Dic_Intron_StartEnd, IntronID_dic)


if __name__ == "__main__":
    main()