##############################
#Subroutines
##############################
def NULLELIM(cvs):
	"""Return the entries of cvs that contain at least one word character.

	Entries that are empty, or that hold only whitespace/punctuation, are
	dropped.  The order of the remaining entries is preserved and the input
	list itself is not modified.

	Parameters:
		cvs -- sequence of strings

	Returns:
		A new list with the non-blank entries of cvs.
	"""
	# A single word character anywhere in the entry is what the former
	# pattern ('.*' + word chars + '.*') tested for; searching for it directly
	# gives the same answer without backtracking, and the raw string avoids
	# the invalid-escape warning of a plain string literal.
	has_word_char = re.compile(r'\w').search
	return [entry for entry in cvs if has_word_char(entry)]

def JUDGEIDENTITY(tmp_array,clustalpath,test,stp_seq,cutnum):
	"""Align FASTA records with ClustalW and test identity against a cutoff.

	The records in tmp_array are written to "temporary.fasta" in the current
	working directory and aligned by running the executable at clustalpath.
	The resulting "temporary.aln" is converted to FASTA (overwriting
	"temporary.fasta") and read back.  Aligned records whose id matches the
	header of stp_seq form the reference; every other record is appended to
	the comparison sequence.  Identity is the number of equal aligned
	positions divided by the aligned reference length, as a percentage.

	Parameters:
		tmp_array   -- iterable of FASTA record strings to align
		clustalpath -- path of the ClustalW executable (must be an existing file)
		test        -- not used; kept so existing callers keep working
		stp_seq     -- FASTA record string of the reference; its first line
		               must contain ">" followed by the label
		cutnum      -- identity threshold in percent

	Returns:
		1 if the percent identity is strictly greater than cutnum, else 0.

	Raises:
		AssertionError if clustalpath is not an existing file.
		IndexError if the first line of stp_seq contains no ">".
		ValueError if no aligned record matches the reference label.
	"""
	# Fail before touching any file if the aligner is missing.
	assert os.path.isfile(clustalpath), "ClustalW executable not found: %s" % clustalpath

	# Label = text after ">" on the first line of the reference record.
	stp_label = stp_seq.split("\n")[0].split(">")[1]
	# Biopython's sequence parsers use the first whitespace-delimited word of
	# a header as record.id, so a header with a description would never equal
	# the full label; accept a match on that first word as well.
	label_words = stp_label.split()
	stp_id = label_words[0] if label_words else stp_label

	# Remove leftovers of a previous run so a stale alignment can never be
	# mistaken for the result of this one.
	for stale in ("temporary.fasta", "temporary.aln"):
		if os.path.isfile(stale):
			os.remove(stale)

	with open("temporary.fasta","w") as tmp_output:
		for entry in tmp_array:
			tmp_output.write(entry)
			tmp_output.write("\n")

	cline = ClustalwCommandline(clustalpath, infile="temporary.fasta")
	cline()

	# NOTE(review): the Clustal -> FASTA round trip is kept from the original
	# implementation; its purpose is not evident from this file.
	align = AlignIO.read("temporary.aln","clustal")
	SeqIO.write(align,"temporary.fasta","fasta")
	alignment = AlignIO.read("temporary.fasta","fasta")

	reference_residues = []
	other_residues = []
	for record in alignment:
		if record.id in (stp_label, stp_id):
			reference_residues.extend(record.seq)
		else:
			other_residues.extend(record.seq)

	if not reference_residues:
		# Previously this surfaced as an unexplained ZeroDivisionError.
		raise ValueError("No aligned record matches reference label %r" % stp_label)

	# zip() stops at the shorter sequence; unmatched tail positions count as
	# mismatches because the denominator is the full reference length.
	matches = sum(aa1 == aa2 for aa1, aa2 in zip(reference_residues, other_residues))
	pct_identity = 100.0*matches/len(reference_residues)

	if cutnum < pct_identity:
		return 1
	return 0

def ABSTSEQ(number,library_data,clustalpath,cutnum):
	"""Find later library entries that JUDGEIDENTITY flags against entry `number`.

	Every entry located after position `number` is paired with
	library_data[number] and passed to JUDGEIDENTITY together with
	clustalpath and cutnum.  The position of each entry for which
	JUDGEIDENTITY returns 1 is recorded.

	Returns:
		A list of the flagged positions as decimal strings, ordered from the
		highest position to the lowest.
	"""
	reference = library_data[number]
	flagged = []
	for position in range(number + 1, len(library_data)):
		pair = [reference, library_data[position]]
		if JUDGEIDENTITY(pair, clustalpath, position, reference, cutnum) == 1:
			flagged.append(str(position))
	# Positions were collected in ascending order; hand them back highest first.
	flagged.reverse()
	return flagged
		

##############################
#Main program
##############################

import os,sys,re,random
import numpy as np
from Bio.Align.Applications import ClustalwCommandline
from Bio import AlignIO
from Bio import SeqIO

residuefix = []

# Command-line options, each followed by exactly one value:
#   -LIBRARY      file that is read as the sequence library
#   -CLUSTALPATH  path handed to ABSTSEQ for the alignment step
#   -CUTNUM       threshold, converted to float
#   -OUTPUT       file the surviving records are written to
# Tokens that are not one of these options are skipped.
library = clustalpath = cutnum = output = None

while len(sys.argv) > 1:
	option = sys.argv[1]
	del sys.argv[1]
	if option in ("-LIBRARY", "-CLUSTALPATH", "-CUTNUM", "-OUTPUT"):
		# An option given as the last token has no value to consume.
		if len(sys.argv) < 2:
			sys.exit("Option %s requires a value" % option)
		value = sys.argv[1]
		del sys.argv[1]
		if option == "-LIBRARY":
			library = value
		elif option == "-CLUSTALPATH":
			clustalpath = value
		elif option == "-CUTNUM":
			try:
				cutnum = float(value)
			except ValueError:
				sys.exit("Option -CUTNUM expects a number, got %r" % value)
		else:
			output = value

# All four options are needed further down; report what is missing here
# instead of failing later with a NameError.
missing_options = [
	name for name, given in (
		("-LIBRARY", library),
		("-CLUSTALPATH", clustalpath),
		("-CUTNUM", cutnum),
		("-OUTPUT", output),
	) if given is None
]
if missing_options:
	sys.exit("Missing required option(s): %s" % ", ".join(missing_options))

number = 0
remseqs = []       # first line of every record that served as the reference
deleted_nums = []  # per reference: number of records removed, as a string

# Records are separated by blank lines.  The with-block closes the handle
# even if reading fails.
with open(library) as library_file:
	library_data = library_file.read().split("\n\n")

init_1 = len(library_data)

print("Number of initial number of libaray is %(init_1)i"%vars())

# Blank chunks only need to be dropped once: deleting records below never
# creates new blank entries.
library_data = NULLELIM(library_data)
num_of_library = len(library_data)

# Each remaining record becomes the reference in turn; records after it that
# ABSTSEQ reports are removed.  An empty library skips the loop entirely.
while number < num_of_library:

	print("Number %(number)i th sequence is now analyzing..."%vars())

	remseqs.append(library_data[number].split("\n")[0])
	tmp_prior_del = num_of_library

	delete_array = ABSTSEQ(number,library_data,clustalpath,cutnum)

	# Delete from the highest index down so earlier deletions do not shift
	# the positions still to be removed.
	for tmp_del in sorted((int(entry) for entry in delete_array), reverse=True):
		del library_data[tmp_del]

	number = number + 1
	num_of_library = len(library_data)

	deleted_nums.append(str(tmp_prior_del - num_of_library))

	print("Total number of sequences in library = %(num_of_library)i"%vars())

# Write the surviving records: a count line, then every record followed by a
# blank line.  The with-blocks close the files even if a write fails.
with open(output,"w") as outputfile:
	outputfile.write("Number of sequences: %(num_of_library)i\n"%vars())
	for tmp in library_data:
		outputfile.write("%(tmp)s\n\n"%vars())

# Log one "<reference header>:<number of records removed>" line per reference.
with open("analysis.log","w") as outputfile_log:
	for label, removed in zip(remseqs, deleted_nums):
		tmpdata = label+":"+removed
		outputfile_log.write("%(tmpdata)s\n"%vars())









