import pandas as pd 
import numpy as np
import re
from sklearn import datasets, linear_model
from sklearn.feature_selection import f_regression, SelectFdr
from statsmodels.sandbox.stats.multicomp import multipletests

def _parseHodgesCharacteristics(charstring):
	"""Parse one GSE3790 sample-characteristics string.

	Args:
		charstring: Free-text description containing ``/<digits>``,
			``Age = <digits>``, ``sex = <char>`` and, optionally,
			``grade <digit>``.

	Returns:
		Tuple ``(genotype, age, sex, grade)``. ``genotype`` is the integer
		that follows the first ``/`` in the string, and ``grade`` is -1
		when the word "grade" does not occur at all.

	Raises:
		AttributeError: If a required field is absent from the string.
		ValueError: If a numeric field is present but carries no digits.
	"""
	# Capture groups replace the former ``.group().lstrip(...)`` idiom:
	# ``lstrip`` removes a *set of characters*, not a prefix, so it could
	# also eat the value itself (e.g. a sex code of 's', 'e' or 'x').
	genotype = int(re.search(r'/(\d*)', charstring).group(1))
	age = int(re.search(r'Age = (\d*)', charstring).group(1))
	sex = re.search(r'sex = (\w)', charstring).group(1)
	if re.search('grade', charstring) is None:
		grade = -1
	else:
		grade = int(re.search(r'grade (\d)', charstring).group(1))
	return genotype, age, sex, grade


def loadHodgesGenes(gradeType = "all", fdrthresh = 0.01):
	"""Return Entrez IDs of GPL97 probes associated with genotype or grade.

	For every probe of the GSE3790 GPL97 series matrix, the log-transformed
	values of the caudate-nucleus samples are tested against genotype,
	grade and age with univariate F-tests (``f_regression``). The genotype
	and grade p-values are Benjamini-Hochberg corrected, and probes passing
	``fdrthresh`` for either one are mapped to Entrez gene IDs.

	NOTE: earlier revisions overwrote ``gradeType`` with "all" on entry, so
	the "low" branch could never run. The argument is now honoured; calling
	with no argument (or any value other than "low") reproduces the old
	results.

	NOTE: the GPL96 series matrix used to be loaded here as well, but
	nothing derived from it reached the result, so those reads were removed.

	Args:
		gradeType: "low" keeps only caudate samples with grade < 3 (samples
			without a grade are coded -1 and therefore kept); any other
			value keeps every caudate sample.
		fdrthresh: Threshold applied to the corrected p-values (also passed
			to ``multipletests`` as ``alpha``).

	Returns:
		List of ints, one per significant probe that has an annotation
		entry; for probes mapped to several IDs (" /// "-separated) only the
		first ID is kept.
	"""
	seriesFile = "/home/andrew/GSE3790/GSE3790-GPL97_series_matrix.txt"
	gsemat = pd.read_csv(seriesFile, sep = "\t", header = 81, index_col = 0)
	descmatall = pd.read_csv(seriesFile, sep = "\t", header = 49, index_col = 0, nrows = 16)

	# Both tables get the description header with its last two characters
	# dropped as column labels, so that masks built on one align with the other.
	sampleNames = [col[:-2] for col in descmatall.columns]
	gsemat.columns = sampleNames
	descmatall.columns = sampleNames

	# Rows are addressed by position: row 6 holds the tissue label and row 8
	# the characteristics string. (``iloc`` stays unambiguous even if the
	# description index contains repeated labels.)
	caudateIndices = descmatall.iloc[6] == "Caudate Nucleus"

	# Parse every characteristics string exactly once.
	sampleInfo = pd.DataFrame(
		[_parseHodgesCharacteristics(text) for text in descmatall.iloc[8]],
		index = descmatall.columns,
		columns = ['genotype', 'age', 'sex', 'grade'])

	if gradeType == "low":
		mask = caudateIndices & (sampleInfo['grade'] < 3)
	else:
		mask = caudateIndices

	# samples x probes, restricted to probes without missing values
	logExpr = np.log(gsemat.dropna(axis = 0).T[mask] + 1)

	# Column order fixes the meaning of the F-test output: cag, grade, age.
	features = sampleInfo.loc[mask, ['genotype', 'grade', 'age']].to_numpy()

	# f_regression returns (F statistics, p-values); only the p-values are used.
	pvalRows = [f_regression(features, logExpr[probe], center = True)[1]
				for probe in logExpr.columns]
	fpvals = pd.DataFrame(pvalRows, index = logExpr.columns, columns = ['cag', 'grade', 'age'])

	# correcting pvalues (Benjamini-Hochberg), separately per covariate
	corpvals = pd.DataFrame({
		'cag': multipletests(fpvals['cag'], alpha = fdrthresh, method = "fdr_bh")[1],
		'grade': multipletests(fpvals['grade'], alpha = fdrthresh, method = "fdr_bh")[1],
	}, index = fpvals.index)

	significant = (corpvals['cag'] < fdrthresh) | (corpvals['grade'] < fdrthresh)
	threshGenes = corpvals.index[significant]

	# probe ID -> Entrez ID string from the platform annotation table
	affyconv = pd.read_csv("/home/andrew/paper1/HodgesSup/GPL97-17394.txt", header = 16, sep = "\t")
	affyconv = affyconv.dropna(subset = ['ENTREZ_GENE_ID'])
	affydict = dict(zip(affyconv['ID'], affyconv['ENTREZ_GENE_ID']))

	return [int(affydict[probe].split(' /// ')[0]) for probe in threshGenes if probe in affydict]

def loadDurrenburger():
	"""Read the GSE26927 series matrix together with its annotation tables.

	The function reads four tables from fixed paths and derives a few
	per-sample and per-probe flags from them. Nothing is returned: every
	intermediate is a local that is discarded when the function exits.

	Steps, in order:
		1. Read the series matrix twice: the data table (header row 77) and
		   a 15-row description table (header row 41).
		2. From the description table, flag samples whose row-8 entry
		   contains 'Huntington' and trim the row-10 age strings.
		3. Read the platform annotation table and the HGNC-to-Entrez table.
		4. Build a symbol -> integer Entrez ID dict (NaN IDs dropped) and
		   flag annotation rows whose SYMBOL is in that dict.
	"""
	seriesFile = "/home/andrew/paper1/humanSamples/DurrenburgerCN/GSE26927_series_matrix.txt"
	gsemat = pd.read_csv(seriesFile, sep = "\t", header = 77, index_col = 0)
	descmatall = pd.read_csv(seriesFile, sep = "\t", header =41, index_col = 0, nrows = 15)

	diseaseRow = descmatall.iloc[8]
	httSamples = diseaseRow.apply(lambda label: 'Huntington' in label)

	# lstrip removes leading characters drawn from the given set; digits are
	# not in that set, so the numeric part of each entry is left in place.
	ageRow = descmatall.iloc[10]
	ageDeath = ageRow.apply(lambda label: label.lstrip("age at death (in years): "))

	# converting illumina to entrez
	convTable = pd.read_csv(
		"/home/andrew/paper1/humanSamples/DurrenburgerCN/gplAnnot.txt",
		sep = "\t", header = 6, index_col = 0)
	hgncToEntrez = pd.read_csv("/home/andrew/NP/hgncToEntrez.txt",sep = "\t")

	symbols = hgncToEntrez['Approved Symbol']
	entrezIds = hgncToEntrez['Entrez Gene ID(supplied by NCBI)']
	# Build the raw dict first so that, for repeated symbols, the last row
	# wins before the NaN filter is applied.
	convDict2 = dict(zip(symbols, entrezIds))
	convDict2b = {}
	for symbol, entrezId in convDict2.items():
		if np.isnan(entrezId):
			continue
		convDict2b[symbol] = int(entrezId)

	inTable = convTable['SYMBOL'].apply(lambda symbol: symbol in convDict2b)


def loadHodgesList():
	"""Load the Hodges supplementary table and annotate it with Entrez IDs.

	Reads ``TableS1.xls`` (the list taken directly from Hodges, stages 0-2)
	and the GPL97 and GPL96 annotation tables, then adds these columns, in
	this order:

		adjp            Benjamini-Hochberg adjusted 'P.Value'
		Entrez          raw ENTREZ_GENE_ID string for the probe, or 'NA'
		entrezList      'Entrez' split on ' /// '
		entrezList_filt 'entrezList' without 'NA' entries and without IDs
		                >= 100000000
		validList       True when exactly one ID survives the filter

	Returns:
		The supplementary table as a DataFrame with the columns above added.
	"""
	suppFile = '/home/andrew/paper1/HodgesSup/TableS1.xls'
	suppTable = pd.read_excel(suppFile, header = 1)

	def stripSuffix(probe):
		# Remove one trailing '_B' marker. The previous ``rstrip('_B')``
		# stripped any run of '_' / 'B' characters, which would also
		# truncate an ID that legitimately ends in one of them.
		return probe[:-2] if probe.endswith('_B') else probe

	suppTable['Caudate Probeset'] = suppTable['Caudate Probeset'].apply(stripSuffix)

	affyconv_a = pd.read_csv("/home/andrew/paper1/HodgesSup/GPL97-17394.txt", header = 16, sep = "\t")
	affyconv_b = pd.read_csv("/home/andrew/paper1/HodgesSup/GPL96-57554.txt", header = 16, sep = "\t")

	# probe ID -> ENTREZ_GENE_ID across both platforms; rows missing either
	# field are dropped, and for repeated IDs the last row wins.
	affyconv = pd.concat([affyconv_a, affyconv_b])[['ID', 'ENTREZ_GENE_ID']].dropna()
	convDict = dict(zip(affyconv['ID'], affyconv['ENTREZ_GENE_ID']))

	suppTable['adjp'] = multipletests(suppTable['P.Value'], alpha = 0.05, method = 'fdr_bh')[1]

	# Unmapped probes get the sentinel 'NA' (dict.get replaces a bare except).
	suppTable['Entrez'] = suppTable['Caudate Probeset'].apply(lambda probe: convDict.get(probe, 'NA'))

	# remove probes that do not have a unique mapping
	def filtFunc(entrezIds):
		# ``entrezIds`` is always the list produced by str.split below, so
		# the 'NA' sentinel arrives as the one-element list ['NA'].
		return [gene for gene in entrezIds if gene != 'NA' and int(gene) < 100000000]

	suppTable['entrezList'] = suppTable['Entrez'].str.split(' /// ')
	suppTable['entrezList_filt'] = suppTable['entrezList'].apply(filtFunc)
	suppTable['validList'] = suppTable['entrezList_filt'].apply(lambda ids: len(ids) == 1)

	return suppTable

def loadHodgesTable():
	"""Load the GSE3790 series matrices and parse per-sample features.

	Reads the GPL97 and GPL96 series-matrix files from fixed paths, relabels
	their columns, flags caudate-nucleus samples and parses genotype, age,
	sex and grade from the GPL97 sample descriptions.

	NOTE(review): in the portion of the function visible here nothing is
	returned and several locals (gradeType, gsemat_all, caudateIndices, the
	gse* series) are never read -- confirm whether the function continues
	or is unfinished.
	"""
	# load in hodges Table, not summary statistics
	gradeType = "all"
	# Each file is read twice: once for the data table (header row 81) and
	# once for a 16-row description table (header row 49).
	gsemat = pd.read_csv("/home/andrew/GSE3790/GSE3790-GPL97_series_matrix.txt", sep = "\t", header = 81, index_col = 0)
	descmatall = pd.read_csv("/home/andrew/GSE3790/GSE3790-GPL97_series_matrix.txt", sep = "\t", header = 49, index_col = 0, nrows = 16)

	gsemat_b = pd.read_csv("/home/andrew/GSE3790/GSE3790-GPL96_series_matrix.txt", sep = "\t", header = 81, index_col = 0)
	descmatall_b = pd.read_csv("/home/andrew/GSE3790/GSE3790-GPL96_series_matrix.txt", sep = "\t", header = 49, index_col = 0, nrows = 16)

	def extractFeatures(charstring):
		"""Parse (genotype, age, sex, grade) out of one description string.

		genotype is the integer after the first '/', age the integer after
		'Age = ', sex the single character after 'sex = ', and grade the
		digit after 'grade ' -- or -1 when 'grade' does not occur at all.
		Raises AttributeError when a required pattern is missing.
		"""
		teststring = charstring
		# NOTE: lstrip strips a set of characters rather than a prefix; this
		# works here because digits are not part of any of the strip sets.
		genotype = int(re.search('/\d*',teststring).group().lstrip('/'))
		age = int(re.search('Age = \d*',teststring).group().lstrip('Age = '))
		sex = re.search('sex = \w',teststring).group().lstrip('sex = ')
		if re.search('grade',teststring) == None:
			grade = -1
		else:
			grade = int(re.search('grade \d',teststring).group().lstrip('grade '))
		return genotype, age, sex, grade

	# replace indices: both tables of a platform get the description header
	# with its last two characters dropped as column labels
	gsemat.columns = [col[:-2]for col in descmatall.columns]
	descmatall.columns = [col[:-2] for col in descmatall.columns]

	gsemat_b.columns = [col[:-2]for col in descmatall_b.columns]
	descmatall_b.columns = [col[:-2] for col in descmatall_b.columns]

	# stack the rows of both platforms into one frame
	gsemat_all = pd.concat([gsemat,gsemat_b])

	# boolean per-sample flag from the 7th description row (position 6)
	caudateIndices = descmatall.loc[descmatall.index[6],] == "Caudate Nucleus"



	# extract features from the 9th description row (position 8); the string
	# is re-parsed once per feature
	gsegenotype = descmatall.loc[descmatall.index[8],].apply(lambda x: extractFeatures(x)[0])
	gseage = descmatall.loc[descmatall.index[8],].apply(lambda x: extractFeatures(x)[1])
	gsesex = descmatall.loc[descmatall.index[8],].apply(lambda x: extractFeatures(x)[2])
	gsegrade = descmatall.loc[descmatall.index[8],].apply(lambda x: extractFeatures(x)[3])



