# Construction of a genetic map for Streblospio benedicti

# Step 4: Inference of founder genotypes

# This analysis requires the founder genotype phases, inferred from segregation of intercross-pattern SNPs in the G2 generation, and poolseq SNP allele read counts. The script calculates the likelihoods that the founder genotype phases correspond to (Bayonne:LongBeach) vs (LongBeach:Bayonne). 

haps <- read.csv(file = "Sb702ParentalHaplotypeAssignment.csv") #founder genotype phases for SNPs infered (Step 1) to have intercross segregation pattern (AAxBB).

rcs <- read.table(file = "PoolSeqReadCounts.txt", head = F) # poolseq SNP allele read counts. Columns are SNP ID, contig, position, count of reference allele reads from Bayonne pools, count of alternative allele reads from Bayonne pools, count of reference allele reads from Long Beach pools, count of alternative allele reads from Long Beach pools. 

HapLogLs <- data.frame(nrow = 10, ncol = 7)

# Calculate haplotype assignment likelihoods for each of the 10 autosomes in turn

for(j in 1:10){ # for each autosome 
	
	chr <- haps[haps$snpID %in% rcs[rcs$V1 %in% c(haps[haps$Sb271LG ==j,1]),1],] #Extract the intercross SNPs for that chromosome
	
	rc <- rcs[rcs$V1 %in% c(haps[haps$Sb271LG ==j,1]),]	# Extract the poolseq read counts for those SNPs
	
	rc[,c(4:7)] <- rc[,c(4:7)] +1 # Add one to each read count to deal with zeros. 


	###### Estimate allele frequencies for each SNP for each population
	
	probrefbay <- NULL;
	probrefLB <- NULL
	for(i in 1:dim(chr)[1]){
		probrefbay[i] <- (rc[which(rc$V1 == chr$snpID[i]),4]) / ((rc[which(rc$V1 == chr$snpID[i]),4]) + rc[which(rc$V1 == chr$snpID[i]),5])
		probrefLB[i] <-  (rc[which(rc$V1 == chr$snpID[i]),6]) / ((rc[which(rc$V1 == chr$snpID[i]),6]) + rc[which(rc$V1 == chr$snpID[i]),7])
	}
	probaltbay <- 1-probrefbay
	probaltLB <- 1-probrefLB

	probmat <- data.frame(probrefbay, probaltbay, probrefLB, probaltLB)
	probmat <- probmat^2 # Square each allele frequency to get the estimated homozygote frequency at Hardy-Weinberg Equilibrium.

	##### Make vectors of genotype probabilities for each of the founder assignment possibilities

	bayvec1 <- NULL
	lbvec1 <- NULL
	for(i in 1:dim(chr)[1]){
		bayvec1[i] <- probmat[i,chr$Hap1[i]+1]
		lbvec1[i] <- probmat[i, chr$Hap1[i]+3]
	}

	bayvec2 <- NULL
	lbvec2 <- NULL
	for(i in 1:dim(chr)[1]){
		bayvec2[i] <- probmat[i,chr$Hap2[i]+1]
		lbvec2[i] <- probmat[i, chr$Hap2[i]+3]
	}

	##### Next, calculate the log likelihood of each assignment

	HapLogLs[j, 1] <- sum(log(bayvec1)) # This is the log likelihood that the first haplotype derives from the Bayonne population
	HapLogLs[j, 2] <- sum(log(lbvec1))	# This is the log likelihood that the first haplotype derives from the Long Beach population
	HapLogLs[j, 3] <- sum(log(bayvec2)) # This is the log likelihood that the second haplotype derives from the Bayonne population
	HapLogLs[j, 4] <- sum(log(lbvec2))	# This is the log likelihood that the second haplotype derives from the Long Beach population
}

	# Sum the log likelihoods to get the log likelihoods for the hypothesized arrangement and the alternative arrangment
HapLogLs[,5] <- apply(HapLogLs[,c(1,4)], 1, sum)
HapLogLs[,6] <- apply(HapLogLs[,c(2,3)], 1, sum)

	# Take the log-10 ratio of the likelihoods as an easy-to-interpret test statistic.
HapLogLs[,7] <-log(exp(HapLogLs[,5]) / exp(HapLogLs[,6]),10)

names(HapLogLs) <-c ("logLHap1Bay", "logLHap1LB", "logLHap2Bay", "logLHap2LB", "logL12BayLB", "logL12LBBay", "log10LR.BayLBvsLBBay")

# Report the results:

HapLogLs









