#####
# 4C analysis
#####


################################################################################
# load library
################################################################################
library("FourCSeq")



################################################################################
# local functions
################################################################################
local.writeTrackFiles <- function(object=fc, assay = "counts", folder = "tracks", format = "bw", removeZeros = TRUE){
    #####
    # generate RPM bigwig files
    #####
    stopifnot(class(object) == "FourC")
    if (!format %in% c("bw", "bedGraph")) 
        stop("Format has to be 'bw' or 'bedGraph'")
    if (!assay %in% names(assays(object))) 
        stop("Select a valid assay of the FourC object.")
    ivCols <- c("seqnames", "start", "end")
    fragData = rowRanges(object)
    mcols(fragData) = NULL
    data <- assays(object)[[assay]]
    outputDir <- file.path(metadata(object)$projectPath, folder)
    if (!file.exists(outputDir)) 
        dir.create(outputDir)
    tmp <- options(scipen = 12, digits = 2)
    for (name in colnames(data)){
        outputFile = file.path(outputDir, paste0(assay, "_", name, ".", format))
        notNA <- !is.na(data[, name])
        if (removeZeros) 
            notNA[notNA][(data[notNA, name] == 0)] <- FALSE
        gr = fragData[notNA, ]
        gr$score = data[notNA, name]
        gr = keepSeqlevels(gr, unique(as.character(seqnames(gr))))
        gr$score = 1000000*gr$score/sum(gr$score)
        rtracklayer::export(gr, outputFile, format = format)
        rm(gr)
    }
    options(tmp)
    return(paste0("Successfully created ", format, " files of the ", assay, " data."))
}

local.FourC.Counts <- function(ph){
    #####
    # FourC analysis -- count track
    #####

    ## ref genome
    hg19_genome = "/michorlab/hjwu/code/bior_4C/ref/hg19/hg19_genome_bt2/hg19.fa"
    #referenceGenomeFile = "/michorlab/hjwu/task/genome/hg19/Sequence/Chromosomes/chr8.fa"

    ## bam file folder
    bamFilePath = "../../data"
    bamFilePrefix = paste0(ph$Condition, "_Rep", 1:3, "_R1")

    ## primer seq (used to get the view point, not used here, we manully insert the view point coordinates)
    primerFile = "primer.fa"

    ## sample condition
    cond = gsub("_ND.*", "", ph$Condition)

    ## view point name
    vp.name = gsub(".*_", "", ph$Condition)[1]

    ## create meta data
    metadata <- list(projectPath = "SOX17",
        fragmentDir = "re_fragments",
        referenceGenomeFile = hg19_genome,
        reSequence1 = "CATG",
        reSequence2 = "GATC",
        primerFile = primerFile,
        bamFilePath = bamFilePath)

    ## create col data
    colData <- DataFrame(viewpoint = vp.name,
        condition = factor(cond, levels=unique(cond)),
        levels = cond,
        replicate = ph$Replicate,
        bamFile = paste0(bamFilePrefix, "/", bamFilePrefix, ".bam"),
        sequencingPrimer="first")

    ## create FourC object
    fc <- FourC(colData, metadata)

    ## add RE fragments
    fc <- addFragments(fc, save=T)

    ## add view points
    colData(fc)$chr = ph$Chromosome[1]
    colData(fc)$start = ph$Viewpoint.Start[1]
    colData(fc)$end = ph$Viewpoint.End[1]

    ## count reads
    fc <- countFragmentOverlaps(fc, trim=4, minMapq=0)
    fc <- combineFragEnds(fc)

    ## write raw counts data in bedGraph
    writeTrackFiles(fc, format="bedGraph")

    ## smooth the counts
    fc <- smoothCounts(fc)

    ## write RPM in bigwig
    local.writeTrackFiles(fc, assay="counts_5")

    ## return object
    fc
}



################################################################################
# mapping stats
################################################################################
# paste <(find . -name ITER0.mapstat |grep _R1 |sort) <(find . -name ITER0.mapstat |grep _R1 |sort |xargs cat |grep "aligned exactly 1 time" |sed 's/ aligned exactly 1 time//g')
# paste <(find . -name ITER0.mapstat |sort) <(find . -name ITER0.mapstat |sort |xargs cat |grep "aligned exactly 1 time" |sed 's/ aligned exactly 1 time//g')



################################################################################
# read in data
################################################################################
## readin pheno data
pheno = read.delim("../../info/Data_summary_table.txt", stringsAsFactors=F)

## make output dir
system("mkdir SOX17")



################################################################################
# FourCSeq analysis
################################################################################
## different VP results
fc_ongene = local.FourC.Counts(pheno[grep("ND23660834", pheno$Condition),])
fc_closeCTCF = local.FourC.Counts(pheno[grep("ND23657852", pheno$Condition),])
fc_onDMR = local.FourC.Counts(pheno[grep("ND23658528", pheno$Condition),])
fc_closeDMR = local.FourC.Counts(pheno[grep("ND23658576", pheno$Condition),])
fc_upstreamTADcrtl = local.FourC.Counts(pheno[grep("ND23657570", pheno$Condition),])

## to a list data
fc_list = list(fc_ongene, fc_closeCTCF, fc_onDMR, fc_closeDMR, fc_upstreamTADcrtl)

## make track with smooth data
tmp <- lapply(fc_list, function(x){
    local.writeTrackFiles(x, assay="counts_5")
})





## calculate Z score
fcf <- getZScores(fc, minCount = 40)

## call peaks
fcf <- addPeaks(fcf, zScoreThresh=2, fdrThresh=0.05)

object=fc
assay = "counts"
folder = "tracks"
format = "bw"
removeZeros = TRUE
{
    stopifnot(class(object) == "FourC")
    if (!format %in% c("bw", "bedGraph")) 
        stop("Format has to be 'bw' or 'bedGraph'")
    if (!assay %in% names(assays(object))) 
        stop("Select a valid assay of the FourC object.")
    ivCols <- c("seqnames", "start", "end")
    fragData = rowRanges(object)
    mcols(fragData) = NULL
    data <- assays(object)[[assay]]
    outputDir <- file.path(metadata(object)$projectPath, folder)
    if (!file.exists(outputDir)) 
        dir.create(outputDir)
    tmp <- options(scipen = 12, digits = 2)
    for (name in colnames(data)) {
        outputFile = file.path(outputDir, paste0(assay, "_", 
            name, ".", format))
        notNA <- !is.na(data[, name])
        if (removeZeros) 
            notNA[notNA][(data[notNA, name] == 0)] <- FALSE
        gr = fragData[notNA, ]
        gr$score = data[notNA, name]
        gr = keepSeqlevels(gr, unique(as.character(seqnames(gr))))
        export(gr, outputFile, format = format)
        rm(gr)
    }
    options(tmp)
    return(paste0("Successfully created ", format, " files of the ", 
        assay, " data."))
}


object=fc
removeZeros = TRUE
minCount = 40
minDist = NULL
fitFun = "distFitMonotoneSymmetric"
sdFun = mad
maxDist = 2000000
minDist = 5000
{
    stopifnot(class(object) == "FourC")
    if (!c("counts") %in% names(assays(object))) 
        stop("No assay 'counts' found. Use 'combineFragEnds' first.")
    if (any(!c("chr", "start", "end") %in% names(colData(object)))) 
        stop("No information about viewpoint position provided. Add this information as described in the vignette.")
    if (c("zScore") %in% names(assays(object))) 
        stop("z-scores are already calculated. To recalculate z-scores use the object returned by 'combineFragEnds'.")
    viewpoint = unique(colData(object)$viewpoint)
    if (length(viewpoint) != 1) 
        stop("None or more than one viewpoint are contained in the 'FourC' object.\n         Use a 'FourC' object that contains only one viewpoint.")
    print(viewpoint)
    object <- object[seqnames(object) == unique(colData(object)$chr), 
        ]
    fragData = getDistAroundVp(viewpoint, colData(object), rowRanges(object))
    medianCounts <- apply(counts(object), 1, median)
    if (!is.null(minDist)) { #minDist = 2kb
        tooClose = abs(fragData$dist) <= minDist
    }else{
        toLeft <- fragData$dist > -20000 & fragData$dist < 0 & 
            !is.na(fragData$dist)
        afterMin <- 1:sum(toLeft) > tail(which(sign(diff(medianCounts[toLeft])) < 
            0), 1) + 1
        toExclude <- which(toLeft)[afterMin]
        toRight <- fragData$dist < 20000 & fragData$dist > 0 & 
            !is.na(fragData$dist)
        beforeMin <- 1:sum(toRight) < which(sign(diff(medianCounts[toRight])) > 
            0)[1]
        toExclude <- c(toExclude, which(toRight)[beforeMin])
        toExclude = union(toExclude, which(abs(fragData$dist) < 
            1000))
        tooClose = rep(FALSE, length(fragData$dist))
        tooClose[toExclude] = TRUE
    }
    fragData$posLeft[tooClose] = FALSE
    fragData$posRight[tooClose] = FALSE
    fragData$tooClose = tooClose
    lowCounts = medianCounts < minCount
    fragData$posLeft[lowCounts] = FALSE
    fragData$posRight[lowCounts] = FALSE
    fragData$lowCounts = lowCounts
    fragData$tooFar = abs(fragData$dist) > maxDist

    fragData$selectedForFit <- (fragData$posLeft | fragData$posRight) & !fragData$tooFar
    newCols <- c("tooClose", "lowCounts", "selectedForFit")
    mcolsRows <- DataFrame(type = rep("fragmentSelection", length(newCols)), 
        description = rep("", length(newCols)))
    mcols(mcols(fragData))[colnames(mcols(fragData)) %in% newCols, 
        ] <- mcolsRows
    rowRanges(object) <- fragData
    fragData = as.data.frame(fragData)
    colData(object)$condition <- factor(colData(object)$condition)
    dds <- object[mcols(object)$selectedForFit, ]
    dds <- estimateSizeFactors(dds)
    dds <- estimateDispersions(dds)
    if (attr(dispersionFunction(dds), "fitType") != "parametric") {
        stop("Failed to estimate the parameters of the Variance stabilizing transformation.")
    }
    else {
        coefs <- attr(dispersionFunction(dds), "coefficients")
        attr(dds, "vst") <- function(q) {
            log((1 + coefs["extraPois"] + 2 * coefs["asymptDisp"] * 
                q + 2 * sqrt(coefs["asymptDisp"] * q * (1 + coefs["extraPois"] + 
                coefs["asymptDisp"] * q)))/(4 * coefs["asymptDisp"]))/log(2)
        }
        attr(dds, "inverse-vst") <- function(q) {
            (4 * coefs["asymptDisp"] * 2^q - (1 + coefs["extraPois"]))^2/(4 * 
                coefs["asymptDisp"] * (1 + coefs["extraPois"] + 
                (4 * coefs["asymptDisp"] * 2^q - (1 + coefs["extraPois"]))))
        }
    }
    trafo <- getVST(dds)(counts(dds))
    fit <- apply(trafo, 2, fitFun, fragData = as.data.frame(rowRanges(dds)), 
        removeZeros = removeZeros, ...)
    residuals <- trafo - fit
    sd <- apply(residuals, 2, sdFun)
    zScore <- sweep(residuals, 2, sd, "/")
    pValue <- apply(zScore, 2, pnorm, lower.tail = FALSE)
    pAdjusted <- apply(pValue, 2, p.adjust, method = "BH")
    colData(dds)$sd <- sd
    idx <- which(colnames(colData(dds)) == "sd")
    metaDataFrame <- DataFrame(type = "intermediate", description = "sd/mad calculated from the residuals")
    mcols(colData(dds))[idx, ] <- metaDataFrame
    metadata(dds)$parameter <- DataFrame(fitFun = fitFun, removeZeros = removeZeros, 
        minCount = minCount, sdFun = sdFun, ...)
    if (!is.null(minDist)) 
        metadata(dds)$parameter$minDist = minDist
    assays(dds) <- c(assays(dds), SimpleList(trafo = trafo, fit = fit, 
        zScore = zScore, pValue = pValue, pAdjusted = pAdjusted))
    invisible(dds)
}








library(TxDb.Hsapiens.UCSC.hg19.knownGene)

plotZScores(fcf[,1:2], txdb=TxDb.Hsapiens.UCSC.hg19.knownGene, plotWindows=500000)













