# R code: Global mapping of highly pathogenic avian influenza H5N1 and 
# H5Nx clade 2.3.4.4 viruses with spatial cross-validation
#
# Madhur S Dhingra, Jean Artois, Timothy P Robinson, Catherine Linard, Celia Chaiban
# Ioannis Xenarios, Robin Engler, Robin Liechti, Dmitri Kuznetsov, Xiangming Xiao
# Sophie Von Dobschuetz, Filip Claes, Scott H Newman, Gwenaëlle Dauphin, Marius Gilbert.
#
# 2016/10/11

# Loads the library
library(gbm)
library(dismo)
library(pROC)

# Loads the 'thrSample' function
# the thrSample function selects a specific number of points (equal to k, the number of folds CV)
# not too close of each other selected points. For this purpose, the user specifies the setting
# parameter “thr” corresponding to the minimum distance between the selected points.

thrSample <- function(     
  thr = 4000, # the minimum distance
  coords = c("x", "y"), # the column names of geographic coordinates
  data = data, # the dataset
  nPts = 4) { # the number of items to choose
  
  # Stocks the results in 'MyId'
  MyId <- rep(NA, nPts)
  
  # Samples one observation from 'data' and saves it
  MyRow <- list(1:nrow(data)) 
  MyId[1] <- sample(MyRow[[1]], 1)
  
  # Computes all the distances between the sampled 
  # observation and the remaining observations
  # and stores it in 'MyDist'
  MyDist <- list(spDistsN1(as.matrix(data[,coords]),
                           as.matrix(data[MyId[1], coords]), longlat = TRUE))
  
  # Repeats the previous process a number of time
  # equal to the number of items to choose
  # -1 (to account for the first sampled observation)
  for(ii in 2:nPts){
    
    # Selects the ID of row that meet the minimum distance condition
    MyRowTr <- which(MyDist[[(ii-1)]] > thr)
    
    # Stops the function if no observation meets the condition
    if(length(MyRowTr) == 0) {
      stop("Restarts the function with a smaller minimum distance")
    }
    
    # Stores the ID
    MyRow[[ii]] <- MyRowTr 
    
    # Counts the number of time the observations meet the  
    # distance condition over iteration loops already performed 
    TestTabl <- table(unlist(MyRow))
    
    # Samples an observation from idThr, a vector of observation
    # that meet each time the minimum distance condition
    idThr <- as.numeric(names(which(TestTabl == ii)))
    MyId[ii] <- sample(idThr, 1)   
    
    # Computes the new distances
    MyDist[[ii]] <- spDistsN1(as.matrix(data[, coords]),
                              as.matrix(data[MyId[ii], coords]), longlat = TRUE)
  }
  return(MyId)
}

# Loads the 'Lib_DistEstimatesID' function
# Returns the identity row of nearest point
# in a vector Y for each element of a vector X.
Lib_DistEstimatesID <- function(myVec1, myVec2, longlat = TRUE){
	MyRes <- sapply(1:nrow(myVec1), function(i)
	                which.min(spDistsN1(myVec2, myVec1[i,], longlat = longlat)))
	return(MyRes)
}


# Loads the influenza cases and the pseudo-absences
dataTot <- read.table("../Response variable/EpiData.csv", head=T, sep=",", dec=".")


## Sets the run settings
#########################

# Models
P_InAdd <- 20152 # number of pseudo-absences in the models
P_dist <- lapply(1:20, function(ii) c(10, seq(1000,2995,105)[[ii]])) # a set of tested distances 

# BRT
P_tc <- 4 # tree complexity
P_lr <- 0.01 # learning rate
P_nfolds <- 5 # number of folds in the cross-validation
P_nt <- 100 # initial number of tree
P_ss <- 200 # step size

# The predictor names
P_VarList <- list(
  
        c("ChDnLgExt", "ChDnLgInt", "DuDnLg", "HpDnLg", "IsChina"),

				c("DW","Evergreen.Deciduous","Open.Water","Evergreen.Broadleaf.Trees","Deciduous.Broadleaf.Trees","Mixed.Other.Trees",
				"Shrubs","Herbaceous.Vegetation","Cultivated.and.Managed.Vegetation","Regularly.Flooded.Vegetation","Urban.Built.up", "IsChina"),
				
				c("Temperature.Annual.mean","Temperature.Amplitude.annual","Temperature.Amplitude.bi.annual","Temperature.Amplitude.tri.annual",
				"Temperature.Variance.of.annual", "Temperature.Variance.of.bi.annual","Temperature.variance.in.annual..bi.annual..and.tri.annual",
				"NDVI.Annual.mean","NDVI.Amplitude.annual","NDVI.Amplitude.bi.annual","NDVI.Amplitude.tri.annual", "NDVI.Variance.of.annual",
				"NDVI.Variance.of.bi.annual","NDVI.Variance.of.tri.annual","NDVI.variance.in.annual..bi.annual..and.tri.annual", "IsChina"),
				
				c("Cultivated.and.Managed.Vegetation","Open.Water","DW","Temperature.Annual.mean"))
			  
names_Set <- list("Set1", "Set2", "Set3", "Set4", "set2.1", "set3.1")

P_Prefix <- names_Set[set]
P_Prefix <- paste0("PModel", P_Model, "_", P_Prefix)


## Computes the standard models
###############################

# Runs the script for the set 1
set = 1
P_VarListF <- P_VarList[[set]]

# sets the dependent, i.e. finds the ID of the column "IsPresent" in dataTot.
myPosDep <- match("IsPresent", names(dataTot))

# Sets the predictors, i.e. finds the ID of predictors in dataTot
myPosPred <- match(P_VarListF, names(dataTot))

# Runs the script for one bootstrap, 
# i.e. one setting parameter of maximum distance and one set of pseudo-absence 
bo = 1

# Selects the HPAI H5N1 cases and one set of pseudo-absence
id <- c(which(dataTot$IsPresent==1 &  dataTot$pix=="H5N1"), which(dataTot$IsPresent==0)[((P_InAdd*(bo-1))+1):(P_InAdd*bo)])
dataTotbo <- dataTot[id,]

# Removes the pseudo-absences too close or too far of presences
idDistExclu <- which(dataTotbo$distp1 < P_dist[[bo]][1] | dataTotbo$distp1 > P_dist[[bo]][2])
dataTotbo <- dataTotbo[-idDistExclu,]
	
# Runs the scripts for one fold draw	
bo2 = 1

# Runs the standard BRT
gbmSav <- gbm.step(data = dataTotbo, gbm.x = myPosPred, gbm.y = myPosDep,
				           family = "bernoulli",tree.complexity = P_tc, learning.rate = P_lr, 
				           n.folds = P_nfolds, n.trees = P_nt, step.size = P_ss)

## Computes the spatial models
##############################

# Builds the spatial folds
MyRef <- dataTotbo[which(dataTotbo$pix == "H5N1"), c("x","y")]
MyId <- thrSample(thr = 3000, coords = c("x", "y"),
                  data = MyRef, nPts = P_nfolds)
MyRef <- MyRef[MyId,]
dataTotbo$LabGeo <- Lib_DistEstimatesID(as.matrix(dataTotbo[,c("x","y")]),
                                        as.matrix(MyRef))

plot(dataTotbo[,c("x","y")], pch = 16, col = as.factor(dataTotbo$LabGeo), asp = 1, axes = F)

# Runs the spatial BRT
myBRTGeo <- gbm.step(data = dataTotbo, gbm.x = myPosPred, gbm.y = myPosDep, fold.vector = dataTotbo$LabGeo,
                     family = "bernoulli", tree.complexity = P_tc, learning.rate = P_lr,
                     n.folds = P_nfolds, n.trees = P_nt, step.size = P_ss)

## Computes the SSB models
##########################

# Counts the number of presences/pseudo-absences
Count <- table(dataTotbo$IsPresent)

# Randomly samples the fraction of data to be sampled for each validation.
# The result is stocked as labels in 'selector' object.
# For this example we specify a number of fold equal to 5.
# The following section of code is taken from the `gbm.step` function (dismo package)
presence.mask <- dataTotbo[, "IsPresent"] == 1
absence.mask <- dataTotbo[, "IsPresent"] == 0
n.pres <- sum(presence.mask)
n.abs <- sum(absence.mask)
selector <- rep(0, nrow(dataTotbo))
temp <- rep(seq(1, P_nfolds, by = 1), length = n.pres)  
temp <- temp[order(runif(n.pres, 1, 100))]
selector[presence.mask] <- temp
temp <- rep(seq(1, P_nfolds, by = 1), length = n.abs)
temp <- temp[order(runif(n.abs, 1, 100))]
selector[absence.mask] <- temp
		
# Computes only one SSB AUC
nfold = 1

# Splits the data into validation/testing and training data  
idTrain1 <- which(selector != nfold & dataTotbo$IsPresent == 1)
idTrain0 <- which(selector != nfold & dataTotbo$IsPresent == 0) 
idVal1 <- which(selector == nfold & dataTotbo$IsPresent == 1)
idVal0 <- which(selector == nfold & dataTotbo$IsPresent == 0)
Val1 <- dataTotbo[idVal1,]
Val0 <- dataTotbo[idVal0,]
Tr1 <- dataTotbo[idTrain1,]
Tr0 <- dataTotbo[idTrain0,]

# Runs the first BRT on the training data
myBRTGeo1 <- gbm.step(data = rbind(Tr1,Tr0), gbm.x = myPosPred, gbm.y = myPosDep,
		                  family = "bernoulli", tree.complexity = P_tc, learning.rate = P_lr,
		                  n.folds = P_nfolds, n.trees = P_nt, step.size = P_ss)

# Samples the validation/testing data to remove "spatial sorting biais"
i <- pwdSample(Val1[,c("x","y")], Val0[,c("x","y")], Tr1[,c("x","y")], lonlat = TRUE, nearest = TRUE)
sVal1 <- Val1[!is.na(i), ]
ValDF <-  rbind(sVal1,Val0)

# Predicts the probability of presence on the filtered validation/testing
ValDF$myPred1 <- predict.gbm(myBRTGeo1, ValDF, n.trees=myBRTGeo1$gbm.call$best.trees, type="response")

# Standard CV auc
myBRTGeo1$cv.statistics$discrimination.mean

# SSB CV auc
roc(ValDF$IsPresent ~ ValDF$myPred1, data = ValDF)$auc[1]