Random cells in organoids are transduced to express oncogenes. However, only some transduced cells proliferate and lead to tumor formation. Initial observations suggest that tumors originate from clusters of transduced cells.

Goal: Find which features of transduced cell clusters best predict the formation of tumors.

For each cluster of transduced cells, the features to test are: - number of cells in the organoid - ratio of number of cells to organoid surface area (~cell density) - number of transduced cells in the organoid - number of cells in the cluster volume - number of transduced cells in the cluster - average pairwise distance between all cells in the cluster volume - average pairwise distance between transduced cells in the cluster - fraction of transduced cells in the cluster volume - number of contacts between transduced cells in the cluster

Load the data

Data consists of two csv files per organoid at the start of the experiment: - one file containing x,y,z coordinates and ID of all cells in the organoid, - one file containing IDs of all transduced cells in the organoid with 1 in the second column indicating that the cluster the transduced cell belongs to will produce a tumor. Only one transduced cell per cluster is annotated.

data.dir <- "organoids_tumors"
file.list <- list.files(data.dir, pattern = "Pos", full.names = TRUE)
organoids <- list()
for(i in 1:length(file.list)){
  # Read cells positions
  # The files have 2 lines above the header (hence the use of skip=2)
  # Be careful: sometimes a third empty line is at the top of the file and should be removed before reading here
  # Also read.csv() issues a warning if the file doesn't end with an empty line
  cells <- read.csv(file.list[[i]], header=TRUE, sep=",",  stringsAsFactors = FALSE, skip = 2)
  # Remove unnecessary columns
  cells <- cells[, -c(4:9)]
  # Rename columns
  colnames(cells) <- c("x","y","z","id")
  # Read labels
  annotation.file <- gsub("Pos", "Label", file.list[[i]])
  annotations <- read.csv(annotation.file, header=TRUE, sep=",", stringsAsFactors = FALSE)
  annotations <- cbind(annotations, 1)
  colnames(annotations) <- c("id", "tumor", "transduced")
  # Combine positions and annotations based on cell id
  cells <- merge(cells, annotations, by = "id", all.x = TRUE)
  cells[is.na(cells)] <- 0
  organoids[[i]] <- cells
}
incomplete final line found by readTableHeader on 'organoids_tumors/Label_0010.csv'incomplete final line found by readTableHeader on 'organoids_tumors/Label_0018.csv'

Identify clusters of transduced cells and extract features

library(dbscan)
library(car)
library(RColorBrewer)
library(pals)
library(dynamicTreeCut)
library(dendextend)
palette.cells <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(3+1L))
palette.clusters <- alphabet()
palette.clusters[c("damson","violet")] <- palette.clusters[c("violet","damson")]
organoid.features <- data.frame(number.of.cells = numeric(),
                                cell.density = numeric(),
                                number.of.transduced.cells = numeric(), 
                                tumor = numeric())
clusters <- data.frame(id = character(), 
                       number.of.cells.in.organoid = numeric(),
                       organoid.cell.density = numeric(),
                       number.of.transduced.cells.in.organoid = numeric(), 
                       number.of.cells.in.a.cluster = numeric(),
                       number.of.transduced.cells.in.a.cluster = numeric(),
                       average.distance.between.cells.in.a.cluster = numeric(),
                       average.distance.between.transduced.cells.in.a.cluster = numeric(),
                       fraction.of.transduced.cells.in.a.cluster = numeric(),
                       number.of.contacts.between.transduced.cells.in.a.cluster = numeric(),
                       tumor = numeric())
for(i in 1:length(organoids)) {
  cells <- organoids[[i]]
  
  # Coordinates for some organoids seem to be in nm instead of um
  if (max(cells[,c("x","y","z")])>1000) {
    cells[,c("x","y","z")] <- cells[,c("x","y","z")]/1000
  }
  
  # Estimate average cell diameter as average distance between nearest neighbours
  NN.dist <- kNNdist(cells[,c("x","y","z")], k=1)
  avg.cell.diameter <- mean(NN.dist)
  sd.cell.diameter <- sd(NN.dist)
  
  # Remove outliers
  # Outliers are cells that have a very large distance to their nearest neighbour
  idx.outliers <- which(NN.dist>avg.cell.diameter+3*sd.cell.diameter)
  if(length(idx.outliers)>0) {
    cells <- cells[-idx.outliers,]
  }
  
  # Recompute values for the organoid
  NN.dist <- kNNdist(cells[,c("x","y","z")], k=1)
  avg.cell.diameter <- mean(NN.dist)
  sd.cell.diameter <- sd(NN.dist)
  
  # Estimate organoid diameter as distance between the two most distant cells
  org.diameter <- max(dist(cells[,c("x","y","z")]))
  # Cell density is ratio of number of cells to organoid surface area
  # Multiply by 1000 to bring on the same scale as other features
  cell.density <- nrow(cells)/(4*pi*org.diameter^2) * 1000
  
  # # Show positions of cells in organoid
  # scatter3d(cells$x,cells$y,cells$z, surface = FALSE, sphere.size = 3, surface.col = palette.cells, col = palette.cells, xlab="x",ylab="y",zlab="z", axis.scales = FALSE,  axis.col = c("black", "black", "black"), groups = as.factor(cells$transduced+cells$tumor))
  
  # Identify clusters of transduced cells
  transduced.cells <- cells[which(cells$transduced == 1),]
  D <- dist(transduced.cells[,c("x","y","z")])
  n <- nrow(transduced.cells)
  if(n>2) {
    hc <- hclust(D, method = "complete")
    cluster.membership <- cutreeHybrid(hc, minClusterSize = 1, distM = as.matrix(D))
    transduced.cells$cluster <- cluster.membership$labels
    # Show clusters on dendrogram
    dend <- as.dendrogram(hc)
    dend %>% set("leaves_pch", 19-10*transduced.cells$tumor[order.dendrogram(dend)]) %>%  # node type
      set("leaves_cex", 2) %>%  # node size
      set("leaves_col", palette.clusters[transduced.cells$cluster[order.dendrogram(dend)]+1]) %>% # leaves colors
      set("labels", "") %>%
    plot(main = paste0("Clusters of transduced cells in organoid ",i))
    
  } else if(n==2) {
    if(D<avg.cell.diameter+2*sd.cell.diameter) { # Form one cluster
      transduced.cells$cluster <- 1
    } else { # Form two clusters
      transduced.cells$cluster <- c(1,2)
    }
  } else if(n==1) {
    transduced.cells$cluster <- 1
  } else {
    print(paste0("Organoid ",i," has no transduced cell."))
  }
  # Extract cluster features
  for(j in 1:max(transduced.cells$cluster)) {
    cluster.cells <- transduced.cells[transduced.cells$cluster==j,]
    id <- paste0(i,".",j)
    
    # Number of cells in the organoid
    n.organoid <- nrow(cells)
    
    # Number of transduced cells in the organoid
    n.transduced.organoid <- nrow(transduced.cells)
    
    # Number of cells in the cluster volume
    # The cluster volume is defined as the sphere centrered at the centre of mass of the cluster
    # with diameter equal to the distance between the two farthest transduced cells of the cluster
    cluster.centre <- apply(cluster.cells[, c("x","y","z")], 2, mean)
    if (nrow(cluster.cells)>1) {
      dist.cluster.cells <- dist(cluster.cells[, c("x","y","z")])
      cluster.diameter <- max(dist.cluster.cells) + 1e-6 # Add small amount to deal with numerical precision issues
    } else {
      dist.cluster.cells <- 0
      cluster.diameter <- 0
    }
    dist2centre <- apply(cells[, c("x","y","z")], 1, function(x) {dist(rbind(x, cluster.centre))})
    volume.cells <- cells[which(dist2centre<=cluster.diameter), c("x","y","z")] 
    n.cluster <- nrow(volume.cells)
    
    # Number of transduced cells in the cluster
    n.transduced.cluster <- nrow(cluster.cells)
  
    # Average pairwise distance between transduced cells in the cluster
    avg.dist.cluster.cells <- mean(dist.cluster.cells)
    
    # Average pairwise distance between all cells in the cluster
    if (nrow(volume.cells)>1) {
      avg.dist.volume.cells <- mean(dist(volume.cells))
    } else {
      avg.dist.volume.cells <- avg.dist.cluster.cells
    }    
  
    # Number of contacts between transduced cells in the cluster
    # Two cells are presumed in contact if they are less than 1 cell diameter (+ sd) apart 
    n.contacts.transduced <- length(which(dist.cluster.cells<=avg.cell.diameter+2*sd.cell.diameter))
    
    # Fraction of transduced cells in the cluster volume (in percent)
    f.transduced <- 100
    if (n.cluster>0) {
      f.transduced <- n.transduced.cluster/n.cluster * 100
    }
    
    # Does the cluster give rise to a tumor ?
    tumor <- if(sum(cluster.cells$tumor>0)) 1 else 0
    
    df <- data.frame(id = id, 
                     number.of.cells.in.organoid = n.organoid,
                     organoid.cell.density = cell.density,
                     number.of.transduced.cells.in.organoid = n.transduced.organoid, 
                     number.of.cells.in.a.cluster = n.cluster,
                     number.of.transduced.cells.in.a.cluster = n.transduced.cluster,
                     average.distance.between.cells.in.a.cluster = avg.dist.volume.cells,
                     average.distance.between.transduced.cells.in.a.cluster = avg.dist.cluster.cells,
                     fraction.of.transduced.cells.in.a.cluster = f.transduced,
                     number.of.contacts.between.transduced.cells.in.a.cluster = n.contacts.transduced,
                     tumor = tumor)
    
    clusters <- rbind(clusters, df)
    
  }
  org.tumor <- if(sum(cells$tumor>0)) 1 else 0
  organoid.features[i,] <- c(nrow(cells), cell.density, n, org.tumor)
}
 ..cutHeight not given, setting it to 74.3  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.
 ..cutHeight not given, setting it to 57.5  ===>  99% of the (truncated) height range in dendro.
 ..done.

 ..cutHeight not given, setting it to 76.9  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 69  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 67.3  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 75.9  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 67.6  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 65.3  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 47.3  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 31.7  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 52.4  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 22.6  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 61.7  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 25.1  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 58.5  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 52.6  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

 ..cutHeight not given, setting it to 57.4  ===>  99% of the (truncated) height range in dendro.
 ..done.
The lengths of the new labels is shorter than the number of leaves in the dendrogram - labels are recycled.

Logistic regression with model selection

In this approach, we build a model for each possible combination of features (i.e. 2**9 models for 9 features) and use the Akaike information criterion to select the best model and evaluate the contribution of features retained in this model. Because it is possible the AIC is subject to small variations and because models are considered indistinguishable if the difference in AIC is less than 2, we also look at variable importance over multiple/all models

library(glmulti)
Loading required package: rJava
JavaVM: requested Java version ((null)) not available. Using Java at "" instead.
JavaVM: Failed to load JVM: /bundle/Libraries/libserver.dylib
JavaVM FATAL: Failed to load the jvm library.
Error : .onLoad failed in loadNamespace() for 'glmulti', details:
  call: .jinit(parameters = "-XmX512m", force.init = TRUE)
  error: JNI_GetCreatedJavaVMs returned -1

Error: package or namespace load failed for ‘glmulti’

Logistic regression with elastic net regularization

This approach starts with all features in the model and tries to set to 0 as many of the coefficients as possible.

---
title: "Organoid tumors"
output: html_notebook
---

Random cells in organoids are transduced to express oncogenes. However, only some transduced cells proliferate and lead to tumor formation. Initial observations suggest that tumors originate from clusters of transduced cells. 

Goal: Find which features of transduced cell clusters best predict the formation of tumors.

For each cluster of transduced cells, the features to test are:
- number of cells in the organoid
- ratio of number of cells to organoid surface area (~cell density)
- number of transduced cells in the organoid
- number of cells in the cluster volume
- number of transduced cells in the cluster
- average pairwise distance between all cells in the cluster volume
- average pairwise distance between transduced cells in the cluster
- fraction of transduced cells in the cluster volume
- number of contacts between transduced cells in the cluster


#### Load the data
Data consists of two csv files per organoid at the start of the experiment:
- one file containing x,y,z coordinates and ID of all cells in the organoid,
- one file containing IDs of all transduced cells in the organoid with 1 in the second column indicating that the cluster the transduced cell belongs to will produce a tumor. Only one transduced cell per cluster is annotated.
```{r}

data.dir <- "organoids_tumors"

file.list <- list.files(data.dir, pattern = "Pos", full.names = TRUE)

organoids <- list()
for(i in 1:length(file.list)){
  # Read cells positions
  # The files have 2 lines above the header (hence the use of skip=2)
  # Be careful: sometimes a third empty line is at the top of the file and should be removed before reading here
  # Also read.csv() issues a warning if the file doesn't end with an empty line
  cells <- read.csv(file.list[[i]], header=TRUE, sep=",",  stringsAsFactors = FALSE, skip = 2)
  # Remove unnecessary columns
  cells <- cells[, -c(4:9)]
  # Rename columns
  colnames(cells) <- c("x","y","z","id")
  # Read labels
  annotation.file <- gsub("Pos", "Label", file.list[[i]])
  annotations <- read.csv(annotation.file, header=TRUE, sep=",", stringsAsFactors = FALSE)
  annotations <- cbind(annotations, 1)
  colnames(annotations) <- c("id", "tumor", "transduced")
  # Combine positions and annotations based on cell id
  cells <- merge(cells, annotations, by = "id", all.x = TRUE)
  cells[is.na(cells)] <- 0
  organoids[[i]] <- cells
}
```

#### Identify clusters of transduced cells and extract features
```{r}
library(dbscan)
library(car)
library(RColorBrewer)
library(pals)
library(dynamicTreeCut)
library(dendextend)

palette.cells <- rev(colorRampPalette(brewer.pal(11, "Spectral"))(3+1L))
palette.clusters <- alphabet()
palette.clusters[c("damson","violet")] <- palette.clusters[c("violet","damson")]

organoid.features <- data.frame(number.of.cells = numeric(),
                                cell.density = numeric(),
                                number.of.transduced.cells = numeric(), 
                                tumor = numeric())

clusters <- data.frame(id = character(), 
                       number.of.cells.in.organoid = numeric(),
                       organoid.cell.density = numeric(),
                       number.of.transduced.cells.in.organoid = numeric(), 
                       number.of.cells.in.a.cluster = numeric(),
                       number.of.transduced.cells.in.a.cluster = numeric(),
                       average.distance.between.cells.in.a.cluster = numeric(),
                       average.distance.between.transduced.cells.in.a.cluster = numeric(),
                       fraction.of.transduced.cells.in.a.cluster = numeric(),
                       number.of.contacts.between.transduced.cells.in.a.cluster = numeric(),
                       tumor = numeric())
for(i in 1:length(organoids)) {
  cells <- organoids[[i]]
  
  # Coordinates for some organoids seem to be in nm instead of um
  if (max(cells[,c("x","y","z")])>1000) {
    cells[,c("x","y","z")] <- cells[,c("x","y","z")]/1000
  }
  
  # Estimate average cell diameter as average distance between nearest neighbours
  NN.dist <- kNNdist(cells[,c("x","y","z")], k=1)
  avg.cell.diameter <- mean(NN.dist)
  sd.cell.diameter <- sd(NN.dist)
  
  # Remove outliers
  # Outliers are cells that have a very large distance to their nearest neighbour
  idx.outliers <- which(NN.dist>avg.cell.diameter+3*sd.cell.diameter)
  if(length(idx.outliers)>0) {
    cells <- cells[-idx.outliers,]
  }
  
  # Recompute values for the organoid
  NN.dist <- kNNdist(cells[,c("x","y","z")], k=1)
  avg.cell.diameter <- mean(NN.dist)
  sd.cell.diameter <- sd(NN.dist)
  
  # Estimate organoid diameter as distance between the two most distant cells
  org.diameter <- max(dist(cells[,c("x","y","z")]))
  # Cell density is ratio of number of cells to organoid surface area
  # Multiply by 1000 to bring on the same scale as other features
  cell.density <- nrow(cells)/(4*pi*org.diameter^2) * 1000
  
  # # Show positions of cells in organoid
  # scatter3d(cells$x,cells$y,cells$z, surface = FALSE, sphere.size = 3, surface.col = palette.cells, col = palette.cells, xlab="x",ylab="y",zlab="z", axis.scales = FALSE,  axis.col = c("black", "black", "black"), groups = as.factor(cells$transduced+cells$tumor))
  
  # Identify clusters of transduced cells
  transduced.cells <- cells[which(cells$transduced == 1),]
  D <- dist(transduced.cells[,c("x","y","z")])
  n <- nrow(transduced.cells)
  if(n>2) {
    hc <- hclust(D, method = "complete")
    cluster.membership <- cutreeHybrid(hc, minClusterSize = 1, distM = as.matrix(D))
    transduced.cells$cluster <- cluster.membership$labels
    # Show clusters on dendrogram
    dend <- as.dendrogram(hc)
    dend %>% set("leaves_pch", 19-10*transduced.cells$tumor[order.dendrogram(dend)]) %>%  # node type
      set("leaves_cex", 2) %>%  # node size
      set("leaves_col", palette.clusters[transduced.cells$cluster[order.dendrogram(dend)]+1]) %>% # leaves colors
      set("labels", "") %>%
    plot(main = paste0("Clusters of transduced cells in organoid ",i))
    
  } else if(n==2) {
    if(D<avg.cell.diameter+2*sd.cell.diameter) { # Form one cluster
      transduced.cells$cluster <- 1
    } else { # Form two clusters
      transduced.cells$cluster <- c(1,2)
    }
  } else if(n==1) {
    transduced.cells$cluster <- 1
  } else {
    print(paste0("Organoid ",i," has no transduced cell."))
  }
  # Extract cluster features
  for(j in 1:max(transduced.cells$cluster)) {
    cluster.cells <- transduced.cells[transduced.cells$cluster==j,]
    id <- paste0(i,".",j)
    
    # Number of cells in the organoid
    n.organoid <- nrow(cells)
    
    # Number of transduced cells in the organoid
    n.transduced.organoid <- nrow(transduced.cells)
    
    # Number of cells in the cluster volume
    # The cluster volume is defined as the sphere centrered at the centre of mass of the cluster
    # with diameter equal to the distance between the two farthest transduced cells of the cluster
    cluster.centre <- apply(cluster.cells[, c("x","y","z")], 2, mean)
    if (nrow(cluster.cells)>1) {
      dist.cluster.cells <- dist(cluster.cells[, c("x","y","z")])
      cluster.diameter <- max(dist.cluster.cells) + 1e-6 # Add small amount to deal with numerical precision issues
    } else {
      dist.cluster.cells <- 0
      cluster.diameter <- 0
    }
    dist2centre <- apply(cells[, c("x","y","z")], 1, function(x) {dist(rbind(x, cluster.centre))})
    volume.cells <- cells[which(dist2centre<=cluster.diameter), c("x","y","z")] 
    n.cluster <- nrow(volume.cells)
    
    # Number of transduced cells in the cluster
    n.transduced.cluster <- nrow(cluster.cells)
  
    # Average pairwise distance between transduced cells in the cluster
    avg.dist.cluster.cells <- mean(dist.cluster.cells)
    
    # Average pairwise distance between all cells in the cluster
    if (nrow(volume.cells)>1) {
      avg.dist.volume.cells <- mean(dist(volume.cells))
    } else {
      avg.dist.volume.cells <- avg.dist.cluster.cells
    }    
  
    # Number of contacts between transduced cells in the cluster
    # Two cells are presumed in contact if they are less than 1 cell diameter (+ sd) apart 
    n.contacts.transduced <- length(which(dist.cluster.cells<=avg.cell.diameter+2*sd.cell.diameter))
    
    # Fraction of transduced cells in the cluster volume (in percent)
    f.transduced <- 100
    if (n.cluster>0) {
      f.transduced <- n.transduced.cluster/n.cluster * 100
    }
    
    # Does the cluster give rise to a tumor ?
    tumor <- if(sum(cluster.cells$tumor>0)) 1 else 0
    
    df <- data.frame(id = id, 
                     number.of.cells.in.organoid = n.organoid,
                     organoid.cell.density = cell.density,
                     number.of.transduced.cells.in.organoid = n.transduced.organoid, 
                     number.of.cells.in.a.cluster = n.cluster,
                     number.of.transduced.cells.in.a.cluster = n.transduced.cluster,
                     average.distance.between.cells.in.a.cluster = avg.dist.volume.cells,
                     average.distance.between.transduced.cells.in.a.cluster = avg.dist.cluster.cells,
                     fraction.of.transduced.cells.in.a.cluster = f.transduced,
                     number.of.contacts.between.transduced.cells.in.a.cluster = n.contacts.transduced,
                     tumor = tumor)
    
    clusters <- rbind(clusters, df)
    
  }
  org.tumor <- if(sum(cells$tumor>0)) 1 else 0
  organoid.features[i,] <- c(nrow(cells), cell.density, n, org.tumor)
}

```

#### Logistic regression with model selection
In this approach, we build a model for each possible combination of features (i.e. 2**9 models for 9 features) and use the Akaike information criterion to select the best model and evaluate the contribution of features retained in this model.
Because it is possible the AIC is subject to small variations and because models are considered indistinguishable if the difference in AIC is less than 2, we also look at variable importance over multiple/all models
```{r}
library(glmulti)
library(sjPlot)

# Automatically select best model(s)
models <- glmulti(tumor ~ ., data = clusters[,-1], # Remove ID column
                  intercept = TRUE,
                  level = 1,                # Don't consider pairwise interactions
                  method = "h",             # Explore all set of candidates
                  crit = "aicc",            # AIC as criteria (with correction for small samples)
                  confsetsize = 2**9,       # Number of models to keep.
                  plotty = F, report = F,   # No plot or interim reports
                  fitfunction = "glm",      # Use glm() function
                  family = binomial)        # binomial family for logistic regression

# Plot the AICc values
# The red line indicates a conventional cut-off for model selection
# (i.e. less than 2 AIC units from the best model)
plot(models)

# Get model-averaged estimates of feature weights for the top models within 3 IC unit of the best
coef(models, select = 3.0)

# Plot relative importance of the features taking all of the models into consideration.
# For each feature, this is the sum of the weights/probabilities for the models in which the feature appears.
# The vertical red line at .8 is an often used cutoff to differentiate between important and not important variables.
par(mar=c(5,12,4,1)+.1) # Change left margin size to write long variable names
plot(models, type = "s", las = 1)

# # Show second best model
# plot_model(models@objects[[2]], sort.est = TRUE, show.values = TRUE, value.offset = .3, vline.color = "grey")
# print(summary(models@objects[[2]]))

# Get best model
best.fit <- models@objects[[1]]
plot_model(best.fit, sort.est = TRUE, show.values = TRUE, value.offset = .3, vline.color = "grey")
print(summary(best.fit))


```

#### Logistic regression with elastic net regularization
This approach starts with all features in the model and tries to set to 0 as many of the coefficients as possible.
```{r}
library(glmnet)

set.seed(20181204)

features <- as.matrix(subset(clusters, select = -c(id, tumor)))
# alpha = 1 --> LASSO
model <- glmnet(features, factor(clusters$tumor), family = "binomial", alpha = 1)

# Plot evolution of the coefficients as a function of regularization.
# A small L1 norm means a strong regularization, with 0 corresponding to an empty model.
# As regularization is relaxed (increase in the L1 norm), more features enter the model.
# Therefore more important features can be seen as entering the model early.
par(mar=c(4.5,4.5,2,8)) # Increase right margin to write variable names
plot(model)
coeff.paths <- coef(model)
last.values <- coeff.paths[-1,ncol(coeff.paths)] # remove the intercept, and keep the coefficients at the end of the path
axis(4, at = last.values, line = -.5, label = names(last.values), las = 1, tick = FALSE, cex.axis = 0.5) 

# Get feature coefficients with cross-validation
model <- cv.glmnet(features, factor(clusters$tumor), family = "binomial", type = "deviance", nfolds = 5)
plot(model)
print(coef(model, s = "lambda.min"))

```

