# Clustered hypermethylated genes analysis
## Data
# Gene list of the 229 genes hypermethylated in the more aggressive cell lines.
# `header` is spelled out in full (no partial argument matching) and TRUE is
# used instead of the reassignable shortcut T.
data <- read.table("gene_new.txt2", header = TRUE)
data$Strand <- as.factor(data$Strand)
summary(data)


## Computation functions
# Count the runs of 1s in a vector of 0/1 flags.
#
# A run is counted when it is closed by a transition into 0 (an element that
# differs from its predecessor and equals 0), or when the vector ends on a 1.
#
# vc: vector of 0/1 flags; NA is rejected.
# Returns the number of runs (0 for an empty vector).
nbsuite <- function(vc) {
  n <- length(vc)
  if (n == 0) {
    return(0)
  }
  if (anyNA(vc)) {
    stop("nbsuite: 'vc' must not contain NA", call. = FALSE)
  }
  # vc[-1] is "current", vc[-n] is "previous"; both are empty when n == 1,
  # so a single-element vector contributes no transition.
  closed_runs <- sum(vc[-1] != vc[-n] & vc[-1] == 0)
  closed_runs + as.numeric(vc[n] == 1)
}


# Permutation runs test for clustering of "hyper" genes on one chromosome.
#
# The rows of `x` for chromosome `chr` are sorted by Start and cut into
# consecutive windows of `lnb` genes. A window is flagged 1 when at least one
# of its genes has hyper == "hyper", 0 otherwise. The number of runs of
# flagged windows (see nbsuite) is compared with the run counts obtained from
# `boot` random permutations of the window flags.
#
# x:    data frame with columns Chromosome, Start and hyper.
# chr:  chromosome label to analyse.
# lnb:  number of genes per window.
# boot: number of permutations.
# Prints a short report and returns the fraction of permutations whose run
# count is <= the observed one. Returns NA (with a warning) when the
# chromosome has too few genes to build a single window.
auto <- function(x, chr, lnb, boot) {
  sub <- x[x$Chromosome == chr, ]
  sub <- sub[order(sub$Start), ]
  nbmx <- nrow(sub)

  # seq(1, nbmx - lnb, by = lnb) fails when its upper bound is below 1.
  if (nbmx - lnb < 1) {
    warning("auto: chromosome ", chr, " has too few genes (", nbmx,
            ") for windows of ", lnb, call. = FALSE)
    return(NA_real_)
  }

  # NOTE(review): with the upper bound nbmx - lnb the trailing gene(s), even a
  # complete last window, are left out. Kept as in the original analysis;
  # confirm whether nbmx - lnb + 1 was intended.
  sq <- seq(1, nbmx - lnb, by = lnb)

  # Windows start at 1 and are contiguous, so window k is row k of a matrix
  # filled by row with lnb columns.
  covered <- sub$hyper[seq_len(length(sq) * lnb)] == "hyper"
  if (anyNA(covered)) {
    stop("auto: missing 'hyper' value on chromosome ", chr, call. = FALSE)
  }
  vc <- as.numeric(rowSums(matrix(covered, ncol = lnb, byrow = TRUE)) > 0)

  # Observed run count (the original called an undefined NSbsuite() here and
  # never assigned NS).
  NS <- nbsuite(vc)
  results <- replicate(boot, nbsuite(sample(vc)))
  p_value <- sum(results <= NS) / length(results)

  # Class counts come from the function's own arguments, not from the globals
  # `data` and `cc`.
  counts <- table(sub$hyper)

  print("####################")
  print(paste("chr:", chr))
  print(paste("Prop hypo:", counts[2] / counts[1]))
  print(paste("long. paquet:", lnb))
  print(paste("nbr suite:", NS))
  print(paste("nbr 1:", sum(vc)))
  print(paste("p-value:", p_value))
  return(p_value)
}





## Run the analysis
# For every chromosome, run the test with windows ("paquets") of
# 1, 2, 5, 10 and 20 genes, 1000 permutations each.
l <- c(1:22, "X", "Y")
for (cc in l) {
  for (paquet in c(1, 2, 5, 10, 20)) {
    auto(data, cc, paquet, 1000)
  }
}
















# Print the runs of consecutive "hyper" genes on one chromosome.
#
# The rows of `x` for chromosome `chr` are sorted by Start. Every gene that
# is "hyper" and belongs to a run of at least two consecutive "hyper" genes
# is printed as "<id>   <rank>   <Start>"; "fin_cluster" is printed after the
# last gene of each run.
#
# x:   data frame with columns Chromosome, Start, hyper and id.
# chr: chromosome label to scan.
# Returns NULL invisibly; the function is called for its printed output.
clusters <- function(x, chr) {
  sub <- x[x$Chromosome == chr, ]
  sub <- sub[order(sub$Start), ]
  n <- nrow(sub)

  report <- function(ii) {
    print(paste(sub$id[ii], " ", ii, " ", sub$Start[ii]))
  }

  in_cluster <- FALSE
  # seq_len() gives an empty loop for 0 or 1 gene, where seq(1, n - 1, by = 1)
  # would fail.
  for (ii in seq_len(max(n - 1, 0))) {
    if (sub$hyper[ii] != "hyper") {
      next
    }
    if (sub$hyper[ii + 1] == "hyper") {
      in_cluster <- TRUE
      report(ii)
    } else if (in_cluster) {
      in_cluster <- FALSE
      report(ii)
      print("fin_cluster")
    }
  }

  # A run that reaches the last gene is never closed inside the loop:
  # print its last member and the end marker here.
  if (in_cluster) {
    report(n)
    print("fin_cluster")
  }
  invisible(NULL)
}
# Print the runs of consecutive "hyper" genes found on chromosome 6.
clusters(data,6)





# Score.OVS
# Survival analysis by score group.
# Data file: Scores-OVS.csv
# Console output pasted from the original session is kept below as comments
# so that this section can be parsed and re-run.
library(survival)
library(ggplot2)
library(survminer)
library(precrec)
library(PRROC)

m <- read.csv2("~/Scores-OVS.csv")
mdata <- as.data.frame(m)

# Survival curves stratified by accord.to.score
data.surv <- with(mdata, Surv(OVS.month, Status))
fit <- survfit(data.surv ~ accord.to.score, data = mdata)
ggsurvplot(fit, data = mdata, conf.int = TRUE, pval = TRUE)
ggsurvplot(fit, data = mdata, palette = c("blue", "red"), conf.int = FALSE,
           pval = TRUE, risk.table = TRUE)

# Log-rank test
survdiff(data.surv ~ accord.to.score, data = mdata)
# Recorded output:
# Call:
#   survdiff(formula = data.surv ~ accord.to.score, data = mdata)
#
# N Observed Expected (O-E)^2/E (O-E)^2/V
# accord.to.score=0 36       23     29.5      1.43      11.4
# accord.to.score=1 13       11      4.5      9.40      11.4
#
# Chisq= 11.4  on 1 degrees of freedom, p= 7e-04

# Wilcoxon test (survdiff with rho = 1)
survdiff(data.surv ~ accord.to.score, data = mdata, rho = 1)
# Recorded output:
# Call:
#   survdiff(formula = data.surv ~ accord.to.score, data = mdata,
#            rho = 1)
#
# N Observed Expected (O-E)^2/E (O-E)^2/V
# accord.to.score=0 36    13.91    18.88      1.31      11.5
# accord.to.score=1 13     8.33     3.36      7.36      11.5
#
# Chisq= 11.5  on 1 degrees of freedom, p= 7e-04

# Cox model, with cox.zph() diagnostics and a forest plot
coxfit <- coxph(data.surv ~ accord.to.score, data = mdata)
ftest <- cox.zph(coxfit)
ggcoxzph(ftest)
ggforest(coxfit, data = mdata)

# precrec evaluation of score3 against Status (default mode, then "basic")
precrec_obj <- evalmod(scores = mdata$score3, labels = mdata$Status)
autoplot(precrec_obj)
precrec_obj2 <- evalmod(scores = mdata$score3, labels = mdata$Status,
                        mode = "basic")
autoplot(precrec_obj2)

# PRROC ROC curve for the same score / status pair
PRROC_obj <- roc.curve(scores.class0 = mdata$score3,
                       weights.class0 = mdata$Status, curve = TRUE)
plot(PRROC_obj)

# NOTE(review): scores and labels are swapped here compared with the calls
# above (Status as score, score3 as label); presumably exploratory, confirm
# it is intended.
precrec_objinv <- evalmod(scores = mdata$Status, labels = mdata$score3)
autoplot(precrec_objinv)

# Analysis Score_survival_breslow
# Linear regressions of Breslow and score3 on survival time.
# Console output pasted from the original session is kept below as comments.
library(ggplot2)
d <- read.csv2("~/Scores-OVS.csv")
data <- as.data.frame(d)
# NOTE(review): `data$OVS` presumably resolves to the OVS.month column through
# `$` partial matching; confirm and spell the column name out in full.
x <- data$OVS
y1 <- data$Breslow
y2 <- data$score3
plot(x = data$OVS.month, y = data$Breslow)
plot(x = data$OVS.month, y = data$score3)

# Breslow against survival, with the regression line; the model is fitted
# once and reused for the line and the summary.
plot(x, y1)
lm_breslow <- lm(y1 ~ x)
abline(lm_breslow)
summary(lm_breslow)
# Recorded output:
# Call:
#   lm(formula = y1 ~ x)
#
# Residuals:
#   Min     1Q Median     3Q    Max
# -9.811 -4.196 -2.480  0.483 73.195
#
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)
# (Intercept) 10.83993    2.56767   4.222  0.00012 ***
#   x           -0.07148    0.04309  -1.659  0.10429
# ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 12.14 on 44 degrees of freedom
# (3 observations deleted due to missingness)
# Multiple R-squared:  0.05885,	Adjusted R-squared:  0.03746
# F-statistic: 2.751 on 1 and 44 DF,  p-value: 0.1043

# score3 against survival, with the regression line
plot(x, y2)
lm_score3 <- lm(y2 ~ x)
abline(lm_score3)
summary(lm_score3)
# Recorded output:
# Call:
#   lm(formula = y2 ~ x)
#
# Residuals:
#   Min      1Q  Median      3Q     Max
# -1.4279 -0.5715 -0.2126  0.5505  2.5916
#
# Coefficients:
#   Estimate Std. Error t value Pr(>|t|)
# (Intercept)  1.472243   0.184160   7.994  2.6e-10 ***
#   x           -0.009126   0.003188  -2.863  0.00626 **
#   ---
#   Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 0.9178 on 47 degrees of freedom
# Multiple R-squared:  0.1485,	Adjusted R-squared:  0.1303
# F-statistic: 8.194 on 1 and 47 DF,  p-value: 0.006258


ggplot(data, aes(x, y1)) +
  geom_point() +
  geom_smooth(method = lm)

ggplot(data, aes(x, y2)) +
  geom_point() +
  geom_smooth(method = lm)
# Correlation between score, survival and Breslow
data <- read.csv2("~/Scores-OVS.csv")
# Columns 2 to 4 are the ones used below (OVS.month, score3, Breslow).
base <- data[, 2:4]
cor(base)
cor.test(base$OVS.month, base$score3, method = "pearson")
plot(base$OVS.month, base$score3)
plot(base$OVS.month, base$Breslow)
cor.test(base$OVS.month, base$Breslow, method = "pearson")
table(base$score3)

#Independence of the gene methylation
## Reformatting of the Excel file into a CSV file
The output file is `Scores_0320_lp.csv`

## R simulation

```
# Load the per-sample gene scores. Console output pasted from the original
# session is kept as comments.
data <- read.csv2("~/Scores 0320_lpac1.csv")
dim(data)
# [1] 49 16
summary(data)
# sample      PCDHB16       PCDHB15            MYH1           BCL2L10     
# 12967  : 1   Min.   :0.0   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
# 12969  : 1   1st Qu.:0.0   1st Qu.:0.2500   1st Qu.:0.0000   1st Qu.:0.000  
# 12972.1: 1   Median :0.5   Median :1.0000   Median :1.0000   Median :1.000  
# 12973  : 1   Mean   :0.5   Mean   :0.7391   Mean   :0.5102   Mean   :0.561  
# ARNCO  : 1   3rd Qu.:1.0   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.000  
# ARQHU  : 1   Max.   :1.0   Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
# (Other):43   NA's   :7     NA's   :3                         NA's   :8      
#      Score1        PCDHB16.1         PCDHB15.1           MYH1.1       
#  Min.   :0.000   Min.   :-1.0000   Min.   :-1.0000   Min.   :-1.0000  
#  1st Qu.:1.000   1st Qu.:-1.0000   1st Qu.:-0.7500   1st Qu.:-1.0000  
#  Median :2.000   Median :-0.5000   Median : 1.0000   Median :-0.5000  
#  Mean   :2.102   Mean   :-0.2381   Mean   : 0.3478   Mean   :-0.3333  
#  3rd Qu.:3.000   3rd Qu.: 0.7500   3rd Qu.: 1.0000   3rd Qu.: 0.0000  
#  Max.   :4.000   Max.   : 1.0000   Max.   : 1.0000   Max.   : 1.0000  
#                  NA's   :7         NA's   :3         NA's   :1        
# BCL2L10.1          score2          PCDHB16.2        PCDHB15.2     
# Min.   :-1.000   Min.   :-3.0000   Min.   :0.0000   Min.   :0.0000  
# 1st Qu.:-1.000   1st Qu.:-1.0000   1st Qu.:0.0000   1st Qu.:0.0000  
# Median : 0.000   Median :-1.0000   Median :0.0000   Median :1.0000  
# Mean   :-0.122   Mean   :-0.3061   Mean   :0.2558   Mean   :0.6087  
# 3rd Qu.: 0.000   3rd Qu.: 1.0000   3rd Qu.:0.5000   3rd Qu.:1.0000  
# Max.   : 1.000   Max.   : 4.0000   Max.   :1.0000   Max.   :1.0000  
# NA's   :8                          NA's   :6        NA's   :3       
#      MYH1.2         BCL2L10.2          score3     
#  Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
#  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
#  Median :0.0000   Median :0.0000   Median :1.000  
#  Mean   :0.1633   Mean   :0.1707   Mean   :1.102  
#  3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:2.000  
#  Max.   :1.0000   Max.   :1.0000   Max.   :4.000  
#                   NA's   :8                       



########################################################################
# Permutation simulation of the distribution of a four-gene score.
#
# Each draw shuffles the four vectors independently with sample(), sums them
# row-wise (NA ignored through na.rm) and tabulates the sums. One draw is made
# up front and N more in the loop, so the result has N + 1 rows.
#
# a, b, c, d: score vectors, expected to have the same length.
# N:          number of draws made in the loop (default 100).
# verbose:    print the table of every loop draw, as the function has always
#             done (default TRUE); pass FALSE to silence it.
# Returns the tables bound row-wise by dplyr::bind_rows(): one row per draw,
# one column per distinct sum, NA where a sum did not occur in that draw.
simul <- function(a, b, c, d, N = 100, verbose = TRUE) {
  # Kept from the original: attaches dplyr as a side effect.
  library(dplyr)

  draw_table <- function() {
    # Same shuffling order as the original (a, b, c, d) so that a given RNG
    # seed produces the same draws.
    as <- sample(a)
    bs <- sample(b)
    cs <- sample(c)
    ds <- sample(d)
    table(rowSums(cbind(as, bs, cs, ds), na.rm = TRUE))
  }

  # Collect the draws in a preallocated list and bind once at the end,
  # instead of growing the result with bind_rows() on every iteration.
  draws <- vector("list", N + 1)
  draws[[1]] <- draw_table()
  for (ii in seq_len(N)) {
    tab <- draw_table()
    if (verbose) {
      print(tab)
    }
    draws[[ii + 1]] <- tab
  }
  dplyr::bind_rows(draws)
}



########################################################################
# Score1: compare the observed score distribution with the one simulated
# under independent shuffling of the four genes.
y <- simul(data$PCDHB16, data$PCDHB15, data$MYH1, data$BCL2L10, N = 10000)
y[is.na(y)] <- 0
sim_mean <- apply(y, 2, mean)
sim_mean
sum(sim_mean)
p <- sim_mean / sum(sim_mean)
# simul() names its columns in order of first appearance, so put the expected
# proportions in numeric score order before pairing them with the observed
# counts (the other score sections sort p as well).
p <- p[order(as.numeric(names(p)))]
# Align the observed counts with p by score label; a score that was simulated
# but never observed gets a count of 0.
obs <- table(data$Score1)
stopifnot(all(names(obs) %in% names(p)))
x1 <- setNames(numeric(length(p)), names(p))
x1[names(obs)] <- as.vector(obs)
chisq.test(x1, p = p)
table(y[, 2])
# NOTE(review): these two-sample KS tests compare a vector of counts with a
# vector of proportions; confirm that this is the intended comparison.
ks.test(table(data$Score1), p)
ks.test(table(data$score3), p)
# Recorded output from the original session (produced before p was sorted):
# Two-sample Kolmogorov-Smirnov test
#
# data:  table(data$score3) and p
# D = 1, p-value = 0.007937
# alternative hypothesis: two-sided
#
# Chi-squared test for given probabilities
#
# data:  table(data$Score1)
# X-squared = 6.2447, df = 4, p-value = 0.1816
#
# Warning message:
#   In chisq.test(table(data$Score1), p = p) :
#   Chi-squared approximation may be incorrect

########################################################################
# score2: compare the observed score distribution with the one simulated
# under independent shuffling of the four genes.
y <- simul(data$PCDHB16.1, data$PCDHB15.1, data$MYH1.1, data$BCL2L10.1, N = 10000)
y[is.na(y)] <- 0
sim_mean <- apply(y, 2, mean)
sim_mean
sum(sim_mean)
p <- sim_mean / sum(sim_mean)
# Order the expected proportions by numeric score. Sorting the labels as
# strings depends on the locale's collation for negative values, and the
# hand-built count vector used previously hard-coded one such ordering.
p <- p[order(as.numeric(names(p)))]
# Align the observed counts with p by score label; a score that was simulated
# but never observed gets a count of 0.
obs <- table(data$score2)
stopifnot(all(names(obs) %in% names(p)))
x <- setNames(numeric(length(p)), names(p))
x[names(obs)] <- as.vector(obs)
chisq.test(x, p = p)
# Recorded output from the original session:
# > chisq.test(x, p=p)
#
# Chi-squared test for given probabilities
#
# data:  x
# X-squared = 29.175, df = 8, p-value = 0.0002954

########################################################################
# score3: compare the observed score distribution with the one simulated
# under independent shuffling of the four genes.
y <- simul(data$PCDHB16.2, data$PCDHB15.2, data$MYH1.2, data$BCL2L10.2, N = 10000)
y[is.na(y)] <- 0
sim_mean <- apply(y, 2, mean)
sim_mean
sum(sim_mean)
p <- sim_mean / sum(sim_mean)
# Order the expected proportions by numeric score rather than by
# locale-dependent string collation.
p <- p[order(as.numeric(names(p)))]
# Align the observed counts with p by score label; a score that was simulated
# but never observed gets a count of 0. `obs` replaces the former `t`, which
# shadowed base::t().
obs <- table(data$score3)
stopifnot(all(names(obs) %in% names(p)))
x3 <- setNames(numeric(length(p)), names(p))
x3[names(obs)] <- as.vector(obs)
# The call below was stored with a console prompt ("> ") in front of it and
# could not be run as written.
chisq.test(x3, p = p)
# Recorded output from the original session:
# > chisq.test(t, p=p)
#
# Chi-squared test for given probabilities
#
# data:  t
# X-squared = 8.0336, df = 4, p-value = 0.09035
#
# Warning message:
#   In chisq.test(t, p = p) : Chi-squared approximation may be incorrect


```
