## This script analyses the evolution of the M4 gene module.
## Load the helper functions and data that are needed for the analysis.
#biocLite("WGCNA")
source("SubFunction_all.R")
source("LoadRNAseqData.R")
source("LoadAnnotation.R")

## Expression profile of the M4 module; the first column of the file becomes
## the row names.
M4.gene.expression <- read.table(
  file = "OutputData/data.all.count.FPKM.M4.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

## Expression table of the conserved genes; only its row names are used below.
conserved.gene.expression <- read.table(
  file = "OutputData/data.gene.conserved.expression.txt",
  header = TRUE,
  row.names = 1
)
gene.conserved <- row.names(conserved.gene.expression)

## Row count of the FPKM table. data.count.FPKM.WWFAC is not created in this
## file -- presumably by one of the sourced scripts; TODO confirm.
nrow(data.count.FPKM.WWFAC)

## Conserved genes that are also part of the M4 module, and their number.
M4.conserved <- intersect(gene.conserved, row.names(M4.gene.expression))
length(M4.conserved)

##write the output file (optional);
## this is used for finding enriched motifs
#write.table(data.count.FPKM.WWFAC,file="InputData/NIATTr2.FPKM.WWFAC.txt",sep="\t",col.names=T, quote =F);
#write.table(data.count.FPKM.WWFAC[!row.names(data.count.FPKM.WWFAC) %in% M4.gene,],file="InputData/NIATTr2.FPKM.WWFAC.neg.txt",sep="\t",col.names=T, quote =F);

## Load the gene duplication table; the first column becomes the row names.
data.duplications <- read.table(
  file = "InputData/NIATTr2.4.duplications.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)
dim(data.duplications)

head(data.duplications$BootstrapValue)

## Keep only high-confidence records:
##  - genes labelled 'singleton' or 'not duplicated' are kept as they are;
##  - every other record needs a bootstrap value (not '---') of at least 0.9.
data.duplications.nodup <- data.duplications[
  with(
    data.duplications,
    DuplicationTime == "singleton" | DuplicationTime == "not duplicated"
  ),
]
data.duplications.dup <- data.duplications[
  data.duplications$BootstrapValue != "---",
]

## The column was read as text because of the '---' marker; convert it to
## numbers before thresholding.
data.duplications.dup$BootstrapValue <- as.numeric(
  as.character(data.duplications.dup$BootstrapValue)
)

data.duplications.dup <- data.duplications.dup[
  data.duplications.dup$BootstrapValue >= 0.9,
]
head(data.duplications.dup)
dim(data.duplications.dup)
dim(data.duplications.nodup)
head(data.duplications.nodup)

## Recombine both subsets, then keep only genes present in the leaf FPKM table
## to reduce the bias that results from expression-based selection.
data.duplications <- rbind(data.duplications.nodup, data.duplications.dup)

data.duplications.LeafExpressed <- data.duplications[
  intersect(row.names(data.duplications), row.names(data.count.FPKM.WWFAC)),
]
nrow(data.duplications.LeafExpressed)
head(data.duplications.LeafExpressed)
head(data.duplications.LeafExpressed)
dim(data.duplications)


############################################
## analyze the enrichment of duplicated genes
############################################

## M4 module genes and their records in data.duplications.LeafExpressed.
M4.gene <- row.names(M4.gene.expression)
data.duplications.induced <- data.duplications.LeafExpressed[
  intersect(row.names(data.duplications.LeafExpressed), M4.gene),
]
head(data.duplications.induced)

## DuplicationTime classes that stand for "no duplication". Counts for these
## classes are looked up by name instead of being typed in as literals, so the
## figures below follow the data that is actually loaded.
nodup.classes <- c("not duplicated", "singleton")

## count genes per duplication class in the M4 module:
tab.induced <- table(data.duplications.induced$DuplicationTime)
tab.induced
## fraction of M4 genes involved in duplications;
(sum(tab.induced) - sum(tab.induced[nodup.classes])) / sum(tab.induced)

tab.induced / sum(tab.induced)
##
## values recorded by the original author:
## not duplicated: 0.195;
## singleton: 0.014;


## calculate genome-wide pattern;
tab.leaf <- table(data.duplications.LeafExpressed[, "DuplicationTime"])
tab.leaf
## total number of genes involved in at least one duplication event:
sum(tab.leaf) - sum(tab.leaf[nodup.classes])

## calculate M4 module pattern;
tab.induced
sum(tab.induced)

## calculate odds ratio and p-value for duplicated genes:
## odds = duplicated / not duplicated, genome-wide and within M4.
odds.dup.genome <- (sum(tab.leaf) - sum(tab.leaf[nodup.classes])) /
  sum(tab.leaf[nodup.classes])
odds.dup.genome
## recorded output = 1.96
odds.dup.M4 <- (sum(tab.induced) - sum(tab.induced[nodup.classes])) /
  sum(tab.induced[nodup.classes])
odds.dup.M4
## recorded output = 3.87
## Derived from the unrounded odds (previously the rounded literals 3.87/1.96).
odd.dup <- odds.dup.M4 / odds.dup.genome


## calculate likelihood for genome-wide pattern: fraction of duplicated genes.
p <- (sum(tab.leaf) - sum(tab.leaf[nodup.classes])) / sum(tab.leaf)

## NOTE(review): 906 and 1140 are literal counts; they look like the number of
## duplicated M4 genes and the M4 total from an earlier run -- confirm they
## still match tab.induced.
binom.test(x = 906, n = 1140, p = p)

## calculate odds ratio and p-value for Sol WGD genes and Nicotiana lineage
## duplicated (NLD) genes. All counts below are literals.
## NOTE(review): the NLD lines use n = 1175 while the others use 1140 -- confirm.
odd.sol <- (587 / (1140 - 587)) / (6181 / (14642 - 6181))
odd.NLD <- (87 / (1175 - 87)) / (1249 / (14642 - 1249))

p.sol <- 6181 / 14642
binom.test(x = 587, n = 1140, p = p.sol)

p.nld <- 1249 / 14642
binom.test(x = 87, n = 1175, p = p.nld)

## calculate odds and p-value for conserved genes in M4 module;
data.M4.conserved <- read.table(
  file = "OutputData/data.gene.conserved.expression.txt",
  header = TRUE,
  row.names = 1
)
data.M4.conserved.gene <- row.names(data.M4.conserved)
data.M4.conserved.gene <- intersect(M4.gene, data.M4.conserved.gene)
length(data.M4.conserved.gene)
## Genes missing from data.duplications.LeafExpressed give all-NA rows here;
## table() leaves NA out of the counts.
data.duplications.conserved <- data.duplications.LeafExpressed[data.M4.conserved.gene, ]
table(data.duplications.conserved$DuplicationTime)
sum(table(data.duplications.conserved$DuplicationTime))
tab.leaf
#14695-683-4268
#708+594
odd.cons.dup <- (561 / (692 - 561)) / (9691 / (14642 - 9691))
odd.cons.sol <- (355 / (692 - 355)) / (6181 / (14642 - 6181))
odd.cons.NLD <- (65 / (692 - 65)) / (1249 / (14642 - 1249))

p.dup <- 9691 / 14642
binom.test(x = 561, n = 692, p = p.dup)

## Each test uses the probability defined directly above it (same values as
## p.sol / p.nld, but no longer dependent on those earlier variables).
p.cons.sol <- 6181 / 14642
binom.test(x = 355, n = 692, p = p.cons.sol)

p.cons.nld <- 1249 / 14642
binom.test(x = 65, n = 692, p = p.cons.nld)


############################################
## analyze the preferential gene retention;
############################################
## select gene pairs resulting from Sol. duplications.
head(data.duplications.LeafExpressed)
dim(data.duplications.LeafExpressed)

## Keep genes with exactly one listed copy: drop rows whose DuplicatedCopies
## contains "---" and rows that list several copies (contain ",").
## grepl() replaces the former x[-grep(...), ]: with no match, grep() returns
## integer(0) and negative indexing with it drops EVERY row instead of none.
data.duplications.LeafExpressed.pair <- data.duplications.LeafExpressed[
  !grepl("---", as.character(data.duplications.LeafExpressed$DuplicatedCopies)),
]
data.duplications.LeafExpressed.pair <- data.duplications.LeafExpressed.pair[
  !grepl(",", as.character(data.duplications.LeafExpressed.pair$DuplicatedCopies)),
]
## Bootstrap values may still be stored as text/factor; convert, then keep
## the well-supported pairs only.
data.duplications.LeafExpressed.pair$BootstrapValue <- as.numeric(
  as.character(data.duplications.LeafExpressed.pair$BootstrapValue)
)
data.duplications.LeafExpressed.pair <- data.duplications.LeafExpressed.pair[
  data.duplications.LeafExpressed.pair$BootstrapValue >= 0.90,
]

head(data.duplications.LeafExpressed.pair)
data.duplications.LeafExpressed.pair.SOL <- data.duplications.LeafExpressed.pair[
  data.duplications.LeafExpressed.pair$DuplicationTime == "shared among Solanaceae",
]
head(data.duplications.LeafExpressed.pair.SOL)
## NOTE(review): every other statement reads the copy from `DuplicatedCopies`;
## if the table has no `gene.copies` column this `$` lookup yields NULL and
## only the row names are counted -- confirm the column name.
length(unique(c(
  data.duplications.LeafExpressed.pair.SOL$gene.copies,
  row.names(data.duplications.LeafExpressed.pair.SOL)
)))

dim(data.duplications.LeafExpressed.pair.SOL)

## remove redundancy: a pair listed as A-B and as B-A is kept once.
df <- data.duplications.LeafExpressed.pair.SOL
head(df)
df$Name <- row.names(df)
## Columns 4 and 5 are taken by position -- presumably the copy id and the
## Name column just added; TODO confirm against the table layout.
## Sorting each row makes A-B and B-A identical so duplicated() can spot them.
df.sort <- t(apply(df[c(4, 5)], 1, sort))
df <- df[!duplicated(df.sort), ]
data.duplications.LeafExpressed.pair <- df
nrow(data.duplications.LeafExpressed.pair) * 2
head(data.duplications.LeafExpressed.pair)
nrow(data.duplications.LeafExpressed.pair)

## filter out M4 module with same criteria;

data.duplications.LeafExpressed.duplicated <- data.duplications.LeafExpressed[
  data.duplications.LeafExpressed$DuplicationTime != "not duplicated" &
    data.duplications.LeafExpressed$DuplicationTime != "singleton",
]
table(data.duplications.LeafExpressed.duplicated$DuplicationTime)
M4.duplicated <- intersect(M4.gene, row.names(data.duplications.LeafExpressed.duplicated))
length(M4.duplicated)
## find how many of them both were retained in the genome;
#min(data.duplications.LeafExpressed.duplicated$BootstrapValue)
M4.duplicated.pair <- data.duplications.LeafExpressed.duplicated[M4.duplicated, "DuplicatedCopies"]
## Same empty-match hazard as above, so a logical mask is used here as well.
M4.duplicated.clean <- M4.duplicated[!grepl("---|,", as.character(M4.duplicated.pair))]

## calculate how many pairs have both members among the clean M4 genes:
data.duplications.LeafExpressed.pair.M4 <- data.duplications.LeafExpressed.pair.SOL[
  row.names(data.duplications.LeafExpressed.pair.SOL) %in% M4.duplicated.clean &
    as.character(data.duplications.LeafExpressed.pair.SOL$DuplicatedCopies) %in% M4.duplicated.clean,
]
nrow(data.duplications.LeafExpressed.pair.M4)

## collect information for statistical analysis:
## pairs with neither member among the clean M4 genes.
#nrow(data.duplications.LeafExpressed.pair)
data.duplications.LeafExpressed.pair.nonM4 <- data.duplications.LeafExpressed.pair[
  !row.names(data.duplications.LeafExpressed.pair) %in% M4.duplicated.clean &
    !as.character(data.duplications.LeafExpressed.pair$DuplicatedCopies) %in% M4.duplicated.clean,
]
head(data.duplications.LeafExpressed.pair.nonM4)
nrow(data.duplications.LeafExpressed.pair.nonM4)
## NOTE(review): `gene.copies` again -- see the note on the column name above
## the redundancy removal; confirm it should not be `DuplicatedCopies`.
data.duplications.LeafExpressed.pair.totalgene <- unique(c(
  row.names(data.duplications.LeafExpressed.pair),
  as.character(data.duplications.LeafExpressed.pair$gene.copies)
))

length(data.duplications.LeafExpressed.pair.totalgene)
length(intersect(M4.duplicated.clean, data.duplications.LeafExpressed.pair.totalgene))
length(M4.duplicated.clean)

nrow(data.duplications.LeafExpressed.pair)
nrow(data.duplications.LeafExpressed.pair)
## perform Chi-square test
## All counts below are literals (428, 8584, 120, 4292) -- presumably copied
## from the lengths printed above; TODO confirm they match the current data.
## P is used as the per-gene probability; the expected pair classes are
## both (P^2), exactly one (2*P*(1-P)) and neither ((1-P)^2).
P <- 428 / 8584
obs <- c(120, 428 - 120, (8584 - 428))
exp <- sum(obs) * c(P^2, choose(2, 1) * P * (1 - P), (1 - P)^2)
chistat <- sum((obs - exp)^2 / exp)
## use Chisq to calculate P
p.value <- 1 - pchisq(chistat, df = 2)
p.value
## use the proportion test to calculate P;
prop.test(x = 120, n = 4292, p = (P^2))
#(P^2)*4292


## Enrichment of transcription factors (TF) and protein kinases (PK).
## data.TF and data.PK are not created in this file -- presumably by the
## sourced annotation script; TODO confirm.
M4.gene.TF <- intersect(row.names(data.TF), M4.gene)
length(M4.gene.TF)
M4.gene.PK <- intersect(row.names(data.PK), M4.gene)
length(M4.gene.PK)

## Duplication classes of the M4 TF genes and of the M4 PK genes.
M4.gene.TF.duplication <- data.duplications.LeafExpressed[M4.gene.TF, ]
table(M4.gene.TF.duplication$DuplicationTime)

M4.gene.PK.duplication <- data.duplications.LeafExpressed[M4.gene.PK, ]
table(M4.gene.PK.duplication$DuplicationTime)

## TF / PK genes among the row names of the M4 pair table.
length(intersect(
  row.names(data.duplications.LeafExpressed.pair.M4),
  M4.gene.TF
))

length(intersect(
  row.names(data.duplications.LeafExpressed.pair.M4),
  M4.gene.PK
))

#nrow(data.duplications.LeafExpressed.pair.M4);


## TF plus PK genes among the row names of the de-duplicated pair table,
## followed by the size of that table.
length(intersect(M4.gene.TF, row.names(data.duplications.LeafExpressed.pair))) +
  length(intersect(M4.gene.PK, row.names(data.duplications.LeafExpressed.pair)))
nrow(data.duplications.LeafExpressed.pair)
## percentage from genome-wide level;
## NOTE(review): 110, 4292, 37 and 120 are literals, presumably copied from
## the counts printed above -- confirm they match the current data.
100*110/4292

binom.test(x = 37, n = 120, p = 110/4292)


############################################
## analyze the effects of DTT-NIC1 insertion for the network;
############################################

## Read the MITE insertion table (tab-separated, with a header line).
MITEs.all <- read.table(
  file = "InputData/MITEsInsertion.gff",
  sep = "\t",
  header = TRUE
)
head(MITEs.all)

## Drop the two MITEs that showed no homolog to other species, one at a time.
MITEs.all.sub <- MITEs.all[MITEs.all$MITEs != "NIATTr2_17_23844", ]
MITEs.all.sub <- MITEs.all.sub[MITEs.all.sub$MITEs != "NIATTr2_20_13501", ]

## Rebuild the factor so the removed ids no longer appear among the levels.
MITEs.all.sub$MITEs <- factor(MITEs.all.sub$MITEs)
levels(MITEs.all.sub$MITEs)

## Gene ids of the M4 module.
M4.gene <- row.names(M4.gene.expression)
## A function to calculate the level of enrichment of insertion-carrying
## genes in a gene module.
##
## Arguments:
##   Module     - vector of gene ids in the module.
##   Genomewide - vector of all gene ids forming the background.
##   Insertion  - vector of gene ids that carry an insertion.
##
## Returns a numeric vector of length 3:
##   [1] number of Insertion genes that are in Module,
##   [2] odds ratio: (in module / not in module among Insertion) divided by
##       (module size / background genes outside the module),
##   [3] p-value of binom.test() with the hits as successes, length(Insertion)
##       as trials and length(Module) / length(Genomewide) as probability.
## When Insertion is empty the result is c(0, NA, NA): binom.test() stops
## on zero trials, which would otherwise abort a loop over many insertion sets.
Enrichement_analysis <- function(Module = NULL, Genomewide = NULL, Insertion = NULL) {
  n.insertion <- length(Insertion)
  InsertionInModule <- length(intersect(Module, Insertion))

  # Nothing to test: report zero hits and undefined statistics.
  if (n.insertion == 0) {
    return(c(InsertionInModule, NA_real_, NA_real_))
  }

  odd <- (InsertionInModule / (n.insertion - InsertionInModule)) /
    (length(Module) / (length(Genomewide) - length(Module)))

  # Expected share of hits if insertions were spread evenly over the background.
  p <- length(Module) / length(Genomewide)
  test.out <- binom.test(InsertionInModule, n.insertion, p)
  c(InsertionInModule, odd, test.out$p.value)
}
## Enrichment of M4 genes among all genes that carry a MITE insertion.
## The background is every gene in the full FPKM table.
Genomewide <- row.names(data.all.count.FPKM.WWFAC)
MITEs.insertions.gene <- unique(
  intersect(as.character(MITEs.all.sub$GeneID), Genomewide)
)
round(
  Enrichement_analysis(M4.gene, Genomewide, MITEs.insertions.gene),
  digits = 10
)


## MITE annotation table; the first column becomes the row names.
MITEs.annotation <- read.table(
  file = "InputData/MITEs.annotation.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)
## Same gene set as MITEs.insertions.gene, kept under the name used later.
Insertion <- MITEs.insertions.gene

## Empty 4-column matrix that collects one result row per MITE.
output <- matrix(ncol = 4, nrow = 0)
#MITEs.all.sub$MITEs<-factor(MITEs.all.sub$MITEs)
#M<-"NIATTr2_7_20865"
## Enrichment of insertion-carrying genes in a gene module.
##
## Arguments:
##   Module     - vector of gene ids in the module.
##   Genomewide - vector of all gene ids forming the background.
##   Insertion  - vector of gene ids that carry an insertion.
##
## Returns c(hits, odds ratio, binomial-test p-value), where hits is the
## number of Insertion genes found in Module and the test probability is
## length(Module) / length(Genomewide).
## An empty Insertion gives c(0, NA, NA) instead of an error, because
## binom.test() refuses zero trials.
Enrichement_analysis <- function(Module = NULL, Genomewide = NULL, Insertion = NULL) {
  n.insertion <- length(Insertion)
  InsertionInModule <- length(intersect(Module, Insertion))

  # No insertion genes at all: the statistics are undefined.
  if (n.insertion == 0) {
    return(c(InsertionInModule, NA_real_, NA_real_))
  }

  # Odds of an insertion gene being in the module, relative to the odds of
  # a background gene being in the module.
  odds.insertion <- InsertionInModule / (n.insertion - InsertionInModule)
  odds.background <- length(Module) / (length(Genomewide) - length(Module))
  odd <- odds.insertion / odds.background

  p <- length(Module) / length(Genomewide)
  test.out <- binom.test(InsertionInModule, n.insertion, p)
  c(InsertionInModule, odd, test.out$p.value)
}

## Run the enrichment test once per MITE id and collect one row per id.
## Module and background do not change between iterations, so they are set
## once; the rows are stored in a pre-sized list and bound in a single step
## instead of growing the matrix with rbind() on every iteration.
Module <- M4.gene
Genomewide <- row.names(data.all.count.FPKM.WWFAC)
mite.ids <- levels(MITEs.all.sub$MITEs)
mite.rows <- vector("list", length(mite.ids))

for (i in seq_along(mite.ids)) {
  M <- mite.ids[i]
  MITE.all.sub.M <- MITEs.all.sub[MITEs.all.sub$MITEs == M, ]

  # Genes with an insertion of this MITE that are in the background.
  Insertion <- unique(intersect(as.character(MITE.all.sub.M$GeneID), Genomewide))
  out <- round(Enrichement_analysis(Module, Genomewide, Insertion), digits = 10)
  # c() turns the numbers into text next to the id, as before.
  mite.rows[[i]] <- c(M, out)
}
output <- rbind(output, do.call(rbind, mite.rows))

output <- data.frame(output)
colnames(output) <- c("MITE_id", "GeneNumber", "ODD", "p-value")

## Look up the MITE name through the annotation row names.
output$MITE_name <- MITEs.annotation[as.character(output[, 1]), "MITE_name"]

## Show duplicated names BEFORE they are used as row names: the assignment
## below fails on duplicates, so the diagnostic has to come first.
duplicated(as.character(output$MITE_name))
row.names(output) <- as.character(output$MITE_name)

## Order the rows by MITE name.
output <- output[sort(as.character(output$MITE_name)), ]
## write the output file;
write.table(output, file = "OutputData/MITEs.enrichment.txt", sep = "\t", col.names = TRUE, row.names = TRUE, quote = FALSE)

## analyze enrichment using conserved dataset;
## all MITEs
## The original line referenced the undefined object `data.` (a truncated
## name), which stops the script. It is completed to data.M4.conserved.gene,
## the conserved-gene id vector built earlier in this file.
## NOTE(review): confirm this is the intended object; the result is not used
## below, where M4.conserved supplies the module.
data.M4.conserved <- intersect(M4.gene, data.M4.conserved.gene)

## Module = conserved M4 genes; background = all genes in the full FPKM table.
Module <- M4.conserved
length(M4.conserved)
GenomeWide <- row.names(data.all.count.FPKM.WWFAC)
length(GenomeWide)
## Background genes carrying any of the retained MITE insertions.
Insertion <- unique(intersect(as.character(MITEs.all.sub$GeneID), GenomeWide))
round(Enrichement_analysis(Module, GenomeWide, Insertion), digits = 10)

## only DTT-NIC1
## Same test, restricted to insertions of the MITE with id "NIATTr2_7_20865".
Module <- M4.conserved
length(M4.conserved)
GenomeWide <- row.names(data.all.count.FPKM.WWFAC)
#length(GenomeWide)
Insertion <- unique(intersect(
  as.character(MITEs.all.sub$GeneID)[MITEs.all.sub$MITEs == "NIATTr2_7_20865"],
  GenomeWide
))
round(Enrichement_analysis(Module, GenomeWide, Insertion), digits = 10)



