# This script plots the correlation between the oocyte and embryonic PGC transcriptomes, and between the oocyte and embryonic soma transcriptomes.
# Two input files (transcriptomes) are used:
# 1. Oocyte transcriptome from Stoeckius et al., 2014, Table S1: "Stoeckius2014OTE.csv"
# 2. cuffdiff output of the differential expression analysis between embryonic soma and embryonic PGCs: "gene_exp_SomaVsPGC_addUpperAndIndex.diff"

# Load the two input tables from the current working directory.
# The workspace is deliberately NOT cleared here: `rm(list = ls())` destroys
# the caller's objects when this script is source()'d, yet leaves loaded
# packages and options untouched, so it provides no real isolation. Run the
# script in a fresh R session instead.
# Text columns are read as character (stringsAsFactors = FALSE) so that gene
# IDs are compared and sorted the same way on every R version.
oocyte <- read.table("Stoeckius2014OTE.csv", header = TRUE, sep = ",",
                     stringsAsFactors = FALSE)
# Keep only the first row for each GENEID value.
oocyte_notD <- oocyte[!duplicated(oocyte$GENEID), ]
emb <- read.table("gene_exp_SomaVsPGC_addUpperAndIndex.diff",
                  header = TRUE, sep = "\t", stringsAsFactors = FALSE)
# Keep only the first row for each mapToUpper value.
emb_noD <- emb[!duplicated(emb$mapToUpper), ]

### Only keep rows with FPKM > 0 so that the log10 transformation is finite.
# A bare `x != 0` test returns NA for missing values, and indexing a data
# frame with NA inserts all-NA rows; log10() of those stays NA and the later
# min()/max() calls would then return NA. Missing values are therefore dropped
# explicitly, and `> 0` is used so the filter matches its stated intent.
oocyte_keep <- !is.na(oocyte_notD$oocyte_rpkm) & oocyte_notD$oocyte_rpkm > 0
oocyte_notD_not0 <- oocyte_notD[oocyte_keep, ]

# value_1 is the FPKM for embryonic soma.
soma_keep <- !is.na(emb_noD$value_1) & emb_noD$value_1 > 0
emb_noD_not0 <- emb_noD[soma_keep, ]

# value_2 is the FPKM for embryonic PGC.
pgc_keep <- !is.na(emb_noD_not0$value_2) & emb_noD_not0$value_2 > 0
emb_noD_not00 <- emb_noD_not0[pgc_keep, ]


### To draw the XY scatter plots, gene IDs are cross-referenced between the
### two tables and both tables are put into the same gene-ID order.
# Gene IDs present in both tables.
oocyte_emb_notD_not0_int <- intersect(oocyte_notD_not0$GENEID,
                                      emb_noD_not00$mapToUpper)

# `%in%` is the vectorised membership test. The previous
# sapply(..., is.element, ...) returned an empty list (not a logical vector)
# for empty input and performed one match() call per gene.
oocyte_emb_notD_not0_ind_oocyte <-
  oocyte_notD_not0$GENEID %in% oocyte_emb_notD_not0_int
oocyte_not00_cleaned <- oocyte_notD_not0[oocyte_emb_notD_not0_ind_oocyte, ]

oocyte_emb_notD_not0_ind_emb <-
  emb_noD_not00$mapToUpper %in% oocyte_emb_notD_not0_int
emb_not00_cleaned <- emb_noD_not00[oocyte_emb_notD_not0_ind_emb, ]

# Sort the embryonic table by gene ID ...
emb_not00_cleaned_geneIDOrdered <-
  emb_not00_cleaned[order(emb_not00_cleaned$mapToUpper), ]
# ... and put the oocyte rows in exactly that order by looking each ID up,
# rather than trusting two independent sorts to agree. Row i of both tables
# then refers to the same gene, which the positional cbind() of the oocyte
# values onto the embryonic table relies on.
oocyte_not00_cleaned_geneIDOrdered <- oocyte_not00_cleaned[
  match(emb_not00_cleaned_geneIDOrdered$mapToUpper,
        oocyte_not00_cleaned$GENEID), ]

###### Integrate the oocyte expression values into the embryonic table and add
###### log10-transformed FPKM columns for oocyte, embryonic PGC and soma.
# The oocyte values are attached by row position, so both tables must list the
# same genes in the same order. Fail loudly instead of silently pairing
# expression values from different genes.
stopifnot(identical(
  as.character(oocyte_not00_cleaned_geneIDOrdered$GENEID),
  as.character(emb_not00_cleaned_geneIDOrdered$mapToUpper)
))

a <- oocyte_not00_cleaned_geneIDOrdered$oocyte_rpkm
emb_not00_cleaned_w_oocyteExp <- cbind(
  emb_not00_cleaned_geneIDOrdered,
  log10_value_oocyte = log10(a)
)
# value_2 is the embryonic PGC FPKM.
emb_not00_cleaned_w_oocyteExp_PGC <- cbind(
  emb_not00_cleaned_w_oocyteExp,
  log10_value_PGC = log10(emb_not00_cleaned_w_oocyteExp$value_2)
)
# value_1 is the embryonic soma FPKM.
emb_not00_cleaned_w_oocyteExp_PGC_soma <- cbind(
  emb_not00_cleaned_w_oocyteExp_PGC,
  log10_value_soma = log10(emb_not00_cleaned_w_oocyteExp_PGC$value_1)
)
# Final table: original cuffdiff columns plus the three log10 columns.
emb_log10.FPKM_OSP <- emb_not00_cleaned_w_oocyteExp_PGC_soma

### Identify the minimum and maximum of each log10(FPKM) column for internal
### min-max normalization, i.e. (value - minimum) / (maximum - minimum).

# Extremes of each log10 column.
log10.oocyte_min <- min(emb_log10.FPKM_OSP[["log10_value_oocyte"]])
log10.oocyte_max <- max(emb_log10.FPKM_OSP[["log10_value_oocyte"]])
log10.PGC_min <- min(emb_log10.FPKM_OSP[["log10_value_PGC"]])
log10.PGC_max <- max(emb_log10.FPKM_OSP[["log10_value_PGC"]])
log10.soma_min <- min(emb_log10.FPKM_OSP[["log10_value_soma"]])
log10.soma_max <- max(emb_log10.FPKM_OSP[["log10_value_soma"]])

# Denominators (maximum - minimum) of the normalization.
log10.oocyte_denom <- log10.oocyte_max - log10.oocyte_min
log10.PGC_denom <- log10.PGC_max - log10.PGC_min
log10.soma_denom <- log10.soma_max - log10.soma_min

# Rows flagged significant == "yes", and among those the rows with a positive
# log2 fold change.
emb_log10.FPKM_OSP_sig <- emb_log10.FPKM_OSP[
  emb_log10.FPKM_OSP$significant == "yes",
]
emb_log10.FPKM_OSP_PGC_enriched <- emb_log10.FPKM_OSP_sig[
  emb_log10.FPKM_OSP_sig$log2.fold_change. > 0,
]



# Embryonic PGC vs oocyte.
# The min-max scaled vectors are computed once and shared by the plot and the
# test, so the two can never drift apart.
scaled_PGC <-
  (emb_log10.FPKM_OSP$log10_value_PGC - log10.PGC_min) / log10.PGC_denom
scaled_oocyte <-
  (emb_log10.FPKM_OSP$log10_value_oocyte - log10.oocyte_min) /
  log10.oocyte_denom

# Scatter plot. The x-axis is limited to [0.4, 1], so genes with a scaled PGC
# value below 0.4 are not drawn, although they are included in the test below.
plot(scaled_PGC, scaled_oocyte,
     xlab = "scaled log10(FPKM) in PGC", xlim = c(0.4, 1), ylim = c(0, 1),
     ylab = "scaled log10(FPKM) in oocyte",
     main = "Transcriptome comparison btw\nemb PGC and oocyte",
     cex = 0.5, pch = 20)

# Correlation test (cor.test() default method). Printed explicitly because
# top-level results are not auto-printed when the script is run via source().
print(cor.test(scaled_PGC, scaled_oocyte))


# Embryonic soma vs oocyte.
# The min-max scaled vectors are computed once and shared by the plot and the
# test, so the two can never drift apart.
scaled_soma <-
  (emb_log10.FPKM_OSP$log10_value_soma - log10.soma_min) / log10.soma_denom
scaled_oocyte <-
  (emb_log10.FPKM_OSP$log10_value_oocyte - log10.oocyte_min) /
  log10.oocyte_denom

# Scatter plot. The x-axis is limited to [0.4, 1], so genes with a scaled soma
# value below 0.4 are not drawn, although they are included in the test below.
plot(scaled_soma, scaled_oocyte,
     xlab = "scaled log10(FPKM) in soma", xlim = c(0.4, 1), ylim = c(0, 1),
     ylab = "scaled log10(FPKM) in oocyte",
     main = "transcriptome comparison btw \noocyte and emb soma",
     cex = 0.5, pch = 20)

# Correlation test (cor.test() default method). Printed explicitly because
# top-level results are not auto-printed when the script is run via source().
print(cor.test(scaled_soma, scaled_oocyte))
