# load R libraries
#library("rhdf5")

library("sleuth")
library('cowplot')
library("RSQLite")
library("dbplyr")
library("dplyr")
library("tidyr")
library("tximportData")
library("knitr")
library("rmarkdown")
new_position_theme <- theme(legend.position = c(0.80, 0.90))


#==================================================================================================
# Import GENCODE V24 annotation
#==================================================================================================

# transcript quantifications are generated with kallisto using GENCODE V24 annotation
# gene and transcript ids can be obtained from the kallisto's quantification file for any of the samples

# read in file
sample_id=c("rw_026_01_01_rnaseq","rw_030_01_01_rnaseq","rw_031_01_01_rnaseq","rw_032_01_01_rnaseq",
            "rw_039_01_01_rnaseq","rw_039_02_01_rnaseq","rw_040_01_01_rnaseq","rw_040_02_01_rnaseq", "rw_041_01_01_rnaseq",
            "rw_042_01_01_rnaseq", "rw_041_01_02_rnaseq","rw_042_01_02_rnaseq", "rw_043_01_01_rnaseq", "rw_044_01_01_rnaseq",
            "rw_043_01_02_rnaseq", "rw_044_01_02_rnaseq")

itsv <- file.path("/mnt/cluster_projects/data/rnaseq/samples", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv")
#itsv <- file.path("/users/mbeato/projects/data/rnaseq/samples", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv")

target_mapping <- read.delim(itsv)

# extract information from target_id
target_mapping$target_id_raw <- target_mapping$target_id
target_id_names <- c('transcript_id', 'gene_id', 'ott1', 'ott2', 'transcript_name', 'gene_name', 'transcript_length', 'gene_type', 'empty')
target_mapping <- target_mapping %>% separate(target_id_raw, target_id_names, "\\|")

# select specific columns
target_mapping <- target_mapping[, c("target_id", "gene_id", 'gene_name')]

kal_dirs <- file.path("/mnt/cluster_projects/data/rnaseq/samples", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/")
#kal_dirs <- file.path("/users/mbeato/projects/data/rnaseq/samples", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/")
kal_dirs

#original table
s2c <- read.table(file.path("/data/projects/roni_rnaseq_de_feb2019", "metadata", "hiseq_info.txt"), header = TRUE, stringsAsFactors=FALSE)

s2c <- dplyr::select(s2c, sample = sample, condition, sequencing, space, full_cond)
s2c <- dplyr::mutate(s2c, path = kal_dirs)
print(s2c) #Full table

so <- sleuth_prep(s2c, ~ space+condition, target_mapping=target_mapping, extra_bootstrap_summary = TRUE) 

#General PCA with all samples
png("/data/projects/roni_rnaseq_de_feb2019/figures/pca_tpm_all_samples.png", res = 150, w = 1600, h = 750)
par(mfrow = c(1, 2))
p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'est_counts', color_by = 'full_cond', text_labels = TRUE) +
  new_position_theme
p2 <- plot_pc_variance(so, units = 'est_counts')
plot_grid(p1, p2, align = "h")
dev.off()

###################################
###################################


#Redo the analysis removing the two outlier samples "rw_043_01_02_rnaseq", "rw_044_01_02_rnaseq"
sample_id=c("rw_026_01_01_rnaseq","rw_030_01_01_rnaseq","rw_031_01_01_rnaseq","rw_032_01_01_rnaseq",
            "rw_039_01_01_rnaseq","rw_039_02_01_rnaseq","rw_040_01_01_rnaseq","rw_040_02_01_rnaseq", "rw_041_01_01_rnaseq",
            "rw_042_01_01_rnaseq", "rw_041_01_02_rnaseq","rw_042_01_02_rnaseq", "rw_043_01_01_rnaseq", "rw_044_01_01_rnaseq")

itsv <- file.path("/mnt/cluster_projects/data/rnaseq/samples", sample_id[1], "quantifications/kallisto/hg38_mmtv/paired_end/abundance.tsv")
target_mapping <- read.delim(itsv)

# extract information from target_id
target_mapping$target_id_raw <- target_mapping$target_id
target_id_names <- c('transcript_id', 'gene_id', 'ott1', 'ott2', 'transcript_name', 'gene_name', 'transcript_length', 'gene_type', 'empty')
target_mapping <- target_mapping %>% separate(target_id_raw, target_id_names, "\\|")

# select specific columns
target_mapping <- target_mapping[, c("target_id", "gene_id", 'gene_name')]

kal_dirs <- file.path("/mnt/cluster_projects/data/rnaseq/samples", sample_id, "quantifications/kallisto/hg38_mmtv/paired_end/")
kal_dirs

#removing problematic samples
s2c <- read.table(file.path("/data/projects/roni_rnaseq_de_feb2019", "metadata", "hiseq_info2.txt"), header = TRUE, stringsAsFactors=FALSE)

s2c <- dplyr::select(s2c, sample = sample, condition, sequencing, space, full_cond)
s2c <- dplyr::mutate(s2c, path = kal_dirs)
print(s2c) #Full table

so <- sleuth_prep(s2c, target_mapping=target_mapping, aggregation_column = 'gene_id', extra_bootstrap_summary = TRUE, num_cores=4, gene_mode=TRUE) 

#PCA with all filtered samples
png("/data/projects/roni_rnaseq_de_feb2019/figures/gene_mode/pca_tpm_filt_samples_sequencing.png", res = 200, w = 1200, h = 900)
#par(mfrow = c(1, 2))
p1 <- plot_pca(so, pc_x = 1L, pc_y = 2L, units = 'est_counts', color_by = 'sequencing', text_labels = FALSE)
#p2 <- plot_pc_variance(so, units = 'est_counts')
#plot_grid(p1, p2, align = "h")
p1
dev.off()

#First reduced model
so <- sleuth_fit(so, ~sequencing + condition, 'reduced_space')
so <- sleuth_fit(so, ~sequencing + condition + space, 'full_space')

so <- sleuth_fit(so, ~sequencing + space, 'reduced_condition')
so <- sleuth_fit(so, ~sequencing + space + condition, 'full_condition')
#test interactions
so <- sleuth_fit(so, ~sequencing + space + condition, 'reduced_condition_int')
so <- sleuth_fit(so, ~sequencing + space * condition, 'full_condition_int')

#The likelihood ratio test (lrt) is performed with
so <- sleuth_lrt(so, 'reduced_space', 'full_space')
so <- sleuth_lrt(so, 'reduced_condition', 'full_condition')
so <- sleuth_lrt(so, 'reduced_condition_int', 'full_condition_int')


#Generate tables with significant genes for space
sleuth_table_gene_space <- sleuth_results(so, 'reduced_space:full_space', 'lrt', show_all = FALSE, pval_aggregate=FALSE)
sleuth_table_gene_space <- dplyr::filter(sleuth_table_gene_space, qval <= 0.05)

#Generate tables with significant genes for condition
sleuth_table_gene_condition <- sleuth_results(so, 'reduced_condition:full_condition', 'lrt', show_all = FALSE, pval_aggregate=FALSE)
sleuth_table_gene_condition <- dplyr::filter(sleuth_table_gene_condition, qval <= 0.05)

#Generate tables with significant genes for condition with interaction with space
sleuth_table_gene_condition_int <- sleuth_results(so, 'reduced_condition_int:full_condition_int', 'lrt', show_all = FALSE, pval_aggregate=FALSE)
sleuth_table_gene_condition_int <- dplyr::filter(sleuth_table_gene_condition_int, qval <= 0.05)

#Save the sleuth object
sleuth_save(so, "/mnt/cluster_projects/projects/jvillanueva/analysis/2019-02-roni_rnaseq_de_analysis/sleuth_so.obj")


png("/data/projects/roni_rnaseq_de_feb2019/figures/group_density.png", res = 150, w = 1500, h = 500)
plot_group_density(so, use_filtered = TRUE, units = "est_counts",
                   trans = "log", grouping = setdiff(colnames(so$sample_to_covariates),
                                                     "sample"), offset = 1)
dev.off()


#First comparison 2D vs 3D

#Number of DE genes
nrow(sleuth_table_gene_space)

s2c_AvsB<-s2c

#Add information on up/down regulation. Calculating means from samples.
temphigh<-s2c_AvsB$sample[s2c_AvsB$space=="2D"]
templow<-s2c_AvsB$sample[s2c_AvsB$space=="3D"]
hiraw<-subset(so$obs_raw,so$obs_raw$sample %in% temphigh)
lowraw<-subset(so$obs_raw,so$obs_raw$sample %in% templow)
highmeans<-aggregate(tpm~target_id,data=hiraw,FUN=function(x) c(mean=mean(x)))
colnames(highmeans)<-c("target_id","tpm_2D")
lowmeans<-aggregate(tpm~target_id,data=lowraw,FUN=function(x) c(mean=mean(x)))
colnames(lowmeans)<-c("target_id","tpm_3D")
merged_means<-merge(lowmeans,highmeans,by=c("target_id"))
merged_means2<-merge(merged_means,target_mapping,by=c("target_id"))
sleuth_table_gene_space$gene_id <-sleuth_table_gene_space$target_id
sleuth_table_wTPM<-left_join(sleuth_table_gene_space, merged_means2, by=c("gene_id"))


twoD<-aggregate(tpm_2D ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)
threeD<-aggregate(tpm_3D ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)

sleuth_table_wTPM$tpm_2D<-NULL
sleuth_table_wTPM$tpm_3D<-NULL
sleuth_table_wTPM$target_id.y<-NULL
sleuth_table_wTPM<-sleuth_table_wTPM[!duplicated(sleuth_table_wTPM), ] #Remove duplicates
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, twoD, by=c("gene_id"))
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, threeD, by=c("gene_id"))
sleuth_table_wTPM$tpm_2D_by_3D=log2((sleuth_table_wTPM$tpm_2D+0.01)/(sleuth_table_wTPM$tpm_3D+0.01))

sleuth_significant <- dplyr::filter(sleuth_table_wTPM, qval <= 0.05)
sleuth_significant$target_id.x<-NULL
sleuth_table_wTPM$threshold <- as.factor(ifelse(sleuth_table_wTPM$tpm_2D_by_3D >= 1.5, 'Up',ifelse(sleuth_table_wTPM$tpm_2D_by_3D <= -1.5, 'Down','-')))
write.csv(sleuth_significant, file = "/data/projects/roni_rnaseq_de_feb2019/tables/significant2D_vs_3D.csv", quote=FALSE,row.names=FALSE)
write.csv(sleuth_table_wTPM, file = "/data/projects/roni_rnaseq_de_feb2019/tables/sleuth_table_wTPM_2D_vs_3D.csv", quote=FALSE,row.names=FALSE)


#Volcano plot
png("/data/projects/roni_rnaseq_de_feb2019/figures/volcano_2D_3D.png", res = 150, w = 1500, h = 500)
ggplot(data=sleuth_table_wTPM, 
       aes(x=(tpm_2D_by_3D), y =-log10(qval), 
           colour=threshold,fill=threshold)) +
  scale_color_manual(values=c("grey", "red","blue"))+
  geom_point(alpha=0.4, size=1.2) +
  xlim(c(-4, 4)) +
  theme_bw(base_size = 12, base_family = "Times") +
  geom_vline(xintercept=c(-1.5,1.5),lty=4,col="grey",lwd=0.6)+
  geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.6)+
  theme(legend.position="right",
        panel.grid=element_blank(),
        legend.title = element_blank(),
        legend.text= element_text(face="bold", color="black",family = "Times", size=8),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(face="bold", color="black", size=12),
        axis.text.y = element_text(face="bold",  color="black", size=12),
        axis.title.x = element_text(face="bold", color="black", size=12),
        axis.title.y = element_text(face="bold",color="black", size=12))+
  labs(x="log2 (fold change)",y="-log10 (q-value)",title="Volcano plot of DEG 2D vs 3D")
dev.off()


#Second comparison wt vs kd
#Number of DE genes
nrow(sleuth_table_gene_condition)

s2c_AvsB<-s2c

#Add information on up/down regulation. Calculating means from samples.
temphigh<-s2c_AvsB$sample[s2c_AvsB$condition=="wt"]
templow<-s2c_AvsB$sample[s2c_AvsB$condition=="kd"]
hiraw<-subset(so$obs_raw,so$obs_raw$sample %in% temphigh)
lowraw<-subset(so$obs_raw,so$obs_raw$sample %in% templow)
highmeans<-aggregate(tpm~target_id,data=hiraw,FUN=function(x) c(mean=mean(x)))
colnames(highmeans)<-c("target_id","tpm_wt")
lowmeans<-aggregate(tpm~target_id,data=lowraw,FUN=function(x) c(mean=mean(x)))
colnames(lowmeans)<-c("target_id","tpm_kd")
merged_means<-merge(lowmeans,highmeans,by=c("target_id"))
merged_means2<-merge(merged_means,target_mapping,by=c("target_id"))
sleuth_table_gene_space$gene_id <-sleuth_table_gene_space$target_id
sleuth_table_wTPM<-left_join(sleuth_table_gene_space, merged_means2, by=c("gene_id"))


wt<-aggregate(tpm_wt ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)
kd<-aggregate(tpm_kd ~gene_id, data=sleuth_table_wTPM, sum, na.rm=TRUE)

sleuth_table_wTPM$tpm_wt<-NULL
sleuth_table_wTPM$tpm_kd<-NULL
sleuth_table_wTPM$target_id.y<-NULL
sleuth_table_wTPM<-sleuth_table_wTPM[!duplicated(sleuth_table_wTPM), ] #Remove duplicates
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, wt, by=c("gene_id"))
sleuth_table_wTPM<-left_join(sleuth_table_wTPM, kd, by=c("gene_id"))
sleuth_table_wTPM$tpm_wt_by_kd=log2((sleuth_table_wTPM$tpm_wt+0.01)/(sleuth_table_wTPM$tpm_kd+0.01))

sleuth_significant <- dplyr::filter(sleuth_table_wTPM, qval <= 0.05)
sleuth_significant$target_id.x<-NULL
sleuth_table_wTPM$threshold <- as.factor(ifelse(sleuth_table_wTPM$tpm_wt_by_kd >= 1.5, 'Up',ifelse(sleuth_table_wTPM$tpm_wt_by_kd <= -1.5, 'Down','-')))
write.csv(sleuth_significant, file = "/data/projects/roni_rnaseq_de_feb2019/tables/significantwt_vs_kd.csv", quote=FALSE,row.names=FALSE)
write.csv(sleuth_table_wTPM, file = "/data/projects/roni_rnaseq_de_feb2019/tables/sleuth_table_wTPM_wt_vs_kd.csv", quote=FALSE,row.names=FALSE)


#Volcano plot
png("/data/projects/roni_rnaseq_de_feb2019/figures/volcano_wt_kd.png", res = 150, w = 1500, h = 500)
ggplot(data=sleuth_table_wTPM, 
       aes(x=(tpm_wt_by_kd), y =-log10(qval), 
           colour=threshold,fill=threshold)) +
  scale_color_manual(values=c("grey", "red","blue"))+
  geom_point(alpha=0.4, size=1.2) +
  xlim(c(-4, 4)) +
  theme_bw(base_size = 12, base_family = "Times") +
  geom_vline(xintercept=c(-1.5,1.5),lty=4,col="grey",lwd=0.6)+
  geom_hline(yintercept = -log10(0.05),lty=4,col="grey",lwd=0.6)+
  theme(legend.position="right",
        panel.grid=element_blank(),
        legend.title = element_blank(),
        legend.text= element_text(face="bold", color="black",family = "Times", size=8),
        plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(face="bold", color="black", size=12),
        axis.text.y = element_text(face="bold",  color="black", size=12),
        axis.title.x = element_text(face="bold", color="black", size=12),
        axis.title.y = element_text(face="bold",color="black", size=12))+
  labs(x="log2 (fold change)",y="-log10 (q-value)",title="Volcano plot of DEG WT vs KD")
dev.off()

theplot <- function(id) rbind(filter(hiraw, grepl(id, target_id)), filter(lowraw, grepl(id, target_id))) %>% inner_join(s2c) %>% ggplot(aes(x = target_id, y = tpm, col = space, group = space, shape = sequencing)) + geom_point(position = position_dodge(width = .5), size = 3) + scale_y_continuous(trans = "log10") + facet_wrap(~condition, ncol = 1)
theplot(id)