Downstream analyis using R for miRNAs (4 MKL-1 + 17 WaGa samples)
#Input file
#exceRpt_miRNA_ReadCounts.txt
#exceRpt_piRNA_ReadCounts.txt
## MKL-1 wild-type controls
#MKL-1_wt_cells (nf780, nf796, nf797)
#MKL-1_wt_EV (nf655)
#
## WaGa experimental groups (scr = scramble control; sT = target knockdown)
#WaGa_scr_DMSO_EV (nf933, nf938, nf973)
#WaGa_scr_Dox_EV (nf934, nf939, nf974)
#WaGa_sT_DMSO_EV (nf931, nf936, nf971)
#WaGa_sT_Dox_EV (nf932, nf937, nf972)
#
## WaGa wild-type controls
#WaGa_wt_cells (nf774, nf961, nf962)
#WaGa_wt_EV (nf657 (deleted), nf930, nf935)
cd ~/DATA/Data_Ute_exceRpt_workspace/summaries
mamba activate r_env
R
#> .libPaths()
#[1] "/home/jhuang/mambaforge/envs/r_env/lib/R/library"
#BiocManager::install("AnnotationDbi")
#BiocManager::install("clusterProfiler")
#BiocManager::install(c("ReactomePA","org.Hs.eg.db"))
#BiocManager::install("limma")
#BiocManager::install("sva")
#install.packages("writexl")
#install.packages("openxlsx")
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
library(limma)
library(sva)
#library(writexl) #d.raw_with_rownames <- cbind(RowNames = rownames(d.raw), d.raw); write_xlsx(d.raw, path = "d_raw.xlsx");
library(openxlsx)
setwd("../summaries/")
d.raw<- read.delim2("exceRpt_miRNA_ReadCounts.txt",sep="\t", header=TRUE, row.names=1)
# Desired column order
desired_order <- c(
"nf780", "nf796", "nf797",
"nf655",
"nf933", "nf938", "nf973",
"nf934", "nf939", "nf974",
"nf931", "nf936", "nf971",
"nf932", "nf937", "nf972",
"nf774", "nf961", "nf962",
"nf930", "nf935" #delete "nf657",
)
# Reorder columns
d.raw <- d.raw[, desired_order]
setdiff(desired_order, colnames(d.raw)) # Shows missing or misnamed columns
#sapply(d.raw, is.numeric)
d.raw[] <- lapply(d.raw, as.numeric)
#d.raw[] <- lapply(d.raw, function(x) as.numeric(as.character(x)))
d.raw <- round(d.raw)
write.csv(d.raw, file ="d_raw.csv")
write.xlsx(d.raw, file = "d_raw.xlsx", rowNames = TRUE)
# ------ Code sent to Ute ------
#d.raw <- read.delim2("d_raw.csv",sep=",", header=TRUE, row.names=1)
Cell_or_EV = as.factor(c("Cell","Cell","Cell", "EV", "EV","EV","EV", "EV","EV","EV", "EV","EV","EV", "EV","EV","EV", "Cell","Cell","Cell", "EV","EV")) #delete "EV",
replicates = as.factor(c("MKL-1_wt_cells","MKL-1_wt_cells","MKL-1_wt_cells", "MKL-1_wt_EV", "WaGa_scr_DMSO_EV","WaGa_scr_DMSO_EV","WaGa_scr_DMSO_EV", "WaGa_scr_Dox_EV","WaGa_scr_Dox_EV","WaGa_scr_Dox_EV", "WaGa_sT_DMSO_EV","WaGa_sT_DMSO_EV","WaGa_sT_DMSO_EV", "WaGa_sT_Dox_EV","WaGa_sT_Dox_EV","WaGa_sT_Dox_EV", "WaGa_wt_cells", "WaGa_wt_cells","WaGa_wt_cells", "WaGa_wt_EV", "WaGa_wt_EV")) #delete "WaGa_wt_EV",
ids = as.factor(c("nf780", "nf796", "nf797",
"nf655",
"nf933", "nf938", "nf973",
"nf934", "nf939", "nf974",
"nf931", "nf936", "nf971",
"nf932", "nf937", "nf972",
"nf774", "nf961", "nf962",
"nf930", "nf935")) #delete "nf657",
cData = data.frame(row.names=colnames(d.raw), replicates=replicates, ids=ids, Cell_or_EV=Cell_or_EV)
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates)
# Filter low-count miRNAs
dds <- dds[ rowSums(counts(dds)) > 10, ] #1322-->903
rld <- rlogTransformation(dds)
# -- before pca --
png("pca.png", 1200, 800)
plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
#plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, "batch")
dev.off()
png("pca2.png", 1200, 800)
#plotPCA(rld, intgroup=c("replicates"))
#plotPCA(rld, intgroup = c("replicates", "batch"))
plotPCA(rld, intgroup = c("replicates", "ids"))
#plotPCA(rld, "batch")
dev.off()
# Batch Effect Removal Methods (Non-batch effect removal applied!)
#### STEP2: DEGs ####
#- Heatmap untreated/wt vs parental; 1x for WaGa cell line
#- Volcano plot untreated/wt vs parental; 1x for WaGa cell line
#- Manhattan plot miRNAs; 1x for WaGa cell line
#- Distribution of different small RNA species untreated/wt and parental; 1x for WaGa cell line
#- Motif analysis: identify RNA-binding proteins that may regulate small RNA loading; 1x for WaGa cell line
#convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
normalized_counts <- counts(dds, normalized=TRUE)
write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
write.xlsx(normalized_counts, file = "normalized_counts.xlsx", rowNames = TRUE)
#---- untreated, scr_control, DMSO_control, scr_DMSO_control, sT_knockdown to parental_cells ----
dds<-DESeqDataSetFromMatrix(countData=d.raw, colData=cData, design=~replicates)
dds$replicates <- relevel(dds$replicates, "untreated")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("DMSO_control_vs_untreated", "scr_control_vs_untreated", "scr_DMSO_control_vs_untreated", "sT_knockdown_vs_untreated")
dds$replicates <- relevel(dds$replicates, "DMSO_control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_knockdown_vs_DMSO_control")
dds$replicates <- relevel(dds$replicates, "scr_control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_knockdown_vs_scr_control")
dds$replicates <- relevel(dds$replicates, "scr_DMSO_control")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("sT_knockdown_vs_scr_DMSO_control")
dds$replicates <- relevel(dds$replicates, "WaGa_wt_cells")
dds = DESeq(dds, betaPrior=FALSE) #default betaPrior is FALSE
resultsNames(dds)
clist <- c("WaGa_wt_EV_vs_WaGa_wt_cells")
dds$replicates <- relevel(dds$replicates, "MKL-1_wt_cells")
dds = DESeq(dds, betaPrior=FALSE) #default betaPrior is FALSE
resultsNames(dds)
clist <- c("MKL.1_wt_EV_vs_MKL.1_wt_cells")
#NOTE that the results sent to Ute is |padj|<=0.1.
for (i in clist) {
contrast = paste("replicates", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
#https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-are-some-p-values-set-to-na
res$padj <- ifelse(is.na(res$padj), 1, res$padj)
res_df <- as.data.frame(res)
write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-"))
up <- subset(res_df, padj<=0.05 & log2FoldChange>=2)
down <- subset(res_df, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
~/Tools/csv2xls-0.4/csv_to_xls.py \
untreated_vs_parental_cells-all.txt \
untreated_vs_parental_cells-up.txt \
untreated_vs_parental_cells-down.txt \
-d$',' -o untreated_vs_parental_cells.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
DMSO_control_vs_untreated-all.txt \
DMSO_control_vs_untreated-up.txt \
DMSO_control_vs_untreated-down.txt \
-d$',' -o DMSO_control_vs_untreated.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
scr_control_vs_untreated-all.txt \
scr_control_vs_untreated-up.txt \
scr_control_vs_untreated-down.txt \
-d$',' -o scr_control_vs_untreated.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
scr_DMSO_control_vs_untreated-all.txt \
scr_DMSO_control_vs_untreated-up.txt \
scr_DMSO_control_vs_untreated-down.txt \
-d$',' -o scr_DMSO_control_vs_untreated.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_knockdown_vs_untreated-all.txt \
sT_knockdown_vs_untreated-up.txt \
sT_knockdown_vs_untreated-down.txt \
-d$',' -o sT_knockdown_vs_untreated.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_knockdown_vs_DMSO_control-all.txt \
sT_knockdown_vs_DMSO_control-up.txt \
sT_knockdown_vs_DMSO_control-down.txt \
-d$',' -o sT_knockdown_vs_DMSO_control.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_knockdown_vs_scr_control-all.txt \
sT_knockdown_vs_scr_control-up.txt \
sT_knockdown_vs_scr_control-down.txt \
-d$',' -o sT_knockdown_vs_scr_control.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
sT_knockdown_vs_scr_DMSO_control-all.txt \
sT_knockdown_vs_scr_DMSO_control-up.txt \
sT_knockdown_vs_scr_DMSO_control-down.txt \
-d$',' -o sT_knockdown_vs_scr_DMSO_control.xls;
# ------------------- volcano_plot -------------------
library(ggplot2)
library(ggrepel)
geness_res <- read.csv(file = paste("untreated_vs_parental_cells", "all.txt", sep="-"), row.names=1)
external_gene_name <- rownames(geness_res)
geness_res <- cbind(geness_res, external_gene_name)
#top_g are from ids
top_g <- c("hsa-let-7b-5p","hsa-let-7g-5p","hsa-let-7i-5p","hsa-miR-103a-3p","hsa-miR-107","hsa-miR-1224-5p","hsa-miR-122-5p","hsa-miR-1226-5p","hsa-miR-1246","hsa-miR-127-3p","hsa-miR-1290","hsa-miR-130a-3p","hsa-miR-139-3p","hsa-miR-141-3p","hsa-miR-143-3p","hsa-miR-148b-3p","hsa-miR-155-5p","hsa-miR-15a-5p","hsa-miR-17-5p","hsa-miR-184","hsa-miR-18a-3p","hsa-miR-18a-5p","hsa-miR-190a-5p","hsa-miR-191-5p","hsa-miR-193b-5p","hsa-miR-197-5p","hsa-miR-200a-3p","hsa-miR-200b-5p","hsa-miR-206","hsa-miR-20a-5p","hsa-miR-210-3p","hsa-miR-2110","hsa-miR-21-5p","hsa-miR-218-5p","hsa-miR-219a-1-3p","hsa-miR-221-3p","hsa-miR-23b-3p","hsa-miR-27a-3p","hsa-miR-27b-3p","hsa-miR-27b-5p","hsa-miR-28-3p","hsa-miR-30a-5p","hsa-miR-30c-5p","hsa-miR-30e-5p","hsa-miR-3127-5p","hsa-miR-3131","hsa-miR-3180|hsa-miR-3180-3p","hsa-miR-320a","hsa-miR-320b","hsa-miR-320c","hsa-miR-320d","hsa-miR-330-3p","hsa-miR-335-3p","hsa-miR-33b-5p","hsa-miR-340-5p","hsa-miR-342-5p","hsa-miR-3605-5p","hsa-miR-361-3p","hsa-miR-365a-5p","hsa-miR-374b-5p","hsa-miR-378i","hsa-miR-379-5p","hsa-miR-3940-5p","hsa-miR-409-3p","hsa-miR-411-5p","hsa-miR-423-3p","hsa-miR-423-5p","hsa-miR-4286","hsa-miR-429","hsa-miR-432-5p","hsa-miR-4326","hsa-miR-451a","hsa-miR-4520-3p","hsa-miR-454-3p","hsa-miR-4646-5p","hsa-miR-4667-5p","hsa-miR-4748","hsa-miR-483-5p","hsa-miR-486-5p","hsa-miR-5010-5p","hsa-miR-504-3p","hsa-miR-5187-5p","hsa-miR-590-3p","hsa-miR-6128","hsa-miR-625-5p","hsa-miR-6726-5p","hsa-miR-6730-5p","hsa-miR-676-3p","hsa-miR-6767-5p","hsa-miR-6777-5p","hsa-miR-6780a-5p","hsa-miR-6794-5p","hsa-miR-6817-3p","hsa-miR-708-5p","hsa-miR-7-5p","hsa-miR-766-5p","hsa-miR-7854-3p","hsa-miR-873-3p","hsa-miR-885-3p","hsa-miR-92b-5p","hsa-miR-93-5p","hsa-miR-937-3p","hsa-miR-9-5p","hsa-miR-98-5p")
subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0))
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
write.csv(geness_res, "untreated_vs_parental_cells_with_Category.csv")
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
geness_res <- geness_res[, -1*ncol(geness_res)]
png("volcano_plot_untreated_vs_parental_cells.png",width=1200, height=1400)
#svg("untreated_vs_parental_cells.svg",width=12, height=14)
ggplot(geness_res, aes(x = log2FoldChange, y = -log10(pvalue), color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") + geom_hline(yintercept = -log10(0.05), lty = "dashed") + geom_point() + labs(x = "log2(FC)", y = "Significance, -log10(P)", color = "Significance") + scale_color_manual(values = c("P < 0.05"="orange","P-adj < 0.05"="red","NS or log2FC < 2.0"="darkgray"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 16) + theme(legend.position = "bottom")
dev.off()
# ------------------ differentially_expressed_miRNAs_heatmap -----------------
# prepare all_genes
rld <- rlogTransformation(dds)
mat <- assay(rld)
mm <- model.matrix(~replicates, colData(rld))
mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm)
assay(rld) <- mat
RNASeq.NoCellLine <- assay(rld)
# reorder the columns
#colnames(RNASeq.NoCellLine) = c("0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox","0505 WaGa wt","1905 WaGa wt","control MKL1","control WaGa")
#col.order <-c("control MKL1", "control WaGa","0505 WaGa wt","1905 WaGa wt","0505 WaGa sT DMSO","1905 WaGa sT DMSO","0505 WaGa sT Dox","1905 WaGa sT Dox","0505 WaGa scr DMSO","1905 WaGa scr DMSO","0505 WaGa scr Dox","1905 WaGa scr Dox")
#RNASeq.NoCellLine <- RNASeq.NoCellLine[,col.order]
#Option4: manully defining
#for i in untreated_vs_parental_cells sT_knockdown_vs_untreated DMSO_control_vs_untreated scr_control_vs_untreated scr_DMSO_control_vs_untreated sT_knockdown_vs_DMSO_control sT_knockdown_vs_scr_control sT_knockdown_vs_scr_DMSO_control; do
# echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id";
# echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id";
#done
#cat *.id | sort -u > ids
##add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
datamat = RNASeq.NoCellLine[GOI, ]
# clustering the genes and draw heatmap
#datamat <- datamat[,-1] #delete the sample "control MKL1"
#datamat <- datamat[, 1:5]
#parental_cells_1 parental_cells_2 parental_cells_3 untreated_1 untreated_2 scr_control_1 scr_control_2 scr_control_3 DMSO_control_1 DMSO_control_2 DMSO_control_3 scr_DMSO_control_1 scr_DMSO_control_2 scr_DMSO_control_3 sT_knockdown_1 sT_knockdown_2 sT_knockdown_3 -->
#parental cells 1 parental cells 2 parental cells 3 untreated 1 untreated 2 scr control 1 scr control 2 scr control 3 DMSO control 1 DMSO control 2 DMSO control 3 scr DMSO control 1 scr DMSO control 2 scr DMSO control 3 sT knockdown 1 sT knockdown 2 sT knockdown 3
colnames(datamat)[1] <- "parental cells 1"
colnames(datamat)[2] <- "parental cells 2"
colnames(datamat)[3] <- "parental cells 3"
colnames(datamat)[4] <- "untreated 1"
colnames(datamat)[5] <- "untreated 2"
colnames(datamat)[6] <- "scr control 1"
colnames(datamat)[7] <- "scr control 2"
colnames(datamat)[8] <- "scr control 3"
colnames(datamat)[9] <- "DMSO control 1"
colnames(datamat)[10] <- "DMSO control 2"
colnames(datamat)[11] <- "DMSO control 3"
colnames(datamat)[12] <- "scr DMSO control 1"
colnames(datamat)[13] <- "scr DMSO control 2"
colnames(datamat)[14] <- "scr DMSO control 3"
colnames(datamat)[15] <- "sT knockdown 1"
colnames(datamat)[16] <- "sT knockdown 2"
colnames(datamat)[17] <- "sT knockdown 3"
write.csv(datamat, file ="gene_expression_keeping_replicates.txt")
write.xlsx(datamat, file = "gene_expression_keeping_replicates.xlsx", rowNames = TRUE)
#"ward.D"’, ‘"ward.D2"’,‘"single"’, ‘"complete"’, ‘"average"’ (= UPGMA), ‘"mcquitty"’(= WPGMA), ‘"median"’ (= WPGMC) or ‘"centroid"’ (= UPGMC)
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.1)
mycol = c("YELLOW", "BLUE", "ORANGE", "CYAN", "GREEN", "MAGENTA", "GREY", "LIGHTCYAN", "RED", "PINK", "DARKORANGE", "MAROON", "LIGHTGREEN", "DARKBLUE", "DARKRED", "LIGHTBLUE", "DARKCYAN", "DARKGREEN", "DARKMAGENTA");
mycol = mycol[as.vector(mycl)]
rownames(datamat) <- sub("\\|.*", "", rownames(datamat))
png("DEGs_heatmap_keeping_replicates.png", width=1000, height=1400)
#svg("DEGs_heatmap_keeping_replicates.svg", width=6, height=8)
heatmap.2(as.matrix(datamat),
Rowv=as.dendrogram(hr),
Colv=NA,
dendrogram='row',
labRow=row.names(datamat),
scale='row',
trace='none',
col=bluered(75),
RowSideColors=mycol,
srtCol=30,
lhei=c(1,8),
cexRow=1.4, # Increase row label font size
cexCol=1.7, # Increase column label font size
margin=c(8, 12)
)
dev.off()
# ----------------------------------------
# ----------- manhattan_plot -------------
Rscript manhattan_plot_Carmen_custom_labels.R #exceRpt_miRNA_ReadCounts.txt