Import data and pca-plot
#mamba activate r_env
#install.packages("ggfun")
# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
#library("org.Hs.eg.db")
library(dplyr)
library(tidyverse)
#install.packages("devtools")
#devtools::install_version("gtable", version = "0.3.0")
library(gplots)
library("RColorBrewer")
#install.packages("ggrepel")
library("ggrepel")
# install.packages("openxlsx")
library(openxlsx)
library(EnhancedVolcano)
library(DESeq2)
library(edgeR)
setwd("~/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon")
# Define paths to your Salmon output quantification files
# Store sample names in a character vector
samples <- c(
"AYE-O_Azi20_solid_r1", "AYE-O_Azi20_solid_r2", "AYE-O_Azi20_solid_r3", "AYE-O_ctr_r1", "AYE-O_ctr_r2",
"AYE-O_ctr_r3", "AYE-O_ctr_solid_r1", "AYE-O_ctr_solid_r2", "AYE-O_ctr_solid_r3",
"AYE-O_Diclo375_r1", "AYE-O_Diclo375_r2", "AYE-O_Diclo375_r3", "AYE-O_Mero0.5_r1",
"AYE-O_Mero0.5_r2", "AYE-O_Mero0.5_r3", "AYE-O_Rifampicin2_r1", "AYE-O_Rifampicin2_r2",
"AYE-O_Rifampicin2_r3", "AYE-T_Azi20_solid_r1", "AYE-T_Azi20_solid_r2", "AYE-T_Azi20_solid_r3",
"AYE-T_ctr_r1", "AYE-T_ctr_r2", "AYE-T_ctr_r3", "AYE-T_ctr_solid_r1", "AYE-T_ctr_solid_r2",
"AYE-T_ctr_solid_r3", "AYE-T_Diclo375_r1", "AYE-T_Diclo375_r2", "AYE-T_Diclo375_r3",
"AYE-T_Mero0.15_r1", "AYE-T_Mero0.15_r2", "AYE-T_Mero0.15_r3", "AYE-T_Rifampicin2_r1",
"AYE-T_Rifampicin2_r2", "AYE-T_Rifampicin2_r3", "AYE-WT_Azi20_solid_r1", "AYE-WT_Azi20_solid_r2",
"AYE-WT_Azi20_solid_r3", "AYE-WT_ctr_r1", "AYE-WT_ctr_r2", "AYE-WT_ctr_r3", "AYE-WT_ctr_solid_r1",
"AYE-WT_ctr_solid_r2", "AYE-WT_ctr_solid_r3", "AYE-WT_Diclo1250_solid_r1", "AYE-WT_Diclo1250_solid_r2",
"AYE-WT_Diclo1250_solid_r3", "AYE-WT_Diclo750_r1", "AYE-WT_Diclo750_r2", "AYE-WT_Diclo750_r3",
"AYE-WT_Mero0.35-0.5_r1", "AYE-WT_Mero0.35-0.5_r2", "AYE-WT_Mero0.35-0.5_r3", "AYE-WT_Rifampicin1.5_r1",
"AYE-WT_Rifampicin1.5_r2", "AYE-WT_Rifampicin1.5_r3", "F_Azi20_solid_r1", "F_Azi20_solid_r2",
"F_Azi20_solid_r3", "F_ctr_solid_r1", "F_ctr_solid_r2", "F_ctr_solid_r3", "O-Trans_ctr_r1",
"O-Trans_ctr_r2", "O-Trans_ctr_r3", "O-Trans_Diclo375_r1", "O-Trans_Diclo375_r2", "O-Trans_Diclo375_r3",
"O-Trans_Mero0.25_r1", "O-Trans_Mero0.25_r2", "O-Trans_Mero0.25_r3", "O-Trans_Rifampicin2_r1",
"O-Trans_Rifampicin2_r2", "O-Trans_Rifampicin2_r3", "WT-Trans_ctr_r1", "WT-Trans_ctr_r2",
"WT-Trans_ctr_r3", "WT-Trans_Diclo750_r1", "WT-Trans_Diclo750_r2", "WT-Trans_Diclo750_r3"
)
## Automatically generate the named vector
files <- setNames(paste0("./", samples, "/quant.sf"), samples)
# -----------------------------------------------------------------
# ---- Step 1: Create Detailed Metadata from Your Sample Names ----
# Extract metadata from sample names
samples <- names(files)
# Parse the complex sample names
metadata <- data.frame(
sample = samples,
stringsAsFactors = FALSE
)
# Extract strain (everything before first underscore or hyphen treatment)
metadata$strain <- sapply(strsplit(samples, "[-_]"), function(x) {
if(x[1] %in% c("AYE", "O", "WT", "F")) {
if(x[1] == "AYE" && length(x) > 1 && x[2] %in% c("WT", "T", "O")) {
paste(x[1:2], collapse = "-")
} else if(x[1] %in% c("O", "WT") && x[2] == "Trans") {
paste(x[1:2], collapse = "-")
} else {
x[1]
}
} else {
x[1]
}
})
# Extract treatment type
metadata$treatment <- sapply(samples, function(x) {
if(grepl("_ctr", x)) return("ctrl")
if(grepl("Diclo", x)) return("Diclo")
if(grepl("Mero", x)) return("Mero")
if(grepl("Azi", x)) return("Azi")
if(grepl("Rifampicin", x)) return("Rifampicin")
return("ctrl")
})
# Extract concentration
metadata$concentration <- sapply(samples, function(x) {
if(grepl("Diclo1250", x)) return("1250")
if(grepl("Diclo750", x)) return("750")
if(grepl("Diclo375", x)) return("375")
if(grepl("Mero0.5", x)) return("0.5")
if(grepl("Mero0.35", x)) return("0.35")
if(grepl("Mero0.25", x)) return("0.25")
if(grepl("Mero0.15", x)) return("0.15")
if(grepl("Azi20", x)) return("20")
if(grepl("Rifampicin2", x)) return("2")
if(grepl("Rifampicin1.5", x)) return("1.5")
return("0")
})
# Extract condition (solid vs liquid)
metadata$condition <- ifelse(grepl("_solid", samples), "solid", "liquid")
# Extract replicate
metadata$replicate <- sapply(strsplit(samples, "_"), function(x) {
rep_part <- x[length(x)]
gsub("r", "", rep_part)
})
# Create combined group for easy comparisons
metadata$group <- paste(metadata$strain, metadata$treatment, metadata$concentration, sep = "_")
# Set row names
rownames(metadata) <- metadata$sample
# Reorder to match txi columns
metadata <- metadata[colnames(txi$counts), ]
# ---------------------------------------------
# ---- Step 2: Choose Your Design Strategy ----
# Strategy A: Full Factorial Design (if balanced)
dds <- DESeqDataSetFromTximport(txi, metadata,
design = ~ strain + treatment + condition)
# --> Strategy B: Combined Group Factor ⭐ RECOMMENDED
metadata$group <- factor(paste(metadata$strain,
metadata$treatment,
metadata$concentration,
metadata$condition,
sep = "_"))
dds <- DESeqDataSetFromTximport(txi, metadata,
design = ~ group)
dds <- DESeq(dds)
# See all available comparisons
resultsNames(dds)
# -------------------------------------------------------------
# ---- Step 3: Set Up Specific Comparisons from Your Notes ----
# ==========================================
# 1. Define Exact Comparisons from Your Notes
# ==========================================
planned_comparisons <- list(
# --- Baseline / Strain Controls ---
AYE_T_ctr_vs_AYE_WT_ctr = list(treat = "AYE-T_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_O_ctr_vs_AYE_WT_ctr = list(treat = "AYE-O_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
O_Trans_ctr_vs_AYE_WT_ctr = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
WT_Trans_ctr_vs_AYE_WT_ctr = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_O_ctr_vs_AYE_T = list(treat = "AYE-O_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
O_Trans_ctr_vs_AYE_T = list(treat = "O-Trans_ctrl_0_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
WT_Trans_ctr_vs_AYE_T = list(treat = "WT-Trans_ctrl_0_liquid",ctrl = "AYE-T_ctrl_0_liquid"),
# --- Condition Effects (Solid vs Liquid) ---
AYE_WT_ctr_solid_vs_AYE_WT_ctr = list(treat = "AYE-WT_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_O_ctr_solid_vs_AYE_O_ctr = list(treat = "AYE-O_ctrl_0_solid", ctrl = "AYE-O_ctrl_0_liquid"),
AYE_T_ctr_solid_vs_AYE_T_ctr = list(treat = "AYE-T_ctrl_0_solid", ctrl = "AYE-T_ctrl_0_liquid"),
AYE_O_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-O_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_solid"),
AYE_T_ctr_solid_vs_AYE_WT_ctr_solid= list(treat = "AYE-T_ctrl_0_solid", ctrl = "AYE-WT_ctrl_0_solid"),
# --- Diclofenac ---
AYE_WT_Diclo750_vs_AYE_WT_ctr = list(treat = "AYE-WT_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_T_Diclo375_vs_AYE_WT_ctr = list(treat = "AYE-T_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_O_Diclo375_vs_AYE_WT_ctr = list(treat = "AYE-O_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
O_Trans_Diclo375_vs_AYE_WT_ctr = list(treat = "O-Trans_Diclo_375_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
WT_Trans_Diclo750_vs_AYE_WT_ctr = list(treat = "WT-Trans_Diclo_750_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
Diclo_AYE_WT_1250_solid_vs_solid_ctr = list(treat = "AYE-WT_Diclo_1250_solid", ctrl = "AYE-WT_ctrl_0_solid"),
# --- Meropenem ---
AYE_WT_Mero_vs_AYE_WT_ctr = list(treat = "AYE-WT_Mero_0.35_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_T_Mero_vs_AYE_WT_ctr = list(treat = "AYE-T_Mero_0.15_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_O_Mero_vs_AYE_WT_ctr = list(treat = "AYE-O_Mero_0.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
O_Trans_Mero_vs_AYE_WT_ctr = list(treat = "O-Trans_Mero_0.25_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_T_Mero_vs_AYE_T_ctr = list(treat = "AYE-T_Mero_0.15_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
# --- Azithromycin (Solid) ---
AYE_WT_Azi_vs_solid_ctr = list(treat = "AYE-WT_Azi_20_solid", ctrl = "AYE-WT_ctrl_0_solid"),
AYE_T_Azi_vs_solid_ctr = list(treat = "AYE-T_Azi_20_solid", ctrl = "AYE-T_ctrl_0_solid"),
AYE_O_Azi_vs_solid_ctr = list(treat = "AYE-O_Azi_20_solid", ctrl = "AYE-O_ctrl_0_solid"),
F_Azi_vs_F_solid_ctr = list(treat = "F_Azi_20_solid", ctrl = "F_ctrl_0_solid"),
# --- Rifampicin ---
AYE_WT_Rif_vs_AYE_WT_ctr = list(treat = "AYE-WT_Rifampicin_1.5_liquid", ctrl = "AYE-WT_ctrl_0_liquid"),
AYE_T_Rif_vs_AYE_T_ctr = list(treat = "AYE-T_Rifampicin_2_liquid", ctrl = "AYE-T_ctrl_0_liquid"),
AYE_O_Rif_vs_AYE_O_ctr = list(treat = "AYE-O_Rifampicin_2_liquid", ctrl = "AYE-O_ctrl_0_liquid"),
O_Trans_Rif_vs_O_Trans_ctr = list(treat = "O-Trans_Rifampicin_2_liquid", ctrl = "O-Trans_ctrl_0_liquid")
)
# ==========================================
# 2. Verification & Validation Script
# ==========================================
# Identify which column in colData holds your group names
group_col <- if("group" %in% colnames(colData(dds))) "group" else
if("treatment" %in% colnames(colData(dds))) "treatment" else
stop("❌ Please specify the correct colData column containing group names.")
actual_groups <- unique(colData(dds)[[group_col]])
cat("\n", paste(rep("=", 85), collapse=""), "\n")
cat("📋 VERIFICATION OF NOTE-DERIVED COMPARISONS\n")
cat(paste(rep("=", 85), collapse=""), "\n\n")
validation_results <- data.frame(
Comparison_Name = character(),
Treatment_String = character(),
Control_String = character(),
Status = character(),
Suggested_Contrast = character(),
stringsAsFactors = FALSE
)
for(name in names(planned_comparisons)) {
trt <- planned_comparisons[[name]]$treat
ctl <- planned_comparisons[[name]]$ctrl
# Find closest matches in actual data
trt_match <- actual_groups[grepl(trt, actual_groups, fixed = TRUE)]
ctl_match <- actual_groups[grepl(ctl, actual_groups, fixed = TRUE)]
status <- if(length(trt_match) > 0 && length(ctl_match) > 0) "✅ VALID" else "⚠️ CHECK"
contrast_str <- if(status == "✅ VALID")
paste0('c("', group_col, '", "', trt_match[1], '", "', ctl_match[1], '")') else "N/A"
validation_results <- rbind(validation_results, data.frame(
Comparison_Name = name,
Treatment_String = trt,
Control_String = ctl,
Status = status,
Suggested_Contrast = contrast_str,
stringsAsFactors = FALSE
))
cat(sprintf("%-45s | T:%-25s C:%-20s | %s\n", name, trt, ctl, status))
if(status == "⚠️ CHECK") {
if(length(trt_match) == 0) cat(" 🔍 Treat not found. Closest: ", paste(head(actual_groups[grepl(strsplit(trt, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
if(length(ctl_match) == 0) cat(" 🔍 Ctrl not found. Closest: ", paste(head(actual_groups[grepl(strsplit(ctl, "_")[[1]][1], actual_groups)], 3), collapse=", "), "\n")
}
}
# ==========================================
# 3. Auto-Generate DESeq2 results() Calls (Optional)
# ==========================================
valid_comparisons <- validation_results[validation_results$Status == "✅ VALID", ]
if(nrow(valid_comparisons) > 0) {
cat("\n📜 READY-TO-RUN DESeq2 CONTRASTS:\n")
cat(paste(rep("-", 60), collapse=""), "\n")
for(i in seq_len(nrow(valid_comparisons))) {
cat(sprintf('res_%s <- results(dds, contrast = %s)\n',
gsub("[^A-Za-z0-9]", "_", valid_comparisons$Comparison_Name[i]),
valid_comparisons$Suggested_Contrast[i]))
}
} else {
cat("\n⚠️ No exact matches found. Check your colData group naming convention.\n")
}
# -----------------------------
# ---- Step 4: PCA figures ----
# 🔍 What each figure shows:
#
# 01_PCA_by_Strain.png → Tests if genetic background (AYE-WT, AYE-T, AYE-O, Trans, F) is the dominant source of variation.
# 02_PCA_by_Treatment.png → Shows clustering by antibiotic/drug exposure (ctrl, Diclo, Mero, Azi, Rifampicin).
# 03_PCA_by_Condition.png → Reveals batch/growth media effects (solid vs liquid).
# 04_PCA_CombinedGroups.png → Full experimental grouping with labeled sample names for quick outlier detection.
# 05_PCA_Ellipses.png → Adds 95% confidence boundaries per strain to visualize group spread and overlap.
#
# ⚠️ Quick Checklist Before Running:
#
# Ensure metadata columns (strain, treatment, condition, group) are attached to colData(dds).
# If ggrepel is missing, run install.packages("ggrepel").
# All PNGs will save to your current working directory (getwd()).
# Install if missing: install.packages(c("ggplot2", "ggrepel"))
library(DESeq2)
library(ggplot2)
library(ggrepel)
# 1. Variance Stabilizing Transformation & Extract PCA Data
vsd <- vst(dds, blind = FALSE)
pca_data <- plotPCA(vsd, intgroup = c("strain", "treatment", "condition", "group"), returnData = TRUE)
percent_var <- round(100 * attr(pca_data, "percentVar"))
# Consistent theme for all plots
base_theme <- theme_bw(base_size = 12) +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 13),
legend.position = "right",
legend.title = element_text(face = "bold"),
panel.grid.major = element_line(color = "grey90"),
panel.grid.minor = element_blank())
# --- Plot 1: Colored by Strain ---
p1 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, shape = condition)) +
geom_point(size = 3, alpha = 0.8) +
geom_text_repel(aes(label = name), size = 2.5, max.overlaps = 20, show.legend = FALSE) +
labs(x = paste0("PC1: ", percent_var[1], "% variance"),
y = paste0("PC2: ", percent_var[2], "% variance"),
title = "PCA: Samples Colored by Strain",
color = "Strain", shape = "Condition") +
base_theme
ggsave("01_PCA_by_Strain.png", p1, width = 8, height = 6, dpi = 300)
# --- Plot 2: Colored by Treatment ---
p2 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = treatment, shape = condition)) +
geom_point(size = 3, alpha = 0.8) +
labs(x = paste0("PC1: ", percent_var[1], "% variance"),
y = paste0("PC2: ", percent_var[2], "% variance"),
title = "PCA: Samples Colored by Treatment",
color = "Treatment", shape = "Condition") +
base_theme
ggsave("02_PCA_by_Treatment.png", p2, width = 8, height = 6, dpi = 300)
# --- Plot 3: Colored by Condition (Solid vs Liquid) ---
p3 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = condition, shape = strain)) +
geom_point(size = 3, alpha = 0.8) +
labs(x = paste0("PC1: ", percent_var[1], "% variance"),
y = paste0("PC2: ", percent_var[2], "% variance"),
title = "PCA: Samples Colored by Growth Condition",
color = "Condition", shape = "Strain") +
base_theme
ggsave("03_PCA_by_Condition.png", p3, width = 8, height = 6, dpi = 300)
# --- Plot 4: Combined Groups with Sample Labels ---
p4 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = group)) +
geom_point(size = 3, alpha = 0.8) +
geom_text_repel(aes(label = name), size = 2, max.overlaps = 30, box.padding = 0.3) +
labs(x = paste0("PC1: ", percent_var[1], "% variance"),
y = paste0("PC2: ", percent_var[2], "% variance"),
title = "PCA: Combined Experimental Groups",
color = "Group") +
base_theme + theme(legend.position = "none")
ggsave("04_PCA_CombinedGroups.png", p4, width = 9, height = 7, dpi = 300)
# --- Plot 5: 95% Confidence Ellipses (by Strain) ---
p5 <- ggplot(pca_data, aes(x = PC1, y = PC2, color = strain, fill = strain)) +
geom_point(size = 3, alpha = 0.7) +
stat_ellipse(level = 0.95, alpha = 0.2, geom = "polygon", show.legend = FALSE) +
labs(x = paste0("PC1: ", percent_var[1], "% variance"),
y = paste0("PC2: ", percent_var[2], "% variance"),
title = "PCA: 95% Confidence Ellipses by Strain",
color = "Strain", fill = "Strain") +
base_theme
ggsave("05_PCA_Ellipses.png", p5, width = 8, height = 6, dpi = 300)
message("✅ All 5 PCA plots saved to working directory!")