# ============================================================================== # COMPLETE PIPELINE: Quant Import -> DESeq2 -> Annotation -> 31 Comparisons # Includes: tx2gene fix, Metadata Parsing, Excel Export, Volcano Plots # ============================================================================== # 1. LOAD LIBRARIES ------------------------------------------------------------ suppressPackageStartupMessages({ library(tximport) library(DESeq2) library(dplyr) library(stringr) library(readr) library(openxlsx) library(ggplot2) library(ggrepel) }) # 2. SET WORKING DIRECTORY & DEFINE SAMPLES ------------------------------------ setwd("~/DATA/Data_Tam_RNAseq_2026_on_AYE/results/star_salmon") # Define all samples samples <- c( "AYE-O_Azi20_solid_r1", "AYE-O_Azi20_solid_r2", "AYE-O_Azi20_solid_r3", "AYE-O_ctr_r1", "AYE-O_ctr_r2", "AYE-O_ctr_r3", "AYE-O_ctr_solid_r1", "AYE-O_ctr_solid_r2", "AYE-O_ctr_solid_r3", "AYE-O_Diclo375_r1", "AYE-O_Diclo375_r2", "AYE-O_Diclo375_r3", "AYE-O_Mero0.5_r1", "AYE-O_Mero0.5_r2", "AYE-O_Mero0.5_r3", "AYE-O_Rifampicin2_r1", "AYE-O_Rifampicin2_r2", "AYE-O_Rifampicin2_r3", "AYE-T_Azi20_solid_r1", "AYE-T_Azi20_solid_r2", "AYE-T_Azi20_solid_r3", "AYE-T_ctr_r1", "AYE-T_ctr_r2", "AYE-T_ctr_r3", "AYE-T_ctr_solid_r1", "AYE-T_ctr_solid_r2", "AYE-T_ctr_solid_r3", "AYE-T_Diclo375_r1", "AYE-T_Diclo375_r2", "AYE-T_Diclo375_r3", "AYE-T_Mero0.15_r1", "AYE-T_Mero0.15_r2", "AYE-T_Mero0.15_r3", "AYE-T_Rifampicin2_r1", "AYE-T_Rifampicin2_r2", "AYE-T_Rifampicin2_r3", "AYE-WT_Azi20_solid_r1", "AYE-WT_Azi20_solid_r2", "AYE-WT_Azi20_solid_r3", "AYE-WT_ctr_r1", "AYE-WT_ctr_r2", "AYE-WT_ctr_r3", "AYE-WT_ctr_solid_r1", "AYE-WT_ctr_solid_r2", "AYE-WT_ctr_solid_r3", "AYE-WT_Diclo1250_solid_r1", "AYE-WT_Diclo1250_solid_r2", "AYE-WT_Diclo1250_solid_r3", "AYE-WT_Diclo750_r1", "AYE-WT_Diclo750_r2", "AYE-WT_Diclo750_r3", "AYE-WT_Mero0.35-0.5_r1", "AYE-WT_Mero0.35-0.5_r2", "AYE-WT_Mero0.35-0.5_r3", "AYE-WT_Rifampicin1.5_r1", "AYE-WT_Rifampicin1.5_r2", "AYE-WT_Rifampicin1.5_r3", "F_Azi20_solid_r1", "F_Azi20_solid_r2", "F_Azi20_solid_r3", "F_ctr_solid_r1", "F_ctr_solid_r2", "F_ctr_solid_r3", "O-Trans_ctr_r1", "O-Trans_ctr_r2", "O-Trans_ctr_r3", "O-Trans_Diclo375_r1", "O-Trans_Diclo375_r2", "O-Trans_Diclo375_r3", "O-Trans_Mero0.25_r1", "O-Trans_Mero0.25_r2", "O-Trans_Mero0.25_r3", "O-Trans_Rifampicin2_r1", "O-Trans_Rifampicin2_r2", "O-Trans_Rifampicin2_r3", "WT-Trans_ctr_r1", "WT-Trans_ctr_r2", "WT-Trans_ctr_r3", "WT-Trans_Diclo750_r1", "WT-Trans_Diclo750_r2", "WT-Trans_Diclo750_r3" ) ## Automatically generate the named vector for files files <- setNames(paste0("./", samples, "/quant.sf"), samples) # 3. PARSE ANNOTATION FILE TO CREATE tx2gene (FIX FOR summarizeFail ERROR) ----- # This maps Transcript IDs (in quant.sf) to Gene IDs (for DESeq2) ANNOT_FILE <- "/mnt/md1/DATA/Data_Tam_RNAseq_2026_on_AYE/results/genome/CU459141_m.txt" cat("📖 Parsing annotation file to build tx2gene mapping...\n") gff <- read_tsv(ANNOT_FILE, col_names = FALSE, show_col_types = FALSE) tx2gene <- gff %>% select(X9) %>% mutate( TXNAME = str_extract(X9, '(?<=transcript_id ")[^"]+'), GENEID = str_extract(X9, '(?<=gene_id ")[^"]+') ) %>% select(TXNAME, GENEID) #%>% #drop_na() # Remove rows that didn't match the regex cat(sprintf("✅ Created tx2gene mapping with %d entries.\n", nrow(tx2gene))) # 4. IMPORT QUANTIFICATIONS ---------------------------------------------------- # Use tx2gene to summarize to gene level (prevents the error) cat("📥 Importing Salmon quantifications...\n") txi <- tximport(files, type = "salmon", tx2gene = tx2gene) # ----------------------------------------------------------------- # ---- Step 1: Create Detailed Metadata from Your Sample Names ---- # ----------------------------------------------------------------- # Extract metadata from sample names samples <- names(files) # Ensure we use the names from files metadata <- data.frame( sample = samples, stringsAsFactors = FALSE ) # Extract strain (everything before first underscore or hyphen treatment) metadata$strain <- sapply(strsplit(samples, "[-_]"), function(x) { if(x[1] %in% c("AYE", "O", "WT", "F")) { if(x[1] == "AYE" && length(x) > 1 && x[2] %in% c("WT", "T", "O")) { paste(x[1:2], collapse = "-") } else if(x[1] %in% c("O", "WT") && x[2] == "Trans") { paste(x[1:2], collapse = "-") } else { x[1] } } else { x[1] } }) # Extract treatment type metadata$treatment <- sapply(samples, function(x) { if(grepl("_ctr", x)) return("ctrl") if(grepl("Diclo", x)) return("Diclo") if(grepl("Mero", x)) return("Mero") if(grepl("Azi", x)) return("Azi") if(grepl("Rifampicin", x)) return("Rifampicin") return("ctrl") }) # Extract concentration metadata$concentration <- sapply(samples, function(x) { if(grepl("Diclo1250", x)) return("1250") if(grepl("Diclo750", x)) return("750") if(grepl("Diclo375", x)) return("375") if(grepl("Mero0.5", x)) return("0.5") if(grepl("Mero0.35-0.5", x)) return("0.35-0.5") # Added explicit check for range if(grepl("Mero0.35", x)) return("0.35") if(grepl("Mero0.25", x)) return("0.25") if(grepl("Mero0.15", x)) return("0.15") if(grepl("Azi20", x)) return("20") if(grepl("Rifampicin2", x)) return("2") if(grepl("Rifampicin1.5", x)) return("1.5") return("0") }) # Extract condition (solid vs liquid) metadata$condition <- ifelse(grepl("_solid", samples), "solid", "liquid") # Extract replicate metadata$replicate <- sapply(strsplit(samples, "_"), function(x) { rep_part <- x[length(x)] gsub("r", "", rep_part) }) # Create combined group for easy comparisons # This creates strings like "AYE-O_Diclo_375_liquid" metadata$group <- factor(paste(metadata$strain, metadata$treatment, metadata$concentration, metadata$condition, sep = "_")) # Set row names rownames(metadata) <- metadata$sample # Reorder to match txi columns metadata <- metadata[colnames(txi$counts), ] # --------------------------------------------- # ---- Step 2: Build DESeq2 Object ---- # --------------------------------------------- # Strategy B: Combined Group Factor ⭐ RECOMMENDED dds <- DESeqDataSetFromTximport(txi, colData = metadata, design = ~ group) # Pre-filter low-count genes (Improves speed and stability) keep <- rowSums(counts(dds) >= 10) >= 3 dds <- dds[keep, ] cat("🚀 Running DESeq2 pipeline...\n") dds <- DESeq(dds) # See all available comparisons to verify group names cat("Available groups in dds:\n") print(levels(colData(dds)$group)) # ============================================================================== # 5. DEFINE ALL 31 COMPARISONS (Matches EXACT group names generated above) # ============================================================================== # Note: Group names follow the pattern Strain_Treatment_Concentration_Condition planned_comparisons <- list( # Baseline / Strain Controls "01_AYE_T_ctr_vs_AYE_WT_ctr" = c("group", "AYE-T_ctrl_0_liquid", "AYE-WT_ctrl_0_liquid"), "02_AYE_O_ctr_vs_AYE_WT_ctr" = c("group", "AYE-O_ctrl_0_liquid", "AYE-WT_ctrl_0_liquid"), "03_O_Trans_ctr_vs_AYE_WT_ctr" = c("group", "O-Trans_ctrl_0_liquid", "AYE-WT_ctrl_0_liquid"), "04_WT_Trans_ctr_vs_AYE_WT_ctr" = c("group", "WT-Trans_ctrl_0_liquid","AYE-WT_ctrl_0_liquid"), "05_AYE_O_ctr_vs_AYE_T_ctr" = c("group", "AYE-O_ctrl_0_liquid", "AYE-T_ctrl_0_liquid"), "06_O_Trans_ctr_vs_AYE_T_ctr" = c("group", "O-Trans_ctrl_0_liquid", "AYE-T_ctrl_0_liquid"), "07_WT_Trans_ctr_vs_AYE_T_ctr" = c("group", "WT-Trans_ctrl_0_liquid","AYE-T_ctrl_0_liquid"), # Condition Effects "08_AYE_WT_ctr_solid_vs_liquid" = c("group", "AYE-WT_ctrl_0_solid", "AYE-WT_ctrl_0_liquid"), "09_AYE_O_ctr_solid_vs_liquid" = c("group", "AYE-O_ctrl_0_solid", "AYE-O_ctrl_0_liquid"), "10_AYE_T_ctr_solid_vs_liquid" = c("group", "AYE-T_ctrl_0_solid", "AYE-T_ctrl_0_liquid"), "11_AYE_O_ctr_solid_vs_AYE_WT_solid"=c("group", "AYE-O_ctrl_0_solid", "AYE-WT_ctrl_0_solid"), "12_AYE_T_ctr_solid_vs_AYE_WT_solid"=c("group", "AYE-T_ctrl_0_solid", "AYE-WT_ctrl_0_solid"), # Diclofenac "13_AYE_WT_Diclo750_vs_Ctrl" = c("group", "AYE-WT_Diclo_750_liquid", "AYE-WT_ctrl_0_liquid"), "14_AYE_T_Diclo375_vs_Ctrl" = c("group", "AYE-T_Diclo_375_liquid", "AYE-WT_ctrl_0_liquid"), "15_AYE_O_Diclo375_vs_Ctrl" = c("group", "AYE-O_Diclo_375_liquid", "AYE-WT_ctrl_0_liquid"), "16_O_Trans_Diclo375_vs_Ctrl" = c("group", "O-Trans_Diclo_375_liquid", "AYE-WT_ctrl_0_liquid"), "17_WT_Trans_Diclo750_vs_Ctrl" = c("group", "WT-Trans_Diclo_750_liquid", "AYE-WT_ctrl_0_liquid"), "18_AYE_WT_Diclo1250_solid_vs_Ctrl_solid" = c("group", "AYE-WT_Diclo_1250_solid", "AYE-WT_ctrl_0_solid"), # Meropenem "19_AYE_WT_Mero_vs_Ctrl" = c("group", "AYE-WT_Mero_0.35-0.5_liquid", "AYE-WT_ctrl_0_liquid"), "20_AYE_T_Mero_vs_Ctrl" = c("group", "AYE-T_Mero_0.15_liquid", "AYE-WT_ctrl_0_liquid"), "21_AYE_O_Mero_vs_Ctrl" = c("group", "AYE-O_Mero_0.5_liquid", "AYE-WT_ctrl_0_liquid"), "22_O_Trans_Mero_vs_Ctrl" = c("group", "O-Trans_Mero_0.25_liquid", "AYE-WT_ctrl_0_liquid"), "23_AYE_T_Mero_vs_AYE_T_Ctrl" = c("group", "AYE-T_Mero_0.15_liquid", "AYE-T_ctrl_0_liquid"), # Azithromycin (Solid) "24_AYE_WT_Azi_solid_vs_Ctrl_solid"= c("group", "AYE-WT_Azi_20_solid", "AYE-WT_ctrl_0_solid"), "25_AYE_T_Azi_solid_vs_Ctrl_solid" = c("group", "AYE-T_Azi_20_solid", "AYE-T_ctrl_0_solid"), "26_AYE_O_Azi_solid_vs_Ctrl_solid" = c("group", "AYE-O_Azi_20_solid", "AYE-O_ctrl_0_solid"), "27_F_Azi_solid_vs_Ctrl_solid" = c("group", "F_Azi_20_solid", "F_ctrl_0_solid"), # Rifampicin "28_AYE_WT_Rif_vs_Ctrl" = c("group", "AYE-WT_Rifampicin_1.5_liquid", "AYE-WT_ctrl_0_liquid"), "29_AYE_T_Rif_vs_Ctrl" = c("group", "AYE-T_Rifampicin_2_liquid", "AYE-T_ctrl_0_liquid"), "30_AYE_O_Rif_vs_Ctrl" = c("group", "AYE-O_Rifampicin_2_liquid", "AYE-O_ctrl_0_liquid"), "31_O_Trans_Rif_vs_Ctrl" = c("group", "O-Trans_Rifampicin_2_liquid", "O-Trans_ctrl_0_liquid") ) # 6. PROCESSING LOOP (Excel + Volcano) ----------------------------------------- OUTPUT_DIR <- "DEG_Results_Complete" dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE) PADJ_CUTOFF <- 0.05 LFC_CUTOFF <- 2.0 VOLCANO_N <- 30 # Re-use annotation file to get gene names for Excel output cat("📖 Parsing annotation file for gene names...\n") anno_df <- gff %>% mutate( gene_id = str_extract(X9, '(?<=gene_id ")[^"]+'), gene_name = str_extract(X9, '(?<=gene_name ")[^"]+'), locus_tag = str_extract(X9, '(?<=locus_tag ")[^"]+') ) %>% filter(!is.na(gene_id)) %>% distinct(gene_id, .keep_all = TRUE) %>% mutate( gene_name = ifelse(is.na(gene_name) | gene_name == "", locus_tag, gene_name), gene_id_clean = gsub("^gene-", "", gene_id) ) %>% select(gene_id, gene_id_clean, gene_name) summary_list <- list() cat("\n🚀 PROCESSING", length(planned_comparisons), "COMPARISONS\n") cat(sprintf("Thresholds: padj <= %s, |log2FC| >= %s\n", PADJ_CUTOFF, LFC_CUTOFF)) cat(paste(rep("=", 80), collapse = ""), "\n") for (comp_name in names(planned_comparisons)) { contrast_vec <- planned_comparisons[[comp_name]] idx <- which(names(planned_comparisons) == comp_name) cat(sprintf("\n[%02d/31] %s\n", idx, comp_name)) # Validate groups if (!all(contrast_vec[2:3] %in% levels(colData(dds)$group))) { warning(sprintf(" ⚠️ Skipping: '%s' or '%s' not in colData$group", contrast_vec[2], contrast_vec[3])) next } # Extract DESeq2 results res <- results(dds, contrast = contrast_vec, alpha = PADJ_CUTOFF, independentFiltering = TRUE) res <- res[!is.na(res$log2FoldChange), ] res_df <- as.data.frame(res) res_df$GeneID <- rownames(res_df) res_df$GeneID_clean <- gsub("^gene-", "", res_df$GeneID) ## Annotate with true gene names #res_annot <- res_df %>% # left_join(anno_df, by = c("GeneID_clean" = "gene_id_clean")) %>% # mutate( # GeneName = ifelse(is.na(GeneName) | GeneName == "", GeneID_clean, GeneName), # padj_plot = ifelse(is.na(padj) | padj == 0, 1e-305, padj) # ) %>% # select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot) %>% # distinct(GeneID, .keep_all = TRUE) %>% # arrange(padj, desc(abs(log2FoldChange))) # --- DEBUG & FIX BLOCK --- cat("\n🔍 DEBUG: Columns in res_df :", paste(colnames(res_df), collapse = ", "), "\n") cat("🔍 DEBUG: Columns in anno_df :", paste(colnames(anno_df), collapse = ", "), "\n") res_annot <- res_df %>% left_join(anno_df, by = c("GeneID_clean" = "gene_id_clean")) %>% { cat("🔍 DEBUG: Columns after join:", paste(colnames(.), collapse = ", "), "\n") . } %>% mutate( # ✅ FIX: Use lowercase 'gene_name' as it comes from anno_df GeneName = ifelse(is.na(gene_name) | gene_name == "", GeneID_clean, gene_name), padj_plot = ifelse(is.na(padj) | padj == 0, 1e-305, padj) ) %>% select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot) %>% distinct(GeneID, .keep_all = TRUE) %>% arrange(padj, desc(abs(log2FoldChange))) cat("✅ Annotation step completed successfully.\n") # --- END DEBUG BLOCK --- # Filter Up/Down regulated genes up_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF) %>% arrange(desc(log2FoldChange)) down_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF) %>% arrange(log2FoldChange) # --- Write 3-Sheet Excel --- wb <- createWorkbook() addWorksheet(wb, "Complete_Results") writeData(wb, "Complete_Results", res_annot) setColWidths(wb, "Complete_Results", cols = 1:2, widths = c(18, 25)) setColWidths(wb, "Complete_Results", cols = 3:ncol(res_annot), widths = "auto") addWorksheet(wb, "Up_Regulated") writeData(wb, "Up_Regulated", if(nrow(up_genes)>0) up_genes else data.frame(Message="None meet thresholds")) if(nrow(up_genes)>0) { setColWidths(wb, "Up_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Up_Regulated", cols = 3:ncol(up_genes), widths = "auto") } addWorksheet(wb, "Down_Regulated") writeData(wb, "Down_Regulated", if(nrow(down_genes)>0) down_genes else data.frame(Message="None meet thresholds")) if(nrow(down_genes)>0) { setColWidths(wb, "Down_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Down_Regulated", cols = 3:ncol(down_genes), widths = "auto") } saveWorkbook(wb, file.path(OUTPUT_DIR, sprintf("DEG_%s.xlsx", comp_name)), overwrite = TRUE) # --- Volcano Plot (Labels use GeneName) --- plot_data <- res_annot %>% mutate( sig_cat = case_when( padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF ~ "Up", padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF ~ "Down", TRUE ~ "NS" ), label = ifelse(sig_cat != "NS" & rank(padj) <= VOLCANO_N, GeneName, "") ) p <- ggplot(plot_data, aes(x = log2FoldChange, y = -log10(padj_plot))) + geom_point(aes(color = sig_cat), alpha = 0.6, size = 1.0) + scale_color_manual(values = c("Up" = "#E41A1C", "Down" = "#377EB8", "NS" = "grey70"), name = "Expression") + geom_text_repel(data = filter(plot_data, label != ""), aes(label = label), size = 2.2, max.overlaps = 50, box.padding = 0.3, segment.color = "grey40") + geom_vline(xintercept = c(-LFC_CUTOFF, LFC_CUTOFF), linetype = "dashed", alpha = 0.4) + geom_hline(yintercept = -log10(PADJ_CUTOFF), linetype = "dashed", alpha = 0.4) + labs(title = gsub("_", " ", comp_name), subtitle = sprintf("padj ≤ %s | |log₂FC| ≥ %s", PADJ_CUTOFF, LFC_CUTOFF), caption = sprintf("Total: %d | Up: %d | Down: %d", nrow(plot_data), nrow(up_genes), nrow(down_genes)), x = "log2 Fold Change", y = "-log10 Adjusted P-value") + theme_minimal(base_size = 10) + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 11), plot.subtitle = element_text(hjust = 0.5, size = 9), plot.caption = element_text(size = 8), legend.position = "right", panel.grid.minor = element_blank()) ggsave(file.path(OUTPUT_DIR, sprintf("Volcano_%s.png", comp_name)), p, width = 9, height = 7, dpi = 300, bg = "white") summary_list[[comp_name]] <- list(name = comp_name, total = nrow(res_annot), up = nrow(up_genes), down = nrow(down_genes)) cat(sprintf(" ✅ Excel + Volcano saved (Up: %d, Down: %d)\n", nrow(up_genes), nrow(down_genes))) } # 7. SUMMARY REPORT ------------------------------------------------------------ summary_df <- do.call(rbind, lapply(summary_list, as.data.frame)) %>% mutate(sig_total = up + down, pct_sig = round(100 * sig_total / total, 1)) %>% arrange(desc(sig_total)) write.csv(summary_df, file.path(OUTPUT_DIR, "DEG_Summary_All_31.csv"), row.names = FALSE) cat("\n", paste(rep("=", 80), collapse = ""), "\n") cat("📊 FINAL SUMMARY OF ALL 31 COMPARISONS\n") print(summary_df, row.names = FALSE) cat(sprintf("\n✨ All files saved to: %s\n", normalizePath(OUTPUT_DIR))) cat("📁 Each comparison contains:\n") cat(" 1️⃣ DEG_.xlsx (3 sheets: Complete_Results, Up_Regulated, Down_Regulated)\n") cat(" 2️⃣ Volcano_.png (GeneName labels for top significant genes)\n")