# ============================================================================== # CORRECTED PIPELINE: 6 Groups (Urine/AUM/MH ± AZI) -> DESeq2 -> Annotation -> 9 Comparisons # Fixed: drop_na() namespace error & NA p-value handling # ============================================================================== # 1. LOAD LIBRARIES ------------------------------------------------------------ suppressPackageStartupMessages({ library(tximport) library(DESeq2) library(dplyr) library(stringr) library(readr) library(openxlsx) library(ggplot2) library(ggrepel) }) # 2. SET WORKING DIRECTORY & DEFINE SAMPLES ------------------------------------ setwd("/mnt/md1/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_on_ATCC19606/results/star_salmon") files <- c( "AUM-AZI_r1" = "./AUM-AZI_r1/quant.sf", "AUM-AZI_r2" = "./AUM-AZI_r2/quant.sf", "AUM-AZI_r3" = "./AUM-AZI_r3/quant.sf", "AUM_r1" = "./AUM_r1/quant.sf", "AUM_r2" = "./AUM_r2/quant.sf", "AUM_r3" = "./AUM_r3/quant.sf", "MH-AZI_r1" = "./MH-AZI_r1/quant.sf", "MH-AZI_r2" = "./MH-AZI_r2/quant.sf", "MH-AZI_r3" = "./MH-AZI_r3/quant.sf", "MH_r1" = "./MH_r1/quant.sf", "MH_r2" = "./MH_r2/quant.sf", "MH_r3" = "./MH_r3/quant.sf", "Urine-AZI_r1" = "./Urine-AZI_r1/quant.sf", "Urine-AZI_r2" = "./Urine-AZI_r2/quant.sf", "Urine-AZI_r3" = "./Urine-AZI_r3/quant.sf", "Urine_r1" = "./Urine_r1/quant.sf", "Urine_r2" = "./Urine_r2/quant.sf", "Urine_r3" = "./Urine_r3/quant.sf" ) # ----------------------------------------------------------------- # ---- Step 1: Parse Metadata from Simplified Sample Names ---- # ----------------------------------------------------------------- samples <- names(files) metadata <- data.frame(sample = samples, stringsAsFactors = FALSE) # Extract Media (Urine, AUM, MH) metadata$media <- gsub("[-_].*", "", samples) # Extract Treatment (AZI or Control) metadata$treatment <- ifelse(grepl("AZI", samples), "AZI", "Control") # Extract Replicate number metadata$replicate <- as.numeric(gsub(".*r", "", samples)) # Create combined group factor for DESeq2 metadata$group <- factor(paste(metadata$media, metadata$treatment, sep = "_")) # Set row names and reorder to match txi columns later rownames(metadata) <- metadata$sample # ----------------------------------------------------------------- # ---- Step 2: Build tx2gene Mapping & Import Quantifications ---- # ----------------------------------------------------------------- ANNOT_FILE <- "/mnt/md1/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_on_ATCC19606/results/genome/CP059040_m.txt" cat("📖 Parsing annotation file to build tx2gene mapping...\n") gff <- read_tsv(ANNOT_FILE, col_names = FALSE, show_col_types = FALSE) # ✅ FIXED: Replaced drop_na() with dplyr::filter() to avoid missing package error tx2gene <- gff %>% select(X9) %>% mutate( TXNAME = str_extract(X9, '(?<=transcript_id ")[^"]+'), GENEID = str_extract(X9, '(?<=gene_id ")[^"]+') ) %>% select(TXNAME, GENEID) %>% filter(!is.na(TXNAME) & !is.na(GENEID)) cat(sprintf("✅ Created tx2gene mapping with %d entries.\n", nrow(tx2gene))) cat("📥 Importing Salmon quantifications...\n") txi <- tximport(files, type = "salmon", tx2gene = tx2gene) # Reorder metadata to match imported columns metadata <- metadata[colnames(txi$counts), ] # ----------------------------------------------------------------- # ---- Step 3: Build & Run DESeq2 ---- # ----------------------------------------------------------------- dds <- DESeqDataSetFromTximport(txi, colData = metadata, design = ~ group) # Pre-filter low-count genes keep <- rowSums(counts(dds) >= 10) >= 3 dds <- dds[keep, ] cat("🚀 Running DESeq2 pipeline...\n") dds <- DESeq(dds) cat("✅ Available groups in model:\n") print(levels(colData(dds)$group)) # ============================================================================== # 4. DEFINE MEANINGFUL COMPARISONS (9 Total) # ============================================================================== planned_comparisons <- list( "01_Urine_AZI_vs_Control" = c("group", "Urine_AZI", "Urine_Control"), "02_AUM_AZI_vs_Control" = c("group", "AUM_AZI", "AUM_Control"), "03_MH_AZI_vs_Control" = c("group", "MH_AZI", "MH_Control"), "04_AUM_vs_Urine_Control" = c("group", "AUM_Control", "Urine_Control"), "05_MH_vs_Urine_Control" = c("group", "MH_Control", "Urine_Control"), "06_MH_vs_AUM_Control" = c("group", "MH_Control", "AUM_Control"), "07_AUM_vs_Urine_AZI" = c("group", "AUM_AZI", "Urine_AZI"), "08_MH_vs_Urine_AZI" = c("group", "MH_AZI", "Urine_AZI"), "09_MH_vs_AUM_AZI" = c("group", "MH_AZI", "AUM_AZI") ) # ============================================================================== # 5. PROCESSING LOOP (Excel + Volcano) ----------------------------------------- OUTPUT_DIR <- "DEG_Results_6Groups" dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE) PADJ_CUTOFF <- 0.05 LFC_CUTOFF <- 2.0 VOLCANO_N <- 40 # Prepare annotation dataframe for gene names cat("📖 Parsing annotation file for gene names...\n") anno_df <- gff %>% mutate( gene_id = str_extract(X9, '(?<=gene_id ")[^"]+'), gene_name = str_extract(X9, '(?<=gene_name ")[^"]+'), locus_tag = str_extract(X9, '(?<=locus_tag ")[^"]+') ) %>% filter(!is.na(gene_id)) %>% distinct(gene_id, .keep_all = TRUE) %>% mutate( gene_name = ifelse(is.na(gene_name) | gene_name == "", locus_tag, gene_name), gene_id_clean = gsub("^gene-", "", gene_id) ) %>% select(gene_id, gene_id_clean, gene_name) summary_list <- list() cat("\n🚀 PROCESSING", length(planned_comparisons), "COMPARISONS\n") cat(sprintf("Thresholds: padj <= %s | |log2FC| >= %s\n", PADJ_CUTOFF, LFC_CUTOFF)) cat(paste(rep("=", 80), collapse = ""), "\n") for (comp_name in names(planned_comparisons)) { contrast_vec <- planned_comparisons[[comp_name]] idx <- which(names(planned_comparisons) == comp_name) cat(sprintf("\n[%02d/9] %s\n", idx, comp_name)) if (!all(contrast_vec[2:3] %in% levels(colData(dds)$group))) { warning(sprintf(" ⚠️ Skipping: '%s' or '%s' not in colData$group", contrast_vec[2], contrast_vec[3])) next } res <- results(dds, contrast = contrast_vec, alpha = PADJ_CUTOFF, independentFiltering = TRUE) res <- res[!is.na(res$log2FoldChange), ] res_df <- as.data.frame(res) res_df$GeneID <- rownames(res_df) res_df$GeneID_clean <- gsub("^gene-", "", res_df$GeneID) res_annot <- res_df %>% left_join(anno_df, by = c("GeneID_clean" = "gene_id_clean")) %>% mutate( GeneName = ifelse(is.na(gene_name) | gene_name == "", GeneID_clean, gene_name), # 🔧 UPDATE: Replace NA p-values/padj with 1 (treated as non-significant) pvalue = ifelse(is.na(pvalue), 1, pvalue), padj = ifelse(is.na(padj), 1, padj), # 📉 Plot value: replace 0 with small number to avoid -log10(0) = Inf padj_plot = ifelse(padj == 0, 1e-305, padj) ) %>% select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot) %>% distinct(GeneID, .keep_all = TRUE) %>% arrange(padj, desc(abs(log2FoldChange))) up_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF) %>% arrange(desc(log2FoldChange)) down_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF) %>% arrange(log2FoldChange) wb <- createWorkbook() addWorksheet(wb, "Complete_Results") writeData(wb, "Complete_Results", res_annot) setColWidths(wb, "Complete_Results", cols = 1:2, widths = c(18, 25)) setColWidths(wb, "Complete_Results", cols = 3:ncol(res_annot), widths = "auto") addWorksheet(wb, "Up_Regulated") writeData(wb, "Up_Regulated", if(nrow(up_genes)>0) up_genes else data.frame(Message="None meet thresholds")) addWorksheet(wb, "Down_Regulated") writeData(wb, "Down_Regulated", if(nrow(down_genes)>0) down_genes else data.frame(Message="None meet thresholds")) saveWorkbook(wb, file.path(OUTPUT_DIR, sprintf("DEG_%s.xlsx", comp_name)), overwrite = TRUE) plot_data <- res_annot %>% mutate( sig_cat = case_when( padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF ~ "Up", padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF ~ "Down", TRUE ~ "NS" ), label = ifelse(sig_cat != "NS" & rank(padj) <= VOLCANO_N, GeneName, "") ) p <- ggplot(plot_data, aes(x = log2FoldChange, y = -log10(padj_plot))) + geom_point(aes(color = sig_cat), alpha = 0.6, size = 1.0) + scale_color_manual(values = c("Up" = "#E41A1C", "Down" = "#377EB8", "NS" = "grey70"), name = "Expression") + geom_text_repel(data = filter(plot_data, label != ""), aes(label = label), size = 2.2, max.overlaps = 30, box.padding = 0.3, segment.color = "grey40") + geom_vline(xintercept = c(-LFC_CUTOFF, LFC_CUTOFF), linetype = "dashed", alpha = 0.4) + geom_hline(yintercept = -log10(PADJ_CUTOFF), linetype = "dashed", alpha = 0.4) + labs(title = gsub("_", " ", comp_name), subtitle = sprintf("padj ≤ %s | |log₂FC| ≥ %s", PADJ_CUTOFF, LFC_CUTOFF), caption = sprintf("Total: %d | Up: %d | Down: %d", nrow(plot_data), nrow(up_genes), nrow(down_genes)), x = "log2 Fold Change", y = "-log10 Adjusted P-value") + theme_minimal(base_size = 10) + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 11), plot.subtitle = element_text(hjust = 0.5, size = 9), plot.caption = element_text(size = 8), legend.position = "right", panel.grid.minor = element_blank()) ggsave(file.path(OUTPUT_DIR, sprintf("Volcano_%s.png", comp_name)), p, width = 8, height = 6, dpi = 300, bg = "white") summary_list[[comp_name]] <- list(name = comp_name, total = nrow(res_annot), up = nrow(up_genes), down = nrow(down_genes)) cat(sprintf(" ✅ Excel + Volcano saved (Up: %d, Down: %d)\n", nrow(up_genes), nrow(down_genes))) } # 6. SUMMARY REPORT ------------------------------------------------------------ summary_df <- do.call(rbind, lapply(summary_list, as.data.frame)) %>% mutate(sig_total = up + down, pct_sig = round(100 * sig_total / total, 1)) %>% arrange(desc(sig_total)) write.csv(summary_df, file.path(OUTPUT_DIR, "DEG_Summary_All_9.csv"), row.names = FALSE) cat("\n", paste(rep("=", 80), collapse = ""), "\n") cat("📊 FINAL SUMMARY OF ALL 9 COMPARISONS\n") print(summary_df, row.names = FALSE) cat(sprintf("\n✨ All files saved to: %s\n", normalizePath(OUTPUT_DIR)))