#============================================================================== # COMPLETE PIPELINE: Merged Host-Virus Counts -> DESeq2 -> 6 Comparisons # Includes: GTF Annotation, rlogTransformation, PCA Plot, Excel Export, Volcano Plots # Virus genes colored in GREEN, Top 150 genes labeled #============================================================================== # 1. LOAD LIBRARIES ------------------------------------------------------------ suppressPackageStartupMessages({ library(DESeq2) library(dplyr) library(readr) library(stringr) library(openxlsx) library(ggplot2) library(ggrepel) }) # 2. SET WORKING DIRECTORY & READ COUNTS --------------------------------------- setwd("~/DATA/Data_Nina_RNAseq_2026/") # Read the merged count table (Human + VZV) cat("📖 Reading merged_gene_counts.csv...\n") d.raw <- read.csv("merged_gene_counts.csv", header=TRUE, row.names=1) # 3. READ GTF FOR ANNOTATION --------------------------------------------------- cat("📖 Reading genome_genes.gtf for gene annotation...\n") gtf <- read_tsv("results_GRCh38/genome/genome_genes.gtf", comment = "#", col_names = FALSE, show_col_types = FALSE) gene_map <- gtf %>% filter(X3 == "gene") %>% mutate( gene_id = str_extract(X9, '(?<=gene_id ")[^"]+'), gene_name = str_extract(X9, '(?<=gene_name ")[^"]+') ) %>% select(gene_id, gene_name) %>% distinct() # Fallback to gene_id if gene_name is missing (common for virus genes) gene_map$gene_name[is.na(gene_map$gene_name)] <- gene_map$gene_id[is.na(gene_map$gene_name)] cat(sprintf("✅ Loaded %d gene annotations from GTF.\n", nrow(gene_map))) # 4. DEFINE METADATA (COLDATA) ----------------------------------------------- samples <- c("control_r1","control_r2","control_r3", "VZV.d10_r1","VZV.d10_r2","VZV.d10_r3", "VZV.d15_r1","VZV.d15_r2","VZV.d15_r3", "VZV.d20_r1","VZV.d20_r2","VZV.d20_r3") # Ensure count matrix columns match the samples order d.raw <- d.raw[, samples] replicate <- factor(c("r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3")) # Set "control" as the reference level for fold change calculations condition <- factor(c("control", "control", "control", "VZV.d10", "VZV.d10", "VZV.d10", "VZV.d15", "VZV.d15", "VZV.d15", "VZV.d20", "VZV.d20", "VZV.d20"), levels = c("control", "VZV.d10", "VZV.d15", "VZV.d20")) colData <- data.frame(condition = condition, replicate = replicate, row.names = samples) # 5. BUILD DESEQ2 DATASET & RUN PIPELINE ------------------------------------- cat("🚀 Building DESeqDataSet and running DESeq...\n") dds <- DESeqDataSetFromMatrix(countData = round(d.raw), # Ensure integers colData = colData, design = ~ condition) # Pre-filter low-count genes (Improves speed and statistical power) keep <- rowSums(counts(dds) >= 10) >= 3 dds <- dds[keep, ] dds <- DESeq(dds) # 6. RLOG TRANSFORMATION (For PCA / Clustering) ------------------------------- cat("📊 Performing rlog transformation...\n") rld <- rlogTransformation(dds) # Save RData for later use save(dds, rld, file = "DESeq2_VZV_Merged_RData.RData") cat("✅ Saved DESeq2 object to DESeq2_VZV_Merged_RData.RData\n") # 7. DEFINE ALL 6 COMPARISONS ------------------------------------------------ planned_comparisons <- list( "01_VZV.d10_vs_control" = c("condition", "VZV.d10", "control"), "02_VZV.d15_vs_control" = c("condition", "VZV.d15", "control"), "03_VZV.d20_vs_control" = c("condition", "VZV.d20", "control"), "04_VZV.d15_vs_VZV.d10" = c("condition", "VZV.d15", "VZV.d10"), "05_VZV.d20_vs_VZV.d10" = c("condition", "VZV.d20", "VZV.d10"), "06_VZV.d20_vs_VZV.d15" = c("condition", "VZV.d20", "VZV.d15") ) # 8. PROCESSING LOOP (Excel + Volcano) ----------------------------------------- OUTPUT_DIR <- "DEG_Results_VZV_Merged" dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE) PADJ_CUTOFF <- 0.05 LFC_CUTOFF <- 2.0 # |log2FC| >= 2 VOLCANO_N <- 150 # Label top 150 genes summary_list <- list() cat("\n🚀 PROCESSING", length(planned_comparisons), "COMPARISONS\n") cat(sprintf("Thresholds: padj <= %s, |log2FC| >= %s\n", PADJ_CUTOFF, LFC_CUTOFF)) cat(sprintf("Labeling top %d significant genes in volcano plots\n", VOLCANO_N)) cat(paste(rep("=", 80), collapse = ""), "\n") for (comp_name in names(planned_comparisons)) { contrast_vec <- planned_comparisons[[comp_name]] idx <- which(names(planned_comparisons) == comp_name) cat(sprintf("\n[%02d/%02d] %s\n", idx, length(planned_comparisons), comp_name)) # Validate groups if (!all(contrast_vec[2:3] %in% levels(colData(dds)$condition))) { warning(sprintf(" ⚠️ Skipping: '%s' or '%s' not in colData$condition", contrast_vec[2], contrast_vec[3])) next } # Extract DESeq2 results res <- results(dds, contrast = contrast_vec, alpha = PADJ_CUTOFF, independentFiltering = TRUE) res <- res[!is.na(res$log2FoldChange), ] res_df <- as.data.frame(res) res_df$GeneID <- rownames(res_df) # --- ANNOTATION & VIRUS IDENTIFICATION --- res_annot <- res_df %>% left_join(gene_map, by = c("GeneID" = "gene_id")) %>% mutate( GeneName = ifelse(is.na(gene_name), GeneID, gene_name), padj = ifelse(is.na(padj), 1, padj), padj_plot = ifelse(padj == 0, 1e-300, padj), # Virus genes are those that do NOT start with "ENSG" (Ensembl ID) is_virus = !grepl("^ENSG", GeneID) ) %>% select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot, is_virus) %>% distinct(GeneID, .keep_all = TRUE) %>% arrange(padj, desc(abs(log2FoldChange))) cat(" ✅ Annotation step completed successfully.\n") # Filter Up/Down regulated genes up_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF) %>% arrange(desc(log2FoldChange)) down_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF) %>% arrange(log2FoldChange) # --- Write 3-Sheet Excel --- wb <- createWorkbook() addWorksheet(wb, "Complete_Results") writeData(wb, "Complete_Results", res_annot) setColWidths(wb, "Complete_Results", cols = 1:2, widths = c(18, 25)) setColWidths(wb, "Complete_Results", cols = 3:ncol(res_annot), widths = "auto") addWorksheet(wb, "Up_Regulated") writeData(wb, "Up_Regulated", if(nrow(up_genes)>0) up_genes else data.frame(Message="None meet thresholds")) if(nrow(up_genes)>0) { setColWidths(wb, "Up_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Up_Regulated", cols = 3:ncol(up_genes), widths = "auto") } addWorksheet(wb, "Down_Regulated") writeData(wb, "Down_Regulated", if(nrow(down_genes)>0) down_genes else data.frame(Message="None meet thresholds")) if(nrow(down_genes)>0) { setColWidths(wb, "Down_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Down_Regulated", cols = 3:ncol(down_genes), widths = "auto") } saveWorkbook(wb, file.path(OUTPUT_DIR, sprintf("DEG_%s.xlsx", comp_name)), overwrite = TRUE) # --- Volcano Plot with Virus Genes in Green --- plot_data <- res_annot %>% mutate( sig_cat = case_when( is_virus ~ "Virus", padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF ~ "Host_Up", padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF ~ "Host_Down", TRUE ~ "Host_NS" ), label = ifelse(sig_cat != "Host_NS" & rank(padj) <= VOLCANO_N, GeneName, "") ) # FIX: Plot Host genes first, then Virus genes on top so they aren't hidden at (0,0) p <- ggplot(plot_data, aes(x = log2FoldChange, y = -log10(padj_plot))) + # 1. Plot Host genes first (background) geom_point(data = filter(plot_data, sig_cat != "Virus"), aes(color = sig_cat), alpha = 0.6, size = 1.0) + # 2. Plot Virus genes on top (foreground) geom_point(data = filter(plot_data, sig_cat == "Virus"), color = "green", alpha = 0.8, size = 1.5) + scale_color_manual( name = "Expression", values = c( "Host_Up" = "#E41A1C", "Host_Down" = "#377EB8", "Host_NS" = "grey70", "Virus" = "green" ), labels = c( "Host_Up" = "Host Up-regulated", "Host_Down" = "Host Down-regulated", "Host_NS" = "Not Significant", "Virus" = "Virus RNA" ) ) + geom_text_repel(data = filter(plot_data, label != ""), aes(label = label), size = 2.2, max.overlaps = 50, box.padding = 0.3, segment.color = "grey40") + geom_vline(xintercept = c(-LFC_CUTOFF, LFC_CUTOFF), linetype = "dashed", alpha = 0.4) + geom_hline(yintercept = -log10(PADJ_CUTOFF), linetype = "dashed", alpha = 0.4) + labs(title = gsub("_", " ", comp_name), subtitle = sprintf("padj ≤ %s | |log₂FC| ≥ %s | Top %d genes labeled", PADJ_CUTOFF, LFC_CUTOFF, VOLCANO_N), caption = sprintf("Total: %d | Up: %d | Down: %d | Virus: %d", nrow(plot_data), nrow(up_genes), nrow(down_genes), sum(res_annot$is_virus)), x = "log2 Fold Change", y = "-log10 Adjusted P-value") + theme_minimal(base_size = 11) + theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12), plot.subtitle = element_text(hjust = 0.5, size = 10), plot.caption = element_text(size = 9), legend.position = "right", legend.title = element_text(size = 10), legend.text = element_text(size = 9), panel.grid.minor = element_blank()) ggsave(file.path(OUTPUT_DIR, sprintf("Volcano_%s.png", comp_name)), p, width = 10, height = 8, dpi = 300, bg = "white") summary_list[[comp_name]] <- list( name = comp_name, total = nrow(res_annot), up = nrow(up_genes), down = nrow(down_genes), virus = sum(res_annot$is_virus) ) cat(sprintf(" ✅ Excel + Volcano saved (Up: %d, Down: %d, Virus genes: %d)\n", nrow(up_genes), nrow(down_genes), sum(res_annot$is_virus))) } # 9. SUMMARY REPORT ------------------------------------------------------------ summary_df <- do.call(rbind, lapply(summary_list, as.data.frame)) %>% mutate(sig_total = up + down, pct_sig = round(100 * sig_total / total, 1)) %>% arrange(desc(sig_total)) write.csv(summary_df, file.path(OUTPUT_DIR, "DEG_Summary_All_6.csv"), row.names = FALSE) cat("\n", paste(rep("=", 80), collapse = ""), "\n") cat("📊 FINAL SUMMARY OF ALL 6 COMPARISONS\n") print(summary_df, row.names = FALSE) cat(sprintf("\n✨ All files saved to: %s\n", normalizePath(OUTPUT_DIR))) # 10. PCA PLOT (Using rlog transformed data) ------------------------------------ cat("📊 Generating PCA plot...\n") pcaData <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE) p_pca <- ggplot(pcaData, aes(x = PC1, y = PC2, color = condition, shape = replicate)) + geom_point(size=3) + xlab(paste0("PC1: ", format(100 * attr(pcaData, "percentVar")[1], digits=3), "% variance")) + ylab(paste0("PC2: ", format(100 * attr(pcaData, "percentVar")[2], digits=3), "% variance")) + theme_minimal(base_size = 12) + theme(plot.title = element_text(hjust = 0.5, face = "bold")) + labs(title = "PCA Plot (Merged Host-Virus rlog Data)", color = "Condition", shape = "Replicate") ggsave(file.path(OUTPUT_DIR, "PCA_Merged_rlog.png"), p_pca, width = 8, height = 6, dpi = 300, bg = "white") cat("✅ PCA plot saved to PCA_Merged_rlog.png\n") cat("\n🎉 ALL ANALYSES COMPLETED SUCCESSFULLY!\n") cat("📁 Output files:\n") cat(" - 6 Excel files (DEG_*.xlsx) with 3 sheets each\n") cat(" - 6 Volcano plots (Volcano_*.png) with top 150 genes labeled\n") cat(" - 1 PCA plot (PCA_Merged_rlog.png)\n") cat(" - 1 Summary file (DEG_Summary_All_6.csv)\n")