#==============================================================================
# COMPLETE PIPELINE: Merged Host-Virus Counts -> DESeq2 -> 6 Comparisons
# Includes: GTF Annotation, rlogTransformation, PCA Plot, Excel Export, Volcano Plots
# Virus genes colored in GREEN, Top 150 genes labeled
#==============================================================================

# 1. LOAD LIBRARIES ------------------------------------------------------------
suppressPackageStartupMessages({
  library(DESeq2)
  library(dplyr)
  library(readr)
  library(stringr)
  library(openxlsx)
  library(ggplot2)
  library(ggrepel)
})

# 2. SET WORKING DIRECTORY & READ COUNTS ---------------------------------------
setwd("~/DATA/Data_Nina_RNAseq_2026/")

# Read the merged count table (Human + VZV)
cat("📖 Reading merged_gene_counts.csv...\n")
d.raw <- read.csv("merged_gene_counts.csv", header=TRUE, row.names=1)

# 3. READ GTF FOR ANNOTATION ---------------------------------------------------
cat("📖 Reading genome_genes.gtf for gene annotation...\n")
gtf <- read_tsv("results_GRCh38/genome/genome_genes.gtf", comment = "#", col_names = FALSE, show_col_types = FALSE)

gene_map <- gtf %>%
  filter(X3 == "gene") %>%
  mutate(
    gene_id = str_extract(X9, '(?<=gene_id ")[^"]+'),
    gene_name = str_extract(X9, '(?<=gene_name ")[^"]+')
  ) %>%
  select(gene_id, gene_name) %>%
  distinct()

# Fallback to gene_id if gene_name is missing (common for virus genes)
gene_map$gene_name[is.na(gene_map$gene_name)] <- gene_map$gene_id[is.na(gene_map$gene_name)]
cat(sprintf("✅ Loaded %d gene annotations from GTF.\n", nrow(gene_map)))

# 4. DEFINE METADATA (COLDATA) -----------------------------------------------
samples <- c("control_r1","control_r2","control_r3",
             "VZV.d10_r1","VZV.d10_r2","VZV.d10_r3",
             "VZV.d15_r1","VZV.d15_r2","VZV.d15_r3",
             "VZV.d20_r1","VZV.d20_r2","VZV.d20_r3")

# Ensure count matrix columns match the samples order
d.raw <- d.raw[, samples]

replicate <- factor(c("r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3", "r1", "r2", "r3"))
# Set "control" as the reference level for fold change calculations
condition <- factor(c("control", "control", "control",
                      "VZV.d10", "VZV.d10", "VZV.d10",
                      "VZV.d15", "VZV.d15", "VZV.d15",
                      "VZV.d20", "VZV.d20", "VZV.d20"),
                    levels = c("control", "VZV.d10", "VZV.d15", "VZV.d20"))

colData <- data.frame(condition = condition, replicate = replicate, row.names = samples)

# 5. BUILD DESEQ2 DATASET & RUN PIPELINE -------------------------------------
cat("🚀 Building DESeqDataSet and running DESeq...\n")
dds <- DESeqDataSetFromMatrix(countData = round(d.raw), # Ensure integers
                              colData = colData,
                              design = ~ condition)

# Pre-filter low-count genes (Improves speed and statistical power)
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep, ]

dds <- DESeq(dds)

# 6. RLOG TRANSFORMATION (For PCA / Clustering) -------------------------------
cat("📊 Performing rlog transformation...\n")
rld <- rlogTransformation(dds)

# Save RData for later use
save(dds, rld, file = "DESeq2_VZV_Merged_RData.RData")
cat("✅ Saved DESeq2 object to DESeq2_VZV_Merged_RData.RData\n")

# 7. DEFINE ALL 6 COMPARISONS ------------------------------------------------
planned_comparisons <- list(
  "01_VZV.d10_vs_control" = c("condition", "VZV.d10", "control"),
  "02_VZV.d15_vs_control" = c("condition", "VZV.d15", "control"),
  "03_VZV.d20_vs_control" = c("condition", "VZV.d20", "control"),
  "04_VZV.d15_vs_VZV.d10" = c("condition", "VZV.d15", "VZV.d10"),
  "05_VZV.d20_vs_VZV.d10" = c("condition", "VZV.d20", "VZV.d10"),
  "06_VZV.d20_vs_VZV.d15" = c("condition", "VZV.d20", "VZV.d15")
)

# 8. PROCESSING LOOP (Excel + Volcano) -----------------------------------------
OUTPUT_DIR   <- "DEG_Results_VZV_Merged"
dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE)
PADJ_CUTOFF  <- 0.05
LFC_CUTOFF   <- 2.0  # |log2FC| >= 2
VOLCANO_N    <- 150  # Label top 150 genes

summary_list  <- list()
cat("\n🚀 PROCESSING", length(planned_comparisons), "COMPARISONS\n")
cat(sprintf("Thresholds: padj <= %s, |log2FC| >= %s\n", PADJ_CUTOFF, LFC_CUTOFF))
cat(sprintf("Labeling top %d significant genes in volcano plots\n", VOLCANO_N))
cat(paste(rep("=", 80), collapse = ""), "\n")

for (comp_name in names(planned_comparisons)) {
  contrast_vec <- planned_comparisons[[comp_name]]
  idx <- which(names(planned_comparisons) == comp_name)
  cat(sprintf("\n[%02d/%02d] %s\n", idx, length(planned_comparisons), comp_name))

  # Validate groups
  if (!all(contrast_vec[2:3] %in% levels(colData(dds)$condition))) {
    warning(sprintf("  ⚠️ Skipping: '%s' or '%s' not in colData$condition", contrast_vec[2], contrast_vec[3]))
    next
  }

  # Extract DESeq2 results
  res <- results(dds, contrast = contrast_vec, alpha = PADJ_CUTOFF, independentFiltering = TRUE)
  res <- res[!is.na(res$log2FoldChange), ]
  res_df <- as.data.frame(res)
  res_df$GeneID <- rownames(res_df)

  # --- ANNOTATION & VIRUS IDENTIFICATION ---
  res_annot <- res_df %>%
    left_join(gene_map, by = c("GeneID" = "gene_id")) %>%
    mutate(
      GeneName = ifelse(is.na(gene_name), GeneID, gene_name),
      padj = ifelse(is.na(padj), 1, padj),
      padj_plot = ifelse(padj == 0, 1e-300, padj),
      # Virus genes are those that do NOT start with "ENSG" (Ensembl ID)
      is_virus = !grepl("^ENSG", GeneID)
    ) %>%
    select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot, is_virus) %>%
    distinct(GeneID, .keep_all = TRUE) %>%
    arrange(padj, desc(abs(log2FoldChange)))

  cat("  ✅ Annotation step completed successfully.\n")

  # Filter Up/Down regulated genes
  up_genes   <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF) %>% arrange(desc(log2FoldChange))
  down_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF) %>% arrange(log2FoldChange)

  # --- Write 3-Sheet Excel ---
  wb <- createWorkbook()
  addWorksheet(wb, "Complete_Results")
  writeData(wb, "Complete_Results", res_annot)
  setColWidths(wb, "Complete_Results", cols = 1:2, widths = c(18, 25))
  setColWidths(wb, "Complete_Results", cols = 3:ncol(res_annot), widths = "auto")

  addWorksheet(wb, "Up_Regulated")
  writeData(wb, "Up_Regulated", if(nrow(up_genes)>0) up_genes else data.frame(Message="None meet thresholds"))
  if(nrow(up_genes)>0) { setColWidths(wb, "Up_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Up_Regulated", cols = 3:ncol(up_genes), widths = "auto") }

  addWorksheet(wb, "Down_Regulated")
  writeData(wb, "Down_Regulated", if(nrow(down_genes)>0) down_genes else data.frame(Message="None meet thresholds"))
  if(nrow(down_genes)>0) { setColWidths(wb, "Down_Regulated", cols = 1:2, widths = c(18, 25)); setColWidths(wb, "Down_Regulated", cols = 3:ncol(down_genes), widths = "auto") }

  saveWorkbook(wb, file.path(OUTPUT_DIR, sprintf("DEG_%s.xlsx", comp_name)), overwrite = TRUE)

  # --- Volcano Plot with Virus Genes in Green ---
  plot_data  <- res_annot %>%
    mutate(
      sig_cat = case_when(
        is_virus ~ "Virus",
        padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF ~ "Host_Up",
        padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF ~ "Host_Down",
        TRUE ~ "Host_NS"
      ),
      label = ifelse(sig_cat != "Host_NS" & rank(padj) <= VOLCANO_N, GeneName, "")
    )

  # FIX: Plot Host genes first, then Virus genes on top so they aren't hidden at (0,0)
  p <- ggplot(plot_data, aes(x = log2FoldChange, y = -log10(padj_plot))) +
    # 1. Plot Host genes first (background)
    geom_point(data = filter(plot_data, sig_cat != "Virus"),
               aes(color = sig_cat), alpha = 0.6, size = 1.0) +

    # 2. Plot Virus genes on top (foreground)
    geom_point(data = filter(plot_data, sig_cat == "Virus"),
               color = "green", alpha = 0.8, size = 1.5) +

    scale_color_manual(
      name = "Expression",
      values = c(
        "Host_Up" = "#E41A1C",
        "Host_Down" = "#377EB8",
        "Host_NS" = "grey70",
        "Virus" = "green"
      ),
      labels = c(
        "Host_Up" = "Host Up-regulated",
        "Host_Down" = "Host Down-regulated",
        "Host_NS" = "Not Significant",
        "Virus" = "Virus RNA"
      )
    ) +
    geom_text_repel(data = filter(plot_data, label != ""), aes(label = label),
                    size = 2.2, max.overlaps = 50, box.padding = 0.3, segment.color = "grey40") +
    geom_vline(xintercept = c(-LFC_CUTOFF, LFC_CUTOFF), linetype = "dashed", alpha = 0.4) +
    geom_hline(yintercept = -log10(PADJ_CUTOFF), linetype = "dashed", alpha = 0.4) +
    labs(title = gsub("_", " ", comp_name),
         subtitle = sprintf("padj ≤ %s | |log₂FC| ≥ %s | Top %d genes labeled", PADJ_CUTOFF, LFC_CUTOFF, VOLCANO_N),
         caption = sprintf("Total: %d | Up: %d | Down: %d | Virus: %d",
                           nrow(plot_data), nrow(up_genes), nrow(down_genes), sum(res_annot$is_virus)),
         x = "log2 Fold Change", y = "-log10 Adjusted P-value") +
    theme_minimal(base_size = 11) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 12),
          plot.subtitle = element_text(hjust = 0.5, size = 10),
          plot.caption = element_text(size = 9),
          legend.position = "right",
          legend.title = element_text(size = 10),
          legend.text = element_text(size = 9),
          panel.grid.minor = element_blank())

  ggsave(file.path(OUTPUT_DIR, sprintf("Volcano_%s.png", comp_name)), p, width = 10, height = 8, dpi = 300, bg = "white")

  summary_list[[comp_name]] <- list(
    name = comp_name,
    total = nrow(res_annot),
    up = nrow(up_genes),
    down = nrow(down_genes),
    virus = sum(res_annot$is_virus)
  )

  cat(sprintf("  ✅ Excel + Volcano saved (Up: %d, Down: %d, Virus genes: %d)\n",
              nrow(up_genes), nrow(down_genes), sum(res_annot$is_virus)))
}

# 9. SUMMARY REPORT ------------------------------------------------------------
summary_df <- do.call(rbind, lapply(summary_list, as.data.frame)) %>%
  mutate(sig_total = up + down, pct_sig = round(100 * sig_total / total, 1)) %>%
  arrange(desc(sig_total))

write.csv(summary_df, file.path(OUTPUT_DIR, "DEG_Summary_All_6.csv"), row.names = FALSE)

cat("\n", paste(rep("=", 80), collapse = ""), "\n")
cat("📊 FINAL SUMMARY OF ALL 6 COMPARISONS\n")
print(summary_df, row.names = FALSE)
cat(sprintf("\n✨ All files saved to: %s\n", normalizePath(OUTPUT_DIR)))

# 10. PCA PLOT (Using rlog transformed data) ------------------------------------
cat("📊 Generating PCA plot...\n")
pcaData <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)

p_pca <- ggplot(pcaData, aes(x = PC1, y = PC2, color = condition, shape = replicate)) +
  geom_point(size=3) +
  xlab(paste0("PC1: ", format(100 * attr(pcaData, "percentVar")[1], digits=3), "% variance")) +
  ylab(paste0("PC2: ", format(100 * attr(pcaData, "percentVar")[2], digits=3), "% variance")) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(hjust = 0.5, face = "bold")) +
  labs(title = "PCA Plot (Merged Host-Virus rlog Data)", color = "Condition", shape = "Replicate")

ggsave(file.path(OUTPUT_DIR, "PCA_Merged_rlog.png"), p_pca, width = 8, height = 6, dpi = 300, bg = "white")
cat("✅ PCA plot saved to PCA_Merged_rlog.png\n")

cat("\n🎉 ALL ANALYSES COMPLETED SUCCESSFULLY!\n")
cat("📁 Output files:\n")
cat("   - 6 Excel files (DEG_*.xlsx) with 3 sheets each\n")
cat("   - 6 Volcano plots (Volcano_*.png) with top 150 genes labeled\n")
cat("   - 1 PCA plot (PCA_Merged_rlog.png)\n")
cat("   - 1 Summary file (DEG_Summary_All_6.csv)\n")