# ==============================================================================
# CORRECTED PIPELINE: 6 Groups (Urine/AUM/MH ± AZI) -> DESeq2 -> Annotation -> 9 Comparisons
# Fixed: drop_na() namespace error & NA p-value handling
# ==============================================================================

# 1. LOAD LIBRARIES ------------------------------------------------------------
suppressPackageStartupMessages({
  library(tximport)
  library(DESeq2)
  library(dplyr)
  library(stringr)
  library(readr)
  library(openxlsx)
  library(ggplot2)
  library(ggrepel)
})

# 2. SET WORKING DIRECTORY & DEFINE SAMPLES ------------------------------------
setwd("/mnt/md1/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_on_ATCC19606/results/star_salmon")

files <- c(
  "AUM-AZI_r1" = "./AUM-AZI_r1/quant.sf",
  "AUM-AZI_r2" = "./AUM-AZI_r2/quant.sf",
  "AUM-AZI_r3" = "./AUM-AZI_r3/quant.sf",
  "AUM_r1"     = "./AUM_r1/quant.sf",
  "AUM_r2"     = "./AUM_r2/quant.sf",
  "AUM_r3"     = "./AUM_r3/quant.sf",
  "MH-AZI_r1"  = "./MH-AZI_r1/quant.sf",
  "MH-AZI_r2"  = "./MH-AZI_r2/quant.sf",
  "MH-AZI_r3"  = "./MH-AZI_r3/quant.sf",
  "MH_r1"      = "./MH_r1/quant.sf",
  "MH_r2"      = "./MH_r2/quant.sf",
  "MH_r3"      = "./MH_r3/quant.sf",
  "Urine-AZI_r1" = "./Urine-AZI_r1/quant.sf",
  "Urine-AZI_r2" = "./Urine-AZI_r2/quant.sf",
  "Urine-AZI_r3" = "./Urine-AZI_r3/quant.sf",
  "Urine_r1"     = "./Urine_r1/quant.sf",
  "Urine_r2"     = "./Urine_r2/quant.sf",
  "Urine_r3"     = "./Urine_r3/quant.sf"
)

# -----------------------------------------------------------------
# ---- Step 1: Parse Metadata from Simplified Sample Names ----
# -----------------------------------------------------------------
samples <- names(files)
metadata <- data.frame(sample = samples, stringsAsFactors = FALSE)

# Extract Media (Urine, AUM, MH)
metadata$media <- gsub("[-_].*", "", samples)

# Extract Treatment (AZI or Control)
metadata$treatment <- ifelse(grepl("AZI", samples), "AZI", "Control")

# Extract Replicate number
metadata$replicate <- as.numeric(gsub(".*r", "", samples))

# Create combined group factor for DESeq2
metadata$group <- factor(paste(metadata$media, metadata$treatment, sep = "_"))

# Set row names and reorder to match txi columns later
rownames(metadata) <- metadata$sample

# -----------------------------------------------------------------
# ---- Step 2: Build tx2gene Mapping & Import Quantifications ----
# -----------------------------------------------------------------
ANNOT_FILE <- "/mnt/md1/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine_on_ATCC19606/results/genome/CP059040_m.txt"

cat("📖 Parsing annotation file to build tx2gene mapping...\n")
gff <- read_tsv(ANNOT_FILE, col_names = FALSE, show_col_types = FALSE)

# ✅ FIXED: Replaced drop_na() with dplyr::filter() to avoid missing package error
tx2gene <- gff %>%
  select(X9) %>%
  mutate(
    TXNAME = str_extract(X9, '(?<=transcript_id ")[^"]+'),
    GENEID = str_extract(X9, '(?<=gene_id ")[^"]+')
  ) %>%
  select(TXNAME, GENEID) %>%
  filter(!is.na(TXNAME) & !is.na(GENEID))

cat(sprintf("✅ Created tx2gene mapping with %d entries.\n", nrow(tx2gene)))

cat("📥 Importing Salmon quantifications...\n")
txi <- tximport(files, type = "salmon", tx2gene = tx2gene)

# Reorder metadata to match imported columns
metadata <- metadata[colnames(txi$counts), ]

# -----------------------------------------------------------------
# ---- Step 3: Build & Run DESeq2 ----
# -----------------------------------------------------------------
dds <- DESeqDataSetFromTximport(txi, colData = metadata, design = ~ group)

# Pre-filter low-count genes
keep <- rowSums(counts(dds) >= 10) >= 3
dds <- dds[keep, ]

cat("🚀 Running DESeq2 pipeline...\n")
dds <- DESeq(dds)

cat("✅ Available groups in model:\n")
print(levels(colData(dds)$group))

# ==============================================================================
# 4. DEFINE MEANINGFUL COMPARISONS (9 Total)
# ==============================================================================
planned_comparisons <- list(
  "01_Urine_AZI_vs_Control"       = c("group", "Urine_AZI",     "Urine_Control"),
  "02_AUM_AZI_vs_Control"         = c("group", "AUM_AZI",       "AUM_Control"),
  "03_MH_AZI_vs_Control"          = c("group", "MH_AZI",        "MH_Control"),
  "04_AUM_vs_Urine_Control"       = c("group", "AUM_Control",   "Urine_Control"),
  "05_MH_vs_Urine_Control"        = c("group", "MH_Control",    "Urine_Control"),
  "06_MH_vs_AUM_Control"          = c("group", "MH_Control",    "AUM_Control"),
  "07_AUM_vs_Urine_AZI"           = c("group", "AUM_AZI",       "Urine_AZI"),
  "08_MH_vs_Urine_AZI"            = c("group", "MH_AZI",        "Urine_AZI"),
  "09_MH_vs_AUM_AZI"              = c("group", "MH_AZI",        "AUM_AZI")
)

# ==============================================================================
# 5. PROCESSING LOOP (Excel + Volcano) -----------------------------------------
OUTPUT_DIR   <- "DEG_Results_6Groups"
dir.create(OUTPUT_DIR, showWarnings = FALSE, recursive = TRUE)
PADJ_CUTOFF  <- 0.05
LFC_CUTOFF   <- 2.0
VOLCANO_N    <- 40

# Prepare annotation dataframe for gene names
cat("📖 Parsing annotation file for gene names...\n")
anno_df <- gff %>%
  mutate(
    gene_id   = str_extract(X9, '(?<=gene_id ")[^"]+'),
    gene_name = str_extract(X9, '(?<=gene_name ")[^"]+'),
    locus_tag = str_extract(X9, '(?<=locus_tag ")[^"]+')
  ) %>%
  filter(!is.na(gene_id)) %>%
  distinct(gene_id, .keep_all = TRUE) %>%
  mutate(
    gene_name     = ifelse(is.na(gene_name) | gene_name == "", locus_tag, gene_name),
    gene_id_clean = gsub("^gene-", "", gene_id)
  ) %>%
  select(gene_id, gene_id_clean, gene_name)

summary_list <- list()
cat("\n🚀 PROCESSING", length(planned_comparisons), "COMPARISONS\n")
cat(sprintf("Thresholds: padj <= %s | |log2FC| >= %s\n", PADJ_CUTOFF, LFC_CUTOFF))
cat(paste(rep("=", 80), collapse = ""), "\n")

for (comp_name in names(planned_comparisons)) {
  contrast_vec <- planned_comparisons[[comp_name]]
  idx <- which(names(planned_comparisons) == comp_name)
  cat(sprintf("\n[%02d/9] %s\n", idx, comp_name))

  if (!all(contrast_vec[2:3] %in% levels(colData(dds)$group))) {
    warning(sprintf("  ⚠️ Skipping: '%s' or '%s' not in colData$group", contrast_vec[2], contrast_vec[3]))
    next
  }

  res <- results(dds, contrast = contrast_vec, alpha = PADJ_CUTOFF, independentFiltering = TRUE)
  res <- res[!is.na(res$log2FoldChange), ]
  res_df <- as.data.frame(res)
  res_df$GeneID <- rownames(res_df)
  res_df$GeneID_clean <- gsub("^gene-", "", res_df$GeneID)

  res_annot <- res_df %>%
    left_join(anno_df, by = c("GeneID_clean" = "gene_id_clean")) %>%
    mutate(
      GeneName  = ifelse(is.na(gene_name) | gene_name == "", GeneID_clean, gene_name),
      # 🔧 UPDATE: Replace NA p-values/padj with 1 (treated as non-significant)
      pvalue    = ifelse(is.na(pvalue), 1, pvalue),
      padj      = ifelse(is.na(padj), 1, padj),
      # 📉 Plot value: replace 0 with small number to avoid -log10(0) = Inf
      padj_plot = ifelse(padj == 0, 1e-305, padj)
    ) %>%
    select(GeneID, GeneName, baseMean, log2FoldChange, lfcSE, stat, pvalue, padj, padj_plot) %>%
    distinct(GeneID, .keep_all = TRUE) %>%
    arrange(padj, desc(abs(log2FoldChange)))

  up_genes   <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF) %>% arrange(desc(log2FoldChange))
  down_genes <- res_annot %>% filter(padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF) %>% arrange(log2FoldChange)

  wb <- createWorkbook()
  addWorksheet(wb, "Complete_Results")
  writeData(wb, "Complete_Results", res_annot)
  setColWidths(wb, "Complete_Results", cols = 1:2, widths = c(18, 25))
  setColWidths(wb, "Complete_Results", cols = 3:ncol(res_annot), widths = "auto")

  addWorksheet(wb, "Up_Regulated")
  writeData(wb, "Up_Regulated", if(nrow(up_genes)>0) up_genes else data.frame(Message="None meet thresholds"))
  addWorksheet(wb, "Down_Regulated")
  writeData(wb, "Down_Regulated", if(nrow(down_genes)>0) down_genes else data.frame(Message="None meet thresholds"))

  saveWorkbook(wb, file.path(OUTPUT_DIR, sprintf("DEG_%s.xlsx", comp_name)), overwrite = TRUE)

  plot_data <- res_annot %>%
    mutate(
      sig_cat = case_when(
        padj <= PADJ_CUTOFF & log2FoldChange >= LFC_CUTOFF ~ "Up",
        padj <= PADJ_CUTOFF & log2FoldChange <= -LFC_CUTOFF ~ "Down",
        TRUE ~ "NS"
      ),
      label = ifelse(sig_cat != "NS" & rank(padj) <= VOLCANO_N, GeneName, "")
    )

  p <- ggplot(plot_data, aes(x = log2FoldChange, y = -log10(padj_plot))) +
    geom_point(aes(color = sig_cat), alpha = 0.6, size = 1.0) +
    scale_color_manual(values = c("Up" = "#E41A1C", "Down" = "#377EB8", "NS" = "grey70"), name = "Expression") +
    geom_text_repel(data = filter(plot_data, label != ""), aes(label = label),
                    size = 2.2, max.overlaps = 30, box.padding = 0.3, segment.color = "grey40") +
    geom_vline(xintercept = c(-LFC_CUTOFF, LFC_CUTOFF), linetype = "dashed", alpha = 0.4) +
    geom_hline(yintercept = -log10(PADJ_CUTOFF), linetype = "dashed", alpha = 0.4) +
    labs(title = gsub("_", " ", comp_name),
         subtitle = sprintf("padj ≤ %s | |log₂FC| ≥ %s", PADJ_CUTOFF, LFC_CUTOFF),
         caption = sprintf("Total: %d | Up: %d | Down: %d", nrow(plot_data), nrow(up_genes), nrow(down_genes)),
         x = "log2 Fold Change", y = "-log10 Adjusted P-value") +
    theme_minimal(base_size = 10) +
    theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 11),
          plot.subtitle = element_text(hjust = 0.5, size = 9),
          plot.caption = element_text(size = 8),
          legend.position = "right", panel.grid.minor = element_blank())

  ggsave(file.path(OUTPUT_DIR, sprintf("Volcano_%s.png", comp_name)), p, width = 8, height = 6, dpi = 300, bg = "white")

  summary_list[[comp_name]] <- list(name = comp_name, total = nrow(res_annot), up = nrow(up_genes), down = nrow(down_genes))
  cat(sprintf("  ✅ Excel + Volcano saved (Up: %d, Down: %d)\n", nrow(up_genes), nrow(down_genes)))
}

# 6. SUMMARY REPORT ------------------------------------------------------------
summary_df <- do.call(rbind, lapply(summary_list, as.data.frame)) %>%
  mutate(sig_total = up + down, pct_sig = round(100 * sig_total / total, 1)) %>%
  arrange(desc(sig_total))

write.csv(summary_df, file.path(OUTPUT_DIR, "DEG_Summary_All_9.csv"), row.names = FALSE)
cat("\n", paste(rep("=", 80), collapse = ""), "\n")
cat("📊 FINAL SUMMARY OF ALL 9 COMPARISONS\n")
print(summary_df, row.names = FALSE)
cat(sprintf("\n✨ All files saved to: %s\n", normalizePath(OUTPUT_DIR)))