# =============================================================================
# Script: manhattan_plot_MKL1.R
# =============================================================================
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggrepel)
library(openxlsx)

# 1. Set the correct path to where exceRpt saved the output
#    (Change this to the ACTUAL path found by the `find` command above)
output_dir <- "~/DATA/Data_Ute_smallRNA_via_exceRpt_workspace/summaries_MKL-1/"
input_file <- file.path(output_dir, "exceRpt_miRNA_ReadCounts.txt")

if (!file.exists(input_file)) {
  stop("❌ Input file not found at: ", input_file,
       "\n💡 Tip: Run `find ~/DATA/Data_Ute_smallRNA_via_exceRpt_workspace -name exceRpt_miRNA_ReadCounts.txt` in terminal to locate it.")
}

cat("✅ Successfully loaded:", input_file, "\n")

d.raw <- read.delim(input_file, sep = "\t", header = TRUE, row.names = 1, check.names = FALSE)
# Safely convert all columns to numeric
d.raw[] <- lapply(d.raw, as.numeric)

# 3. Define sample groups for MKL-1
mk1_cells <- c("nf780", "nf796", "nf797")
mk1_ev    <- c("2404_MKL1_wt_EVs", "2608_MKL1_wt_EVs")

# Check if samples exist in the dataset
all_samples <- c(mk1_cells, mk1_ev)
missing_samples <- setdiff(all_samples, colnames(d.raw))
if (length(missing_samples) > 0) {
  warning("⚠️ Missing samples in data file: ", paste(missing_samples, collapse = ", "))
  mk1_cells <- intersect(mk1_cells, colnames(d.raw))
  mk1_ev    <- intersect(mk1_ev, colnames(d.raw))
}
if (length(mk1_cells) == 0 | length(mk1_ev) == 0) stop("❌ Not enough samples to create groups.")

# 4. Calculate RPM correctly (per sample normalization)
d.sub <- d.raw[, c(mk1_cells, mk1_ev), drop = FALSE]
lib_sizes <- colSums(d.sub)
d.rpm <- sweep(d.sub, 2, lib_sizes, FUN = "/") * 1e6

# 5. Aggregate by group (Mean RPM)
df_agg <- data.frame(
  miRNA = rownames(d.rpm),
  MKL1_wt_cells = rowMeans(d.rpm[, mk1_cells, drop = FALSE], na.rm = TRUE),
  MKL1_wt_EV    = rowMeans(d.rpm[, mk1_ev, drop = FALSE], na.rm = TRUE),
  stringsAsFactors = FALSE
)

# 6. Reshape for plotting
df_long <- df_agg %>%
  pivot_longer(cols = -miRNA, names_to = "sample", values_to = "RPM") %>%
  mutate(logRPM = log10(RPM + 1)) %>%          # +1 to handle zeros
  arrange(miRNA) %>%
  group_by(sample) %>%
  mutate(Position = row_number()) %>%
  ungroup()

# 7. Define custom miRNAs to label
custom_mirnas <- c("hsa-miR-10b-5p", "hsa-miR-21-5p", "hsa-miR-1246",
                   "hsa-miR-182-5p", "hsa-miR-183-5p",
                   "hsa-miR-30a-5p", "hsa-miR-30d-5p",
                   "hsa-miR-200c-3p")
df_long$display_label <- df_long$miRNA  # default: show original name

## Family/cluster mapping for cleaner labels
#family_map <- list(
#  "miR-30-Family" = c("hsa-miR-30a-5p", "hsa-miR-30d-5p"),
#  "miR-182/183-Cluster" = c("hsa-miR-182-5p", "hsa-miR-183-5p")
#)

#for (label in names(family_map)) {
#  df_long$display_label[df_long$miRNA %in% family_map[[label]]] <- label
#}

# 8. Select miRNAs to highlight (custom + top 10 by mean RPM)
top_auto <- df_long %>%
  group_by(miRNA) %>%
  summarise(mean_RPM = mean(RPM, na.rm = TRUE)) %>%
  arrange(desc(mean_RPM)) %>%
  slice_head(n = 10) %>%
  pull(miRNA)

highlight_list <- unique(c(custom_mirnas, top_auto))
highlight_list <- highlight_list[highlight_list %in% df_long$miRNA]

# 9. Print & save verification table
cat("\n=== miRNAs TO BE LABELED IN PLOT ===\n")
labeled_info <- df_long %>%
  filter(miRNA %in% highlight_list) %>%
  select(miRNA, display_label, sample, RPM, logRPM) %>%
  arrange(display_label, miRNA, sample) %>%
  distinct()
print(labeled_info, n = 50)

write.xlsx(labeled_info, file = "labeled_miRNAs_MKL1_verification.xlsx", rowNames = FALSE)
cat("✅ Saved verification table: labeled_miRNAs_MKL1_verification.xlsx\n")

# 10. Assign colors
df_long$color <- ifelse(df_long$miRNA %in% highlight_list, "red", "darkblue")

# 11. Generate Manhattan-style plot
p <- ggplot(df_long, aes(x = Position, y = logRPM, color = color)) +
  scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
  geom_jitter(width = 0.3, alpha = 0.8) +
  geom_text_repel(
    data = df_long %>% filter(miRNA %in% highlight_list),
    aes(label = display_label),
    box.padding = 0.6, point.padding = 0.5,
    segment.color = "grey50", size = 3.5, max.overlaps = Inf,
    color = "black", force = 2, force_pull = 1,
    label.padding = unit(0.25, "lines")
  ) +
  labs(x = "", y = "log10(Reads Per Million + 1) (RPM)") +
  facet_wrap(~sample, scales = "free_x", ncol = 2,
             labeller = labeller(sample = c("MKL1_wt_cells" = "MKL-1 wt Cells",
                                            "MKL1_wt_EV" = "MKL-1 wt EV"))) +
  theme_minimal(base_size = 14) +
  theme(
    panel.grid.major = element_line(color = "grey90", size = 0.5),
    panel.grid.minor = element_line(color = "grey95", size = 0.2),
    axis.text.x = element_blank(), axis.ticks.x = element_blank(),
    legend.position = "none",
    strip.text = element_text(size = 14, face = "bold"),
    panel.background = element_rect(fill = "white", color = NA)
  )

# 12. Export
ggsave("manhattan_plot_MKL1_vs_EV.png", plot = p, width = 6.5, height = 10, dpi = 200)
ggsave("manhattan_plot_MKL1_vs_EV.svg", plot = p, width = 6.5, height = 10)

write.xlsx(df_long %>% select(miRNA, display_label, sample, Position, RPM, logRPM, color),
           file = "manhattan_plot_MKL1_data.xlsx", rowNames = FALSE)

cat("\n✅ Successfully generated:\n")
cat("   📊 manhattan_plot_MKL1_vs_EV.png & .svg\n")
cat("   📈 manhattan_plot_MKL1_data.xlsx\n")
cat("   🏷️  labeled_miRNAs_MKL1_verification.xlsx\n")