# =============================================================================
# Script: manhattan_plot_top_miRNAs_based_on_mean_RPM.R
# Description: Generates a Manhattan-style plot highlighting the top 5 miRNAs
#              with the highest mean RPM across five selected samples.
# Input:     exceRpt_miRNA_ReadCounts.txt (tab-delimited raw counts)
# Output:    manhattan_plot_top_miRNAs_based_on_mean_RPM.png
# =============================================================================

# 1. Load required libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggrepel)

# 2. Set working directory and load raw count data
# Note: Update the path if your file is stored in a different location.
# The output PNG (step 11) is written with a relative file name, so it is
# created inside this directory as well.
setwd("~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/")

counts_file <- "exceRpt_miRNA_ReadCounts.txt"
if (!file.exists(counts_file)) {
  stop("Input file not found in ", getwd(), ": ", counts_file, call. = FALSE)
}

# read.delim() (decimal point ".") is used instead of read.delim2() (decimal
# comma ","). With read.delim2(), a value written with a decimal point
# (e.g. 12.5) is not parsed as a number and turns its whole column into text,
# which breaks the colSums() call in step 4.
# The first column holds the miRNA identifiers and becomes the row names.
d.raw <- read.delim(counts_file, sep = "\t", header = TRUE, row.names = 1)

# 3. Select relevant samples for plotting
# Focuses on parental cells and untreated controls as originally specified
target_samples <- c("parental_cells_1", "parental_cells_2", "parental_cells_3",
                    "untreated_1", "untreated_2")

# Fail early with a readable message instead of the generic
# "undefined columns selected" error when a sample column is absent.
missing_samples <- setdiff(target_samples, colnames(d.raw))
if (length(missing_samples) > 0) {
  stop("Sample column(s) not found in the count table: ",
       paste(missing_samples, collapse = ", "), call. = FALSE)
}
d.raw_5 <- d.raw[, target_samples, drop = FALSE]

# All selected columns must be numeric for the normalization below.
non_numeric <- target_samples[!vapply(d.raw_5, is.numeric, logical(1))]
if (length(non_numeric) > 0) {
  stop("Sample column(s) were not read as numeric: ",
       paste(non_numeric, collapse = ", "), call. = FALSE)
}

# 4. Calculate Reads Per Million (RPM) for library size normalization
# Each column is divided by its own total count and scaled to one million.
total_counts <- colSums(d.raw_5)

# A library with zero (or missing) total reads would produce NaN/NA RPM
# values and silently corrupt the ranking of miRNAs further down.
bad_totals <- !is.finite(total_counts) | total_counts <= 0
if (any(bad_totals)) {
  stop("Cannot compute RPM; total read count is zero or missing for: ",
       paste(names(total_counts)[bad_totals], collapse = ", "), call. = FALSE)
}
RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6

# 5. Convert wide-format data to long-format for ggplot2 compatibility
# The miRNA identifiers live in the row names; copy them into a regular
# column so they survive the reshape.
RPM$miRNA <- rownames(RPM)

df <- RPM %>%
  # One row per miRNA/sample pair; the value column is named "RPM"
  pivot_longer(cols = -miRNA, names_to = "sample", values_to = "RPM") %>%
  # 6. Apply log10 transformation to RPM values (add 1 to avoid log(0))
  mutate(logRPM = log10(RPM + 1)) %>%
  # 7. Assign x-axis positions for each miRNA within each sample.
  # Sorting by miRNA first gives every miRNA the same position in all facets.
  arrange(miRNA) %>%
  group_by(sample) %>%
  mutate(Position = row_number()) %>%
  # Drop the grouping so later steps do not operate per sample by accident
  ungroup()

# 8. Rank miRNAs by their mean RPM over all samples and keep the five highest
mean_rpm_per_mirna <- summarise(group_by(df, miRNA), mean_RPM = mean(RPM))
ranked_mirnas <- arrange(mean_rpm_per_mirna, desc(mean_RPM))
top_mirnas <- pull(head(ranked_mirnas, 5), miRNA)

# 9. Point colour: red marks the top 5 miRNAs, darkblue everything else
is_top_mirna <- df$miRNA %in% top_mirnas
df$color <- ifelse(is_top_mirna, "red", "darkblue")

# 10. Human-readable facet titles, keyed by the sample column names
sample_labels <- setNames(
  c("Parental cell 1", "Parental cell 2", "Parental cell 3",
    "Untreated 1", "Untreated 2"),
  c("parental_cells_1", "parental_cells_2", "parental_cells_3",
    "untreated_1", "untreated_2")
)

# 11. Generate and save the Manhattan plot
# One facet per sample; x is the per-sample miRNA position, y is log10(RPM + 1).
output_file <- "manhattan_plot_top_miRNAs_based_on_mean_RPM.png"

manhattan_plot <- ggplot(df, aes(x = Position, y = logRPM, color = color)) +
  scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
  # Jitter horizontally only; height = 0 keeps the plotted y-values exact
  geom_jitter(width = 0.4, height = 0) +
  # Label only the top miRNAs; the fixed black colour overrides the mapping
  geom_text_repel(
    data = df %>% filter(miRNA %in% top_mirnas),
    aes(label = miRNA),
    box.padding = 0.5,
    point.padding = 0.5,
    segment.color = "gray50",
    size = 5,
    max.overlaps = 8,
    color = "black"
  ) +
  # The axis label states the transformation actually applied in step 6
  labs(x = "", y = "log10(RPM + 1)") +
  facet_wrap(~sample, scales = "free_x", ncol = 5,
             labeller = labeller(sample = sample_labels)) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    text = element_text(size = 16),
    axis.title = element_text(size = 18),
    strip.text = element_text(size = 16, face = "bold"),
    panel.spacing = unit(1.5, "lines")
  )

# ggplot objects are only drawn when printed. Auto-printing does not happen
# when this file is run through source(), so print() is called explicitly;
# otherwise the PNG would be empty. The device is closed even if rendering
# fails, and invisible() prevents a second auto-print to another device.
png(output_file, width = 1200, height = 1200)
invisible(tryCatch(print(manhattan_plot), finally = dev.off()))

# 12. Print confirmation message (only if the file was really written)
if (file.exists(output_file)) {
  cat("Successfully generated: ", output_file, "\n", sep = "")
} else {
  stop("Plot file was not created: ", output_file, call. = FALSE)
}