# ============================================================================= # Script: manhattan_plot_MKL1.R # ============================================================================= library(ggplot2) library(dplyr) library(tidyr) library(ggrepel) library(openxlsx) # 1. Set the correct path to where exceRpt saved the output # (Change this to the ACTUAL path found by the `find` command above) output_dir <- "~/DATA/Data_Ute_smallRNA_via_exceRpt_workspace/summaries_MKL-1/" input_file <- file.path(output_dir, "exceRpt_miRNA_ReadCounts.txt") if (!file.exists(input_file)) { stop("āŒ Input file not found at: ", input_file, "\nšŸ’” Tip: Run `find ~/DATA/Data_Ute_smallRNA_via_exceRpt_workspace -name exceRpt_miRNA_ReadCounts.txt` in terminal to locate it.") } cat("āœ… Successfully loaded:", input_file, "\n") d.raw <- read.delim(input_file, sep = "\t", header = TRUE, row.names = 1, check.names = FALSE) # Safely convert all columns to numeric d.raw[] <- lapply(d.raw, as.numeric) # 3. Define sample groups for MKL-1 mk1_cells <- c("nf780", "nf796", "nf797") mk1_ev <- c("2404_MKL1_wt_EVs", "2608_MKL1_wt_EVs") # Check if samples exist in the dataset all_samples <- c(mk1_cells, mk1_ev) missing_samples <- setdiff(all_samples, colnames(d.raw)) if (length(missing_samples) > 0) { warning("āš ļø Missing samples in data file: ", paste(missing_samples, collapse = ", ")) mk1_cells <- intersect(mk1_cells, colnames(d.raw)) mk1_ev <- intersect(mk1_ev, colnames(d.raw)) } if (length(mk1_cells) == 0 | length(mk1_ev) == 0) stop("āŒ Not enough samples to create groups.") # 4. Calculate RPM correctly (per sample normalization) d.sub <- d.raw[, c(mk1_cells, mk1_ev), drop = FALSE] lib_sizes <- colSums(d.sub) d.rpm <- sweep(d.sub, 2, lib_sizes, FUN = "/") * 1e6 # 5. Aggregate by group (Mean RPM) df_agg <- data.frame( miRNA = rownames(d.rpm), MKL1_wt_cells = rowMeans(d.rpm[, mk1_cells, drop = FALSE], na.rm = TRUE), MKL1_wt_EV = rowMeans(d.rpm[, mk1_ev, drop = FALSE], na.rm = TRUE), stringsAsFactors = FALSE ) # 6. Reshape for plotting df_long <- df_agg %>% pivot_longer(cols = -miRNA, names_to = "sample", values_to = "RPM") %>% mutate(logRPM = log10(RPM + 1)) %>% # +1 to handle zeros arrange(miRNA) %>% group_by(sample) %>% mutate(Position = row_number()) %>% ungroup() # 7. Define custom miRNAs to label custom_mirnas <- c("hsa-miR-10b-5p", "hsa-miR-21-5p", "hsa-miR-1246", "hsa-miR-182-5p", "hsa-miR-183-5p", "hsa-miR-30a-5p", "hsa-miR-30d-5p", "hsa-miR-200c-3p") df_long$display_label <- df_long$miRNA # default: show original name ## Family/cluster mapping for cleaner labels #family_map <- list( # "miR-30-Family" = c("hsa-miR-30a-5p", "hsa-miR-30d-5p"), # "miR-182/183-Cluster" = c("hsa-miR-182-5p", "hsa-miR-183-5p") #) #for (label in names(family_map)) { # df_long$display_label[df_long$miRNA %in% family_map[[label]]] <- label #} # 8. Select miRNAs to highlight (custom + top 10 by mean RPM) top_auto <- df_long %>% group_by(miRNA) %>% summarise(mean_RPM = mean(RPM, na.rm = TRUE)) %>% arrange(desc(mean_RPM)) %>% slice_head(n = 10) %>% pull(miRNA) highlight_list <- unique(c(custom_mirnas, top_auto)) highlight_list <- highlight_list[highlight_list %in% df_long$miRNA] # 9. Print & save verification table cat("\n=== miRNAs TO BE LABELED IN PLOT ===\n") labeled_info <- df_long %>% filter(miRNA %in% highlight_list) %>% select(miRNA, display_label, sample, RPM, logRPM) %>% arrange(display_label, miRNA, sample) %>% distinct() print(labeled_info, n = 50) write.xlsx(labeled_info, file = "labeled_miRNAs_MKL1_verification.xlsx", rowNames = FALSE) cat("āœ… Saved verification table: labeled_miRNAs_MKL1_verification.xlsx\n") # 10. Assign colors df_long$color <- ifelse(df_long$miRNA %in% highlight_list, "red", "darkblue") # 11. Generate Manhattan-style plot p <- ggplot(df_long, aes(x = Position, y = logRPM, color = color)) + scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) + geom_jitter(width = 0.3, alpha = 0.8) + geom_text_repel( data = df_long %>% filter(miRNA %in% highlight_list), aes(label = display_label), box.padding = 0.6, point.padding = 0.5, segment.color = "grey50", size = 3.5, max.overlaps = Inf, color = "black", force = 2, force_pull = 1, label.padding = unit(0.25, "lines") ) + labs(x = "", y = "log10(Reads Per Million + 1) (RPM)") + facet_wrap(~sample, scales = "free_x", ncol = 2, labeller = labeller(sample = c("MKL1_wt_cells" = "MKL-1 wt Cells", "MKL1_wt_EV" = "MKL-1 wt EV"))) + theme_minimal(base_size = 14) + theme( panel.grid.major = element_line(color = "grey90", size = 0.5), panel.grid.minor = element_line(color = "grey95", size = 0.2), axis.text.x = element_blank(), axis.ticks.x = element_blank(), legend.position = "none", strip.text = element_text(size = 14, face = "bold"), panel.background = element_rect(fill = "white", color = NA) ) # 12. Export ggsave("manhattan_plot_MKL1_vs_EV.png", plot = p, width = 6.5, height = 10, dpi = 200) ggsave("manhattan_plot_MKL1_vs_EV.svg", plot = p, width = 6.5, height = 10) write.xlsx(df_long %>% select(miRNA, display_label, sample, Position, RPM, logRPM, color), file = "manhattan_plot_MKL1_data.xlsx", rowNames = FALSE) cat("\nāœ… Successfully generated:\n") cat(" šŸ“Š manhattan_plot_MKL1_vs_EV.png & .svg\n") cat(" šŸ“ˆ manhattan_plot_MKL1_data.xlsx\n") cat(" šŸ·ļø labeled_miRNAs_MKL1_verification.xlsx\n")