# =============================================================================
# Script: manhattan_plot_top_miRNAs_based_on_mean_RPM.R
# Description: Generates a Manhattan-style plot highlighting the top 5 miRNAs
#              with the highest mean RPM across five selected samples.
# Input:  exceRpt_miRNA_ReadCounts.txt (tab-delimited raw counts)
# Output: manhattan_plot_top_miRNAs_based_on_mean_RPM.png
# =============================================================================

# 1. Load required libraries ----
library(ggplot2)
library(dplyr)
library(tidyr)
library(ggrepel)

# 2. Set working directory and load raw count data ----
# Note: Update the path if your file is stored in a different location.
# Both the input table and the output PNG are resolved relative to this
# directory.
setwd("~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/")

# NOTE(review): read.delim2() treats "," as the decimal mark. If the count
# table contains fractional values written with "." they will be read as
# text, which the numeric check in step 3 reports explicitly. Switch to
# read.delim() in that case -- confirm against the actual file.
d.raw <- read.delim2(
  "exceRpt_miRNA_ReadCounts.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1
)

# 3. Select relevant samples for plotting ----
# Focuses on parental cells and untreated controls as originally specified
target_samples <- c(
  "parental_cells_1",
  "parental_cells_2",
  "parental_cells_3",
  "untreated_1",
  "untreated_2"
)

# Fail early with a readable message instead of "undefined columns selected".
missing_samples <- setdiff(target_samples, colnames(d.raw))
if (length(missing_samples) > 0) {
  stop(
    "Sample column(s) not found in the count table: ",
    paste(missing_samples, collapse = ", "),
    call. = FALSE
  )
}

# drop = FALSE keeps a data frame regardless of how many samples are selected.
d.raw_5 <- d.raw[, target_samples, drop = FALSE]

# colSums() and the RPM division below require numeric columns.
non_numeric_samples <- target_samples[
  !vapply(d.raw_5, is.numeric, logical(1))
]
if (length(non_numeric_samples) > 0) {
  stop(
    "Non-numeric count column(s) (check the decimal separator): ",
    paste(non_numeric_samples, collapse = ", "),
    call. = FALSE
  )
}

# 4. Calculate Reads Per Million (RPM) for library size normalization ----
total_counts <- colSums(d.raw_5)

# A zero or NA library size would turn every RPM of that sample into NaN/NA,
# which in turn makes every per-miRNA mean NaN/NA and the top-5 ranking
# meaningless.
bad_library <- !is.finite(total_counts) | total_counts <= 0
if (any(bad_library)) {
  stop(
    "Cannot compute RPM; total read count is zero or missing for: ",
    paste(names(total_counts)[bad_library], collapse = ", "),
    call. = FALSE
  )
}

# Divide each column by its own library size, then scale to per-million.
RPM <- sweep(d.raw_5, 2, total_counts, FUN = "/") * 1e6

# 5. Convert wide-format data to long-format for ggplot2 compatibility ----
RPM$miRNA <- rownames(RPM)
df <- pivot_longer(
  RPM,
  cols = -miRNA,
  names_to = "sample",
  values_to = "RPM"
)

# 6. Apply log10 transformation to RPM values (add 1 to avoid log(0)) ----
df <- df %>%
  mutate(logRPM = log10(RPM + 1))

# 7. Assign x-axis positions for each miRNA within each sample ----
# Sorting by miRNA first gives every miRNA the same Position in every facet.
# ungroup() so the per-sample grouping does not leak into later steps.
df <- df %>%
  arrange(miRNA) %>%
  group_by(sample) %>%
  mutate(Position = row_number()) %>%
  ungroup()

# 8. Identify the top 5 miRNAs with the highest mean RPM across all samples ----
top_mirnas <- df %>%
  group_by(miRNA) %>%
  summarise(mean_RPM = mean(RPM)) %>%
  arrange(desc(mean_RPM)) %>%
  head(5) %>%
  pull(miRNA)

# 9. Assign colors: red for top 5 miRNAs, darkblue for all others ----
df <- df %>%
  mutate(color = ifelse(miRNA %in% top_mirnas, "red", "darkblue"))

# 10. Define readable sample labels for plot facets ----
sample_labels <- c(
  "parental_cells_1" = "Parental cell 1",
  "parental_cells_2" = "Parental cell 2",
  "parental_cells_3" = "Parental cell 3",
  "untreated_1" = "Untreated 1",
  "untreated_2" = "Untreated 2"
)

# 11. Generate and save the Manhattan plot ----
manhattan_plot <- ggplot(df, aes(x = Position, y = logRPM, color = color)) +
  scale_color_manual(values = c("red" = "red", "darkblue" = "darkblue")) +
  # height = 0: jitter horizontally only, so plotted y values stay exactly
  # the computed logRPM (geom_jitter otherwise also jitters vertically).
  geom_jitter(width = 0.4, height = 0) +
  # Label only the top miRNAs; labels are drawn in black regardless of the
  # point colour.
  geom_text_repel(
    data = df %>% filter(miRNA %in% top_mirnas),
    aes(label = miRNA),
    box.padding = 0.5,
    point.padding = 0.5,
    segment.color = "gray50",
    size = 5,
    max.overlaps = 8,
    color = "black"
  ) +
  labs(x = "", y = "log10(Reads Per Million) (RPM)") +
  facet_wrap(
    ~sample,
    scales = "free_x",
    ncol = 5,
    labeller = labeller(sample = sample_labels)
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    legend.position = "none",
    text = element_text(size = 16),
    axis.title = element_text(size = 18),
    strip.text = element_text(size = 16, face = "bold"),
    panel.spacing = unit(1.5, "lines")
  )

png(
  "manhattan_plot_top_miRNAs_based_on_mean_RPM.png",
  width = 1200,
  height = 1200
)
# Explicit print(): ggplot objects are only auto-printed at the interactive
# top level, so without it the PNG is empty when this script is source()d.
print(manhattan_plot)
dev.off()

# 12. Print confirmation message ----
cat("Successfully generated: manhattan_plot_top_miRNAs_based_on_mean_RPM.png\n")