Analysis of the RNA binding protein motifs for RNA Seq
TODOs_NEXTWEEK: how to use RBPmap results to enrich the significant RBPs for RNA-seq (3′ UTRs?) and miRNAs?
🧬 1. RBPmap (Web + Command-line)
Type: Web tool and downloadable Perl scripts
Purpose: Scans RNA sequences for known RBP motifs
Database: Uses curated RBP motif sets (e.g., from literature, CLIP data)
URL: https://rbpmap.technion.ac.il/
✅ Good for: Biologists without strong R background
🚫 Not a package, but scriptable via CLI or HTML results
filtering out RBP from the database
improve the output of the 3utr_down_cleaned.fasta
multiple testing
perform this for miRNA.
http://xgenes.com/article/article-content/146/peak-and-motif-analyses-in-promoters/
http://xgenes.com/article/article-content/154/density-of-motif-plots-and-its-statistical-tests/
There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:
| Tool / Package | Enrichment | Custom Motifs | CLI or R? | RNA-specific? |
| ———————— | —————– | ————— | ——— | ————– |
| PWMEnrich | ✅ | ✅ | R | ✅ |
| RBPmap | ✅ | ❌ (uses own db) | Web/CLI | ✅ | —-> try RBPmap_results + enrichments!
| Biostrings/TFBSTools | ❌ (only scanning) | ✅ | R | ❌ | #ATtRACT + Biostrings / TFBSTools
| rmap | ✅ (CLIP-based) | ❌ | R | ✅ |
| Homer | ✅ | ✅ | CLI | ⚠ RNA optional |
| MEME (AME, FIMO) | ✅ | ✅ | Web/CLI | ⚠ Generic |
-
Using R to get 3UTR_sequences.fasta, 5UTR
setwd("~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis") # === STEP 0: Load libraries === if (!requireNamespace("biomaRt", quietly = TRUE)) install.packages("biomaRt") library(biomaRt) # === STEP 1: Load DEG files === ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt #20086 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt #634 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt #23832 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt #375 up_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt" down_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt" up_degs <- read.table(up_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE) down_degs <- read.table(down_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE) # === STEP 2: Clean & extract Ensembl gene IDs === up_ids <- unique(na.omit(up_degs[[1]])) down_ids <- unique(na.omit(down_degs[[1]])) # Optional: protein-coding filter # up_ids <- unique(up_degs$external_gene_name[up_degs$gene_biotype == "protein_coding"]) # down_ids <- unique(down_degs$external_gene_name[down_degs$gene_biotype == "protein_coding"]) # === STEP 3: Connect to Ensembl Asia mirror === ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://asia.ensembl.org") # === STEP 4: Robust, minimal-loss sequence fetch === get_sequences <- function(ids, seqType, batch_size = 300, retry_depth = 3) { all_seqs <- list() failed_ids <- c() safe_batch_query <- function(batch_ids, depth = 1) { if (length(batch_ids) == 0) return(NULL) tryCatch({ seqs <- getSequence(id = batch_ids, type = "ensembl_gene_id", seqType = seqType, mart = ensembl) seqs <- seqs[seqs[[seqType]] != "", ] return(seqs) }, error = function(e) { if (depth < retry_depth && length(batch_ids) > 1) { # Split batch and retry each half split_batches <- split(batch_ids, ceiling(seq_along(batch_ids)/ceiling(length(batch_ids)/2))) result <- lapply(split_batches, safe_batch_query, depth = depth + 1) return(do.call(rbind, result)) } else { message("❌ Final failure for ", length(batch_ids), " genes. Logging IDs.") failed_ids <<- c(failed_ids, batch_ids) return(NULL) } }) } id_batches <- split(ids, ceiling(seq_along(ids) / batch_size)) for (i in seq_along(id_batches)) { batch <- id_batches[[i]] cat(sprintf(" 🔹 Batch %d/%d (%d genes)\n", i, length(id_batches), length(batch))) result <- safe_batch_query(batch) if (!is.null(result) && nrow(result) > 0) { all_seqs[[length(all_seqs) + 1]] <- result } } if (length(failed_ids) > 0) { writeLines(failed_ids, paste0("failed_ids_", seqType, ".txt")) message("⚠️ Failed IDs saved to failed_ids_", seqType, ".txt") } if (length(all_seqs) == 0) return(data.frame()) return(do.call(rbind, all_seqs)) } # === STEP 5: Write FASTA === write_fasta <- function(sequences, names, file) { con <- file(file, "w") for (i in seq_along(sequences)) { cat(">", names[i], "\n", sequences[i], "\n", file = con, sep = "") } close(con) } # === STEP 6: Background sampling === cat("📦 Fetching background gene list...\n") all_deg_ids <- unique(c(up_ids, down_ids)) all_genes <- getBM(attributes = c("ensembl_gene_id"), mart = ensembl) bg_ids <- setdiff(all_genes$ensembl_gene_id, all_deg_ids) bg_sample <- sample(bg_ids, 1000) # === STEP 7: Fetch sequences by group and type === seq_types <- c("3utr", "5utr", "cds", "transcript") groups <- list( up = up_ids, down = down_ids, background = bg_sample ) for (seq_type in seq_types) { cat(sprintf("\n⏳ Fetching %s sequences...\n", seq_type)) for (group_name in names(groups)) { cat(sprintf(" 🔸 Group: %s\n", group_name)) ids <- groups[[group_name]] seqs <- get_sequences(ids, seq_type) if (nrow(seqs) > 0) { out_file <- paste0(seq_type, "_", group_name, ".fasta") write_fasta(seqs[[seq_type]], seqs$ensembl_gene_id, out_file) cat(" ✅ Written to", out_file, "\n") } else { message("⚠️ No sequences found for ", group_name, " (", seq_type, ")") } } } cat("\n✅ All FASTA files created for: 3′UTR, 5′UTR, CDS, and Transcript.\n")
-
Using https://rbpmap.technion.ac.il/1747409685/results.html, how to predict p-value for a specific RBP?
#Try using RBPmap ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/run2/3utr_down.fasta
-
Significant RBP enrichments
install.packages("BiocManager") BiocManager::install("BSgenome") BiocManager::install("Biostrings") BiocManager::install("motifmatchr") #BiocManager::install("MotifDb") #BiocManager::install("motifStack") BiocManager::install("TFBSTools") #install.packages("pheatmap") library(Matrix) library(pheatmap) library(Biostrings) library(motifmatchr) library(GenomicRanges) #library(MotifDb) #library(motifStack) library(TFBSTools) library(SummarizedExperiment) library(ggplot2) library(pheatmap) library(PWMEnrich) library(PWMEnrich.Hsapiens.background) # ------------- Methods ----------- clean_fasta_file <- function(infile, outfile) { lines <- readLines(infile) # Initialize variables to store cleaned lines cleaned_lines <- list() add_line <- FALSE # To track when we need to add a sequence for (line in lines) { # If the line is a header and contains "Sequence unavailable", skip it if (startsWith(line, ">") && grepl("Sequence unavailable", line)) { add_line <- FALSE # Don't add this header or its sequence } else if (startsWith(line, ">")) { # If it's a new header, start adding the following sequence add_line <- TRUE cleaned_lines <- c(cleaned_lines, line) } else if (add_line) { # Add the sequence line if it follows a valid header cleaned_lines <- c(cleaned_lines, line) } } # Write cleaned lines to the output file writeLines(cleaned_lines, outfile) } # Function to clean sequences by replacing invalid characters with 'N' clean_invalid_sequences <- function(dna_string_set) { valid_bases <- c("A", "T", "C", "G", "N") # Define valid DNA bases clean_sequences <- lapply(dna_string_set, function(seq) { # Remove invalid characters, replace them with 'N' seq <- gsub("[^ATCGN]", "N", as.character(seq)) # Replace non-ATCGN characters with 'N' DNAString(seq) # Return as a valid DNAString object }) return(DNAStringSet(clean_sequences)) # Return cleaned sequences as a DNAStringSet } # ============================================ # 2. Read and clean 3′ UTR sequences # ============================================ # Clean your FASTA file clean_fasta_file("run2/3utr_up.fasta", "run2/3utr_up_cleaned.fasta") clean_fasta_file("run2/3utr_down.fasta", "run2/3utr_down_cleaned.fasta") clean_fasta_file("run2/3utr_background.fasta", "run2/3utr_background_cleaned.fasta") clean_fasta_file("run2/5utr_up.fasta", "run2/5utr_up_cleaned.fasta") clean_fasta_file("run2/5utr_down.fasta", "run2/5utr_down_cleaned.fasta") clean_fasta_file("run2/5utr_background.fasta", "run2/5utr_background_cleaned.fasta") # Read in the cleaned FASTA files utr_3_up_raw <- readDNAStringSet("run2/3utr_up_cleaned.fasta") utr_3_down_raw <- readDNAStringSet("run2/3utr_down_cleaned.fasta") utr_3_background_raw <- readDNAStringSet("run2/3utr_background_cleaned.fasta") utr_5_up_raw <- readDNAStringSet("run2/5utr_up_cleaned.fasta") utr_5_down_raw <- readDNAStringSet("run2/5utr_down_cleaned.fasta") utr_5_background_raw <- readDNAStringSet("run2/5utr_background_cleaned.fasta") # Clean sequences utr_3_up <- clean_invalid_sequences(utr_3_up_raw) utr_3_down <- clean_invalid_sequences(utr_3_down_raw) utr_3_background <- clean_invalid_sequences(utr_3_background_raw) utr_5_up <- clean_invalid_sequences(utr_5_up_raw) utr_5_down <- clean_invalid_sequences(utr_5_down_raw) utr_5_background <- clean_invalid_sequences(utr_5_background_raw) # Filter out sequences shorter than the shortest PWM (e.g. 6 bp) # and any sequences containing ambiguous bases (non‑ACGT) min_pwm_length <- 6 utr_3_down_clean <- utr_3_down[ width(utr_3_down) >= min_pwm_length & !grepl("[^ACGT]", as.character(utr_3_down)) ] #cat("Kept", length(utr_down_clean), "sequences after cleaning.\n") utr_3_background_clean <- utr_3_background[ width(utr_3_background) >= min_pwm_length & !grepl("[^ACGT]", as.character(utr_3_background)) ] # Install packages (if not already) if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager") BiocManager::install(c("PWMEnrich", "Biostrings", "PWMEnrich.Hsapiens.background", "MotifDb")) BiocManager::install("PWMEnrich.Hsapiens.background") # Load libraries library(PWMEnrich) library(Biostrings) library(PWMEnrich.Hsapiens.background) library(MotifDb) # ============================================ # 3. Load the hg19 GC‑aware background model # ============================================ data("PWMLogn.hg19.MotifDb.Hsap", package = "PWMEnrich.Hsapiens.background") bg_model <- PWMLogn.hg19.MotifDb.Hsap # ============================================ # 4. Define your list of RBP gene symbols # ============================================ # Specific RBP gene symbols rbp_genes <- c("A1CF", "A2BP1", "AC004381.6", "AC008073.5", "ACO1", "AKAP1", "AKAP17A", "AL662825.2", "AL662825.3", "AL662849.4", "AL844853.7", "ALKBH8", "ANKHD1", "ANKRD17", "APLF", "ASCC1", "BICC1", "BOLL", "BRAP", "BRUNOL4", "BRUNOL5", "BRUNOL6", "BX000357.2", "BX005143.2", "BX119957.7", "BX248507.1", "BX248518.2", "BX511012.1", "BX908728.4", "BX927220.1", "C14orf156", "C14orf21", "CARHSP1", "CELF1", "CELF2", "CELF3", "CIRBP", "CNBP", "CNOT4", "CPEB1", "CPEB2", "CPEB3", "CPEB4", "CPSF4", "CPSF4L", "CPSF6", "CPSF7", "CR388219.1", "CR388372.1", "CR753328.1", "CR759778.5", "CR759782.5", "CR847863.1", "CSDA", "CSDC2", "CSDE1", "CSTF2", "CSTF2T", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZAP1", "DAZL", "DDX41", "DDX43", "DDX53", "DHX57", "DHX8", "DNAJC17", "DPPA5", "DUS3L", "EIF2S1", "EIF3B", "EIF3G", "EIF4B", "EIF4H", "ELAVL1", "ELAVL2", "ELAVL3", "ELAVL4", "ENOX1", "ENOX2", "ENSG00000180771", "ENSG00000183403", "ENSG00000185728", "ENSG00000213250", "ENSG00000215042", "ENSG00000215492", "ENSG00000231942", "ENSG00000248163", "ENSG00000249536", "ENSG00000249644", "ENSG00000250177", "ERAL1", "ESRP1", "ESRP2", "EWSR1", "FMR1", "FUBP1", "FUBP3", "FUS", "Fusip1", "FXR1", "FXR2", "G3BP1", "G3BP2", "GRSF1", "HDLBP", "HELZ", "HNRNPA0", "HNRNPA1", "HNRNPA1L2", "HNRNPA2B1", "HNRNPA3", "HNRNPAB", "HNRNPC", "HNRNPCL1", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPH2", "HNRNPH3", "hnRNPK", "HNRNPL", "hnRNPLL", "HNRNPM", "HNRNPR", "HNRPDL", "HTATSF1", "IGF2BP1", "IGF2BP2", "IGF2BP3", "KHDRBS1", "KHDRBS2", "KHDRBS3", "KHSRP", "KIAA0430", "KRR1", "LARP1", "LARP1B", "LARP4", "LARP4B", "LARP6", "LARP7", "LENG9", "LIN28A", "LIN28B", "MATR3", "MBNL1", "MBNL2", "MBNL3", "MDM2", "MEX3B", "MEX3C", "MEX3D", "MKI67IP", "MKRN1", "MKRN2", "MKRN3", "MRPS28", "MSI1", "MSI2", "MTHFSD", "MYEF2", "NCBP2", "NCBP2L", "NCL", "NEIL3", "NOL8", "NONO", "NOVA1", "NOVA2", "NPLOC4", "NUP153", "NUPL2", "PABPC1", "PABPC1L", "PABPC1L2A", "PABPC1L2B", "PABPC3", "PABPC4", "PABPC5", "PABPN1", "PABPN1L", "PAN3", "PARP12", "PCBP1", "PCBP2", "PCBP3", "PCBP4", "PDCD11", "PEG10", "PNMA3", "PNO1", "PNPT1", "POLDIP3", "POLR2G", "PPARGC1A", "PPARGC1B", "PPIE", "PPIL4", "PPP1R10", "PPRC1", "PRR3", "PSPC1", "PTBP1", "PTBP2", "PUF60", "PUM1", "PUM2", "QKI", "RALY", "RANBP2", "RAVER1", "RAVER2", "RBBP6", "RBCK1", "RBFOX2", "RBFOX3", "RBM10", "RBM11", "RBM12", "RBM12B", "RBM14-RBM4", "RBM15", "RBM15B", "RBM17", "RBM18", "RBM19", "RBM22", "RBM23", "RBM24", "RBM25", "RBM26", "RBM27", "RBM28", "RBM3", "RBM33", "RBM34", "RBM38", "RBM39", "RBM4", "RBM41", "RBM42", "RBM44", "RBM45", "RBM46", "RBM47", "RBM4B", "RBM5", "RBM6", "RBM7", "RBM8A", "RBMS1", "RBMS2", "RBMS3", "RBMX", "RBMX2", "RBMXL1", "RBMXL2", "RBMXL3", "RBMY1A1", "RBMY1B", "RBMY1D", "RBMY1E", "RBMY1F", "RBMY1J", "RBPMS", "RBPMS2", "RBP_Name", "RC3H1", "RC3H2", "RCAN2", "RDBP", "RDM1", "RNF113A", "RNF113B", "RNF31", "RNPC3", "RNPS1", "ROD1", "RP11-1286E23.4", "RRP7A", "RYBP", "SAFB", "SAFB2", "SAMD4A", "SAMD4B", "SART3", "SCAF4", "SCAF8", "SETD1A", "SETD1B", "SF1", "SF3B4", "SFPQ", "SHARPIN", "SLTM", "SNRNP35", "SNRNP70", "SNRPA", "SNRPB2", "SOLH", "SPEN", "SRBD1", "SREK1", "SRRT", "SRSF1", "SRSF11", "SRSF12", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "SRSF9", "SSB", "STAR-PAP", "SUPT6H", "SYNCRIP", "TAB2", "TAB3", "TAF15", "TARDBP", "TDRD10", "TDRKH", "TEX13A", "THOC4", "TIA1", "TIAL1", "TMEM63A", "TMEM63B", "TNRC6A", "TNRC6B", "TNRC6C", "TOE1", "TRA2A", "TRA2B", "TRMT1", "TRMT2A", "TRNAU1AP", "TTC14", "U2AF1", "U2AF1L4", "U2AF2", "U2SURP", "UHMK1", "UNK", "UNKL", "UPF3B", "YAF2", "YB-1", "YBX2", "YTHDC1", "YTHDC2", "YTHDF1", "YTHDF2", "ZC3H10", "ZC3H13", "ZC3H15", "ZC3H18", "ZC3H3", "ZC3H4", "ZC3H6", "ZC3H7A", "ZC3H7B", "ZC3H8", "ZCCHC11", "ZCCHC13", "ZCCHC14", "ZCCHC17", "ZCCHC2", "ZCCHC3", "ZCCHC5", "ZCCHC6", "ZCCHC7", "ZCCHC8", "ZCCHC9", "ZCRB1", "ZFP36", "ZFP36L1", "ZFP36L2", "ZGPAT", "ZMAT5", "ZNF638", "ZRANB1", "ZRANB2", "ZRANB3", "ZRSR1", "ZRSR2") # ============================================ # 5. Extract all PWMs from background & filter to RBPs # ============================================ #all_pwms <- bg_model@pwms #2287 #rbp_pwms <- all_pwms[ # vapply(names(all_pwms), # function(nm) any(nm %in% rbp_genes), # logical(1)) #] # 如果你想做模糊匹配(基因名作为子串出现),可以用: rbp_pwms <- all_pwms[grepl(paste(rbp_genes, collapse="|"), names(all_pwms), ignore.case=TRUE)] cat("Number of RBP PWMs selected:", length(rbp_pwms), "\n") cat("Example PWM names:\n", head(names(rbp_pwms), 10), "\n") # ============================================ # 6. Create a custom background model with only RBP PWMs # ============================================ custom_bg <- bg_model custom_bg@pwms <- rbp_pwms # ============================================ # 7. Run motif enrichment # ============================================ enrichment_result <- motifEnrichment(utr_3_down_clean, custom_bg) # ============================================ # 8. Generate report & convert to data.frame # ============================================ report_list <- groupReport(enrichment_result) report_df <- as.data.frame(report_list) # ============================================ # 9. Apply BH correction and filter significant hits # ============================================ report_df$adj_pval <- p.adjust(report_df$p.value, method = "BH") significant_hits <- subset(report_df, adj_pval < 0.05) # ============================================ # 10. View results # ============================================ # Display motif name, raw p-value, adjusted p-value, and enrichment score colnames(significant_hits)[ match(c("target","id","raw.score"), colnames(significant_hits)) ] <- c("Gene", "Motif_ID", "EnrichmentScore") print( head( significant_hits[, c("Gene","Motif_ID","EnrichmentScore","p.value","adj_pval")], 6 ) ) p <- ggplot(significant_hits, aes(x = reorder(Motif_ID, adj_pval), y = -log10(adj_pval))) + geom_col() + coord_flip() + labs( x = "Motif ID", y = expression(-log[10]("FDR‑adjusted p‑value")), title = "Significant RBP Motif Enrichment" ) + theme_minimal() ggsave( filename = "RBP_motif_enrichment.png", # or any path you like plot = p, width = 8, # inches height = 6, # inches dpi = 300 # resolution )
-
Perform similar RBP-mapping by myself
# Load a set of RBP motifs from a database (e.g., RBPDB) RBPDB, ENCODE, JASPAR, or even UCL's RBP motifs. # Set path to your PWM directory pfm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PFMDir" # List all .pfm files pfm_files <- list.files(pfm_dir, pattern = "\\.pfm$", full.names = TRUE) # Create an empty list to store PWMs pfm_list <- list() # Loop through each file for (file in pfm_files) { # Read table pfm <- tryCatch({ mat <- read.table(file, header = FALSE) mat <- as.matrix(mat) # Optionally add column names (assuming 4 columns for A, C, G, T) if (ncol(mat) == 4) { colnames(mat) <- c("A", "C", "G", "T") } mat }, error = function(e) { message("Failed to read: ", file) NULL }) # Use the filename (without extension) as list name name <- tools::file_path_sans_ext(basename(file)) pfm_list[[name]] <- pfm } # Check one example str(pfm_list[[1]]) #saveRDS(pfm_list, file = "compiled_pfm_list.rds") #The error you're seeing (invalid PFM 'x': not an integer matrix) indicates that the PWM function expects the input matrix to represent raw position frequency matrices (PFMs), which are expected to be integer counts, but the matrices you have are likely log-odds matrices (floating point values), which represent the log-transformed scores instead of raw counts. # convert to PFMatrix pfm_named_list <- lapply(pfm_list, function(mat) { mat <- as.matrix(mat) if (!all(rownames(mat) %in% c("A", "C", "G", "T"))) { rownames(mat) <- c("A", "C", "G", "T") } PFMatrix(ID = "motif", name = "motif", profileMatrix = mat) }) # Create an empty list to store results match_results_list <- list() # Iterate through each PFMatrix object in the list for (i in seq_along(pfm_named_list)) { # Get the current PFMatrix object current_pfm <- pfm_named_list[[i]] # Run matchMotifs for the current motif match_result <- tryCatch({ #matchMotifs(pfm_obj, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3) matchMotifs(current_pfm, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3) }, error = function(e) { message("Error with motif ", names(pfm_named_list)[i], ": ", e$message) return(NULL) # Return NULL if error occurs }) # Store the result if no error if (!is.null(match_result)) { match_results_list[[names(pfm_named_list)[i]]] <- match_result } } # Check the results match_results_list # Extract motif match results for one motif motif_matches_1004_8676391 <- assay(match_results_list[["1004_8676391"]], "motifMatches") # Convert sparse matrix to dense matrix dense_matrix <- as.matrix(motif_matches_1004_8676391) # Optional: Add rownames for better visualization (if available) if (!is.null(names(utr_3_down))) { rownames(dense_matrix) <- names(utr_3_down) } # Convert logical matrix to numeric (1 for TRUE, 0 for FALSE) dense_matrix_num <- matrix(as.numeric(dense_matrix), nrow = nrow(dense_matrix), dimnames = dimnames(dense_matrix)) # Now plot the heatmap png("motif_match_1004_8676391.png", width = 800, height = 1600) pheatmap(dense_matrix_num, cluster_rows = FALSE, cluster_cols = FALSE, color = c("grey", "steelblue"), legend_breaks = c(0, 1), legend_labels = c("No match", "Match"), main = "Motif Match: 1004_8676391") dev.off() #✅ Output #You will see a binary heatmap, where: # * Blue cells = motif matched # * White cells = no match #Let me know if you want to loop this over all motifs or save the plot to a file.
-
Does hg19 vs hg38 Position Matter in Motif Enrichment?
✅ If you're doing motif enrichment analysis only (like PWMEnrich): Position DOES NOT matter. Here's why: Aspect Explanation 🔍 PWMEnrich Only scans sequences for motif matches based on nucleotide content (e.g., enrichment of AU-rich motifs in downregulated 3′UTRs). 📏 Coordinates Genomic coordinates (hg19 vs hg38) are not used in enrichment. Only the sequence itself matters. ⚖️ Statistical Test The enrichment is based on observed motif frequency vs background expectations (GC-aware or custom). No genome alignment is done. So: ✅ As long as your 3′ UTR sequences are correct and biologically relevant, using an hg19 background is safe — even if your gene models are from hg38. ❗️ When does genome version matter? If you're doing things like: Use case Position-sensitive? 🔬 Variant effect on motifs (e.g., SNPs in motifs) ✅ Yes 🧭 Mapping motifs to genome positions (e.g., ChIP-seq overlaps) ✅ Yes 📦 Loading sequences from coordinates (not FASTA) ✅ Yes 🧠 Expression + genomic track integration (e.g., motif + ATAC-seq) ✅ Yes Then you must ensure genome versions match exactly. ✅ In Your Case: Motif Enrichment of Downregulated 3′UTRs If you are: Feeding real 3′UTR sequences (as DNAStringSet) for downregulated genes Using PWMEnrich purely for motif enrichment Then: ✅ hg19 vs hg38 does not matter ❗ Only sequence content matters Let me know if you want to later map enriched motifs back to genome positions — that’s when we’d switch to position-aware tools like FIMO, MOODS, or rtracklayer.
-
Custom RBP-database preparation (failed!)
# Example: Create a PWM for 2 fake motifs (replace with your actual PWMs) #motif1 <- matrix(c(0.2, 0.3, 0.3, 0.2, # 0.1, 0.4, 0.4, 0.1, # 0.25, 0.25, 0.25, 0.25, # 0.3, 0.2, 0.3, 0.2), nrow = 4, byrow = TRUE, # dimnames = list(c("A", "C", "G", "T"), NULL)) #motif2 <- matrix(c(0.3, 0.2, 0.2, 0.3, # 0.4, 0.1, 0.4, 0.1, # 0.25, 0.25, 0.25, 0.25, # 0.2, 0.3, 0.2, 0.3), nrow = 4, byrow = TRUE, # dimnames = list(c("A", "C", "G", "T"), NULL)) #rbp_pwms <- list(RBP1 = motif1, RBP2 = motif2) pwm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PWMDir" pwm_files <- list.files(pwm_dir, pattern = "\\.pwm$", full.names = TRUE) rbp_pwms <- list() for (file in pwm_files) { # Read table pwm <- tryCatch({ mat <- read.table(file, header = FALSE) mat <- as.matrix(mat) # Optionally add column names (assuming 4 columns for A, C, G, T) if (ncol(mat) == 4) { colnames(mat) <- c("A", "C", "G", "T") } mat }, error = function(e) { message("Failed to read: ", file) NULL }) # Use the filename (without extension) as list name name <- tools::file_path_sans_ext(basename(file)) rbp_pwms[[name]] <- pwm } # -- filtering (ERROR) -- pwm_list <- motifs(bg_model) # Example: filter motifs containing 'ELAVL' or 'HNRNP' #rbp_pwms <- pwm_list[grep("ELAVL|HNRNP|IGF2BP|RBFOX|FMR1|PUM", names(pwm_list))] rbp_enrichment <- motifEnrichment(utr_3_down, bg_model, pwmList = rbp_pwms) rbp_report <- groupReport(rbp_enrichment) # -- from MotifDb -- library(MotifDb) human_motifs <- query(MotifDb, "Hsapiens") rbp_motifs <- query(human_motifs, "RBP") # ---- try Build your background model directly from these PWMs ---- # 1. List all PWM files files_all <- list.files("~/Downloads/cisbp-rna/pwms_all_motifs", pattern="\\.txt$", full.names=TRUE) # 2. Filter out empty files file_sizes <- file.info(files_all)$size files <- files_all[file_sizes > 0] cat("Reading", length(files), "non‑empty PWM files out of", length(files_all), "\n") # 3. Define the reader (as before) readCisbpPfm <- function(path) { df <- read.table(path, header=TRUE, stringsAsFactors=FALSE) mat_rna <- as.matrix(df[, c("A","C","G","U")]) mat_dna <- t(mat_rna) rownames(mat_dna) <- c("A","C","G","T") sweep(mat_dna, 2, colSums(mat_dna), FUN="/") } # 4. Read only the non‐empty files cisbp_mats <- lapply(files, readCisbpPfm) names(cisbp_mats) <- tools::file_path_sans_ext(basename(files)) # 5. (Optional) Check the first few head(names(cisbp_mats)) length(cisbp_mats) #193 # 6. Build your background model directly from these PWMs: bg_utrs <- utr_3_background_clean #readDNAStringSet("run2/3utr_background.fasta") #bg_utrs <- bg_utrs[ width(bg_utrs)>=6 & !grepl("[^ACGT]", as.character(bg_utrs)) ] custom_bg <- makeBackground( seqs = bg_utrs, motifs = cisbp_mats, type = "logn", organism = "hg19", verbose = TRUE ) custom_bg <- makeBackground( sequences = bg_utrs, # your DNAStringSet of background UTRs motifs = cisbp_mats, # your named list of PWM matrices type = "logn", # log‑normal GC‑aware model organism = NULL, # disable built‑in genomes verbose = TRUE ) custom_bg <- makeBackground( cisbp_mats, # 1) motifs organism = "hg19", type = "logn", # 2) background model type bg.seq = bg_utrs, # 3) override organism by supplying your own sequences quick = FALSE # (optional) fit to all sequences for best accuracy ) # 1. Pick a library size to re‑scale your PWMs into PFMs library_size <- 1000 # you can also try 1000 # 2. Convert each normalized matrix into integer counts cisbp_pfms <- lapply(cisbp_mats, function(pmat) { counts <- round(pmat * library_size) # ensure no column sums to zero (add pseudocount 1 if needed) zero_cols <- which(colSums(counts) == 0) if (length(zero_cols) > 0) { counts[, zero_cols] <- 1 } counts }) # 3. Build a custom background (PFMs + your bg UTRs) custom_bg <- makeBackground( cisbp_pfms, # 1) a list of integer PFMs type = "logn", # 2) log‑normal GC‑aware model bg.seq = bg_utrs,# 3) your background DNAStringSet quick = FALSE # fit on all sequences ) # … then continue with motifEnrichment() on your down‑regulated UTRs as before. # 5) Read & clean your down‑regulated 3′UTRs down_utrs <- readDNAStringSet("downregulated_utrs.fasta") down_utrs <- down_utrs[ width(down_utrs)>=6 & !grepl("[^ACGT]", as.character(down_utrs)) ] # 6) Run the enrichment res <- motifEnrichment(down_utrs, custom_bg) report <- groupReport(res) df <- as.data.frame(report) df$adj_pval <- p.adjust(df$p.value, method="BH") # 7) Subset significant sig <- subset(df, adj_pval < 0.05) print(sig[, c("motif.name","p.value","adj_pval","enrichmentScore")])