Analysis of the RNA binding protein motifs for RNA Seq

TODOs_NEXTWEEK: how to use RBPmap results to enrich the significant RBPs for RNA-seq (3′ UTRs?) and miRNAs?

    🧬 1. RBPmap (Web + Command-line)

        Type: Web tool and downloadable Perl scripts

        Purpose: Scans RNA sequences for known RBP motifs

        Database: Uses curated RBP motif sets (e.g., from literature, CLIP data)

        URL: https://rbpmap.technion.ac.il/

    ✅ Good for: Biologists without strong R background
    🚫 Not a package, but scriptable via CLI or HTML results

filtering out RBP from the database

improve the output of the 3utr_down_cleaned.fasta

multiple testing

perform this for miRNA.

http://xgenes.com/article/article-content/146/peak-and-motif-analyses-in-promoters/

http://xgenes.com/article/article-content/154/density-of-motif-plots-and-its-statistical-tests/

There are several alternative R packages and tools to perform motif enrichment analysis for RNA-binding proteins (RBPs), beyond PWMEnrich::motifEnrichment(). Here are the most notable ones:

| ———————— | —————– | ————— | ——— | ————– |

| PWMEnrich | ✅ | ✅ | R | ✅ |

| Biostrings/TFBSTools | ❌ (only scanning) | ✅ | R | ❌ | #ATtRACT + Biostrings / TFBSTools

| rmap | ✅ (CLIP-based) | ❌ | R | ✅ |

| Homer | ✅ | ✅ | CLI | ⚠ RNA optional |

Using R to get 3UTR_sequences.fasta, 5UTR

 setwd("~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis")

 # === STEP 0: Load libraries ===
 if (!requireNamespace("biomaRt", quietly = TRUE)) install.packages("biomaRt")
 library(biomaRt)

 # === STEP 1: Load DEG files ===
 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt    #20086
 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt  #634
 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt     #23832
 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt   #375
 up_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt"
 down_file <- "~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt"

 up_degs <- read.table(up_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)
 down_degs <- read.table(down_file, header = TRUE, sep = "\t", stringsAsFactors = FALSE)

 # === STEP 2: Clean & extract Ensembl gene IDs ===
 up_ids <- unique(na.omit(up_degs[[1]]))
 down_ids <- unique(na.omit(down_degs[[1]]))

 # Optional: protein-coding filter
 # up_ids <- unique(up_degs$external_gene_name[up_degs$gene_biotype == "protein_coding"])
 # down_ids <- unique(down_degs$external_gene_name[down_degs$gene_biotype == "protein_coding"])

 # === STEP 3: Connect to Ensembl Asia mirror ===
 ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "https://asia.ensembl.org")

 # === STEP 4: Robust, minimal-loss sequence fetch ===
 get_sequences <- function(ids, seqType, batch_size = 300, retry_depth = 3) {
 all_seqs <- list()
 failed_ids <- c()

 safe_batch_query <- function(batch_ids, depth = 1) {
     if (length(batch_ids) == 0) return(NULL)

     tryCatch({
     seqs <- getSequence(id = batch_ids,
                         type = "ensembl_gene_id",
                         seqType = seqType,
                         mart = ensembl)
     seqs <- seqs[seqs[[seqType]] != "", ]
     return(seqs)
     }, error = function(e) {
     if (depth < retry_depth && length(batch_ids) > 1) {
         # Split batch and retry each half
         split_batches <- split(batch_ids, ceiling(seq_along(batch_ids)/ceiling(length(batch_ids)/2)))
         result <- lapply(split_batches, safe_batch_query, depth = depth + 1)
         return(do.call(rbind, result))
     } else {
         message("❌ Final failure for ", length(batch_ids), " genes. Logging IDs.")
         failed_ids <<- c(failed_ids, batch_ids)
         return(NULL)
     }
     })
 }

 id_batches <- split(ids, ceiling(seq_along(ids) / batch_size))
 for (i in seq_along(id_batches)) {
     batch <- id_batches[[i]]
     cat(sprintf("    🔹 Batch %d/%d (%d genes)\n", i, length(id_batches), length(batch)))
     result <- safe_batch_query(batch)
     if (!is.null(result) && nrow(result) > 0) {
     all_seqs[[length(all_seqs) + 1]] <- result
     }
 }

 if (length(failed_ids) > 0) {
     writeLines(failed_ids, paste0("failed_ids_", seqType, ".txt"))
     message("⚠️ Failed IDs saved to failed_ids_", seqType, ".txt")
 }

 if (length(all_seqs) == 0) return(data.frame())
 return(do.call(rbind, all_seqs))
 }

 # === STEP 5: Write FASTA ===
 write_fasta <- function(sequences, names, file) {
 con <- file(file, "w")
 for (i in seq_along(sequences)) {
     cat(">", names[i], "\n", sequences[i], "\n", file = con, sep = "")
 }
 close(con)
 }

 # === STEP 6: Background sampling ===
 cat("📦 Fetching background gene list...\n")
 all_deg_ids <- unique(c(up_ids, down_ids))
 all_genes <- getBM(attributes = c("ensembl_gene_id"), mart = ensembl)
 bg_ids <- setdiff(all_genes$ensembl_gene_id, all_deg_ids)
 bg_sample <- sample(bg_ids, 1000)

 # === STEP 7: Fetch sequences by group and type ===
 seq_types <- c("3utr", "5utr", "cds", "transcript")
 groups <- list(
 up = up_ids,
 down = down_ids,
 background = bg_sample
 )

 for (seq_type in seq_types) {
 cat(sprintf("\n⏳ Fetching %s sequences...\n", seq_type))
 for (group_name in names(groups)) {
     cat(sprintf("  🔸 Group: %s\n", group_name))
     ids <- groups[[group_name]]
     seqs <- get_sequences(ids, seq_type)
     if (nrow(seqs) > 0) {
     out_file <- paste0(seq_type, "_", group_name, ".fasta")
     write_fasta(seqs[[seq_type]], seqs$ensembl_gene_id, out_file)
     cat("    ✅ Written to", out_file, "\n")
     } else {
     message("⚠️ No sequences found for ", group_name, " (", seq_type, ")")
     }
 }
 }

 cat("\n✅ All FASTA files created for: 3′UTR, 5′UTR, CDS, and Transcript.\n")

Using https://rbpmap.technion.ac.il/1747409685/results.html, how to predict p-value for a specific RBP?
```
 #Try using RBPmap
 ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/run2/3utr_down.fasta
```

Significant RBP enrichments

 install.packages("BiocManager")
 BiocManager::install("BSgenome")
 BiocManager::install("Biostrings")
 BiocManager::install("motifmatchr")
 #BiocManager::install("MotifDb")
 #BiocManager::install("motifStack")
 BiocManager::install("TFBSTools")
 #install.packages("pheatmap")

 library(Matrix)
 library(pheatmap)
 library(Biostrings)
 library(motifmatchr)
 library(GenomicRanges)
 #library(MotifDb)
 #library(motifStack)
 library(TFBSTools)
 library(SummarizedExperiment)
 library(ggplot2)
 library(pheatmap)

 library(PWMEnrich)
 library(PWMEnrich.Hsapiens.background)

 # ------------- Methods -----------

 clean_fasta_file <- function(infile, outfile) {
 lines <- readLines(infile)

 # Initialize variables to store cleaned lines
 cleaned_lines <- list()
 add_line <- FALSE  # To track when we need to add a sequence

 for (line in lines) {
     # If the line is a header and contains "Sequence unavailable", skip it
     if (startsWith(line, ">") && grepl("Sequence unavailable", line)) {
     add_line <- FALSE  # Don't add this header or its sequence
     } else if (startsWith(line, ">")) {
     # If it's a new header, start adding the following sequence
     add_line <- TRUE
     cleaned_lines <- c(cleaned_lines, line)
     } else if (add_line) {
     # Add the sequence line if it follows a valid header
     cleaned_lines <- c(cleaned_lines, line)
     }
 }

 # Write cleaned lines to the output file
 writeLines(cleaned_lines, outfile)
 }

 # Function to clean sequences by replacing invalid characters with 'N'
 clean_invalid_sequences <- function(dna_string_set) {
 valid_bases <- c("A", "T", "C", "G", "N")  # Define valid DNA bases
 clean_sequences <- lapply(dna_string_set, function(seq) {
     # Remove invalid characters, replace them with 'N'
     seq <- gsub("[^ATCGN]", "N", as.character(seq))  # Replace non-ATCGN characters with 'N'
     DNAString(seq)  # Return as a valid DNAString object
 })
 return(DNAStringSet(clean_sequences))  # Return cleaned sequences as a DNAStringSet
 }

 # ============================================
 # 2. Read and clean 3′ UTR sequences
 # ============================================
 # Clean your FASTA file
 clean_fasta_file("run2/3utr_up.fasta", "run2/3utr_up_cleaned.fasta")
 clean_fasta_file("run2/3utr_down.fasta", "run2/3utr_down_cleaned.fasta")
 clean_fasta_file("run2/3utr_background.fasta", "run2/3utr_background_cleaned.fasta")
 clean_fasta_file("run2/5utr_up.fasta", "run2/5utr_up_cleaned.fasta")
 clean_fasta_file("run2/5utr_down.fasta", "run2/5utr_down_cleaned.fasta")
 clean_fasta_file("run2/5utr_background.fasta", "run2/5utr_background_cleaned.fasta")

 # Read in the cleaned FASTA files
 utr_3_up_raw <- readDNAStringSet("run2/3utr_up_cleaned.fasta")
 utr_3_down_raw <- readDNAStringSet("run2/3utr_down_cleaned.fasta")
 utr_3_background_raw <- readDNAStringSet("run2/3utr_background_cleaned.fasta")
 utr_5_up_raw <- readDNAStringSet("run2/5utr_up_cleaned.fasta")
 utr_5_down_raw <- readDNAStringSet("run2/5utr_down_cleaned.fasta")
 utr_5_background_raw <- readDNAStringSet("run2/5utr_background_cleaned.fasta")

 # Clean sequences
 utr_3_up <- clean_invalid_sequences(utr_3_up_raw)
 utr_3_down <- clean_invalid_sequences(utr_3_down_raw)
 utr_3_background <- clean_invalid_sequences(utr_3_background_raw)
 utr_5_up <- clean_invalid_sequences(utr_5_up_raw)
 utr_5_down <- clean_invalid_sequences(utr_5_down_raw)
 utr_5_background <- clean_invalid_sequences(utr_5_background_raw)

 # Filter out sequences shorter than the shortest PWM (e.g. 6 bp)
 # and any sequences containing ambiguous bases (non‑ACGT)
 min_pwm_length <- 6
 utr_3_down_clean <- utr_3_down[
 width(utr_3_down) >= min_pwm_length &
 !grepl("[^ACGT]", as.character(utr_3_down))
 ]
 #cat("Kept", length(utr_down_clean), "sequences after cleaning.\n")
 utr_3_background_clean <- utr_3_background[
 width(utr_3_background) >= min_pwm_length &
 !grepl("[^ACGT]", as.character(utr_3_background))
 ]

 # Install packages (if not already)
 if (!requireNamespace("BiocManager", quietly = TRUE)) install.packages("BiocManager")
 BiocManager::install(c("PWMEnrich", "Biostrings", "PWMEnrich.Hsapiens.background", "MotifDb"))
 BiocManager::install("PWMEnrich.Hsapiens.background")

 # Load libraries
 library(PWMEnrich)
 library(Biostrings)
 library(PWMEnrich.Hsapiens.background)
 library(MotifDb)

 # ============================================
 # 3. Load the hg19 GC‑aware background model
 # ============================================
 data("PWMLogn.hg19.MotifDb.Hsap",
     package = "PWMEnrich.Hsapiens.background")
 bg_model <- PWMLogn.hg19.MotifDb.Hsap

 # ============================================
 # 4. Define your list of RBP gene symbols
 # ============================================

 # Specific RBP gene symbols
 rbp_genes <- c("A1CF", "A2BP1", "AC004381.6", "AC008073.5", "ACO1", "AKAP1", "AKAP17A", "AL662825.2", "AL662825.3", "AL662849.4", "AL844853.7", "ALKBH8", "ANKHD1", "ANKRD17", "APLF", "ASCC1", "BICC1", "BOLL", "BRAP", "BRUNOL4", "BRUNOL5", "BRUNOL6", "BX000357.2", "BX005143.2", "BX119957.7", "BX248507.1", "BX248518.2", "BX511012.1", "BX908728.4", "BX927220.1", "C14orf156", "C14orf21", "CARHSP1", "CELF1", "CELF2", "CELF3", "CIRBP", "CNBP", "CNOT4", "CPEB1", "CPEB2", "CPEB3", "CPEB4", "CPSF4", "CPSF4L", "CPSF6", "CPSF7", "CR388219.1", "CR388372.1", "CR753328.1", "CR759778.5", "CR759782.5", "CR847863.1", "CSDA", "CSDC2", "CSDE1", "CSTF2", "CSTF2T", "DAZ1", "DAZ2", "DAZ3", "DAZ4", "DAZAP1", "DAZL", "DDX41", "DDX43", "DDX53", "DHX57", "DHX8", "DNAJC17", "DPPA5", "DUS3L", "EIF2S1", "EIF3B", "EIF3G", "EIF4B", "EIF4H", "ELAVL1", "ELAVL2", "ELAVL3", "ELAVL4", "ENOX1", "ENOX2", "ENSG00000180771", "ENSG00000183403", "ENSG00000185728", "ENSG00000213250", "ENSG00000215042", "ENSG00000215492", "ENSG00000231942", "ENSG00000248163", "ENSG00000249536", "ENSG00000249644", "ENSG00000250177", "ERAL1", "ESRP1", "ESRP2", "EWSR1", "FMR1", "FUBP1", "FUBP3", "FUS", "Fusip1", "FXR1", "FXR2", "G3BP1", "G3BP2", "GRSF1", "HDLBP", "HELZ", "HNRNPA0", "HNRNPA1", "HNRNPA1L2", "HNRNPA2B1", "HNRNPA3", "HNRNPAB", "HNRNPC", "HNRNPCL1", "HNRNPD", "HNRNPF", "HNRNPH1", "HNRNPH2", "HNRNPH3", "hnRNPK", "HNRNPL", "hnRNPLL", "HNRNPM", "HNRNPR", "HNRPDL", "HTATSF1", "IGF2BP1", "IGF2BP2", "IGF2BP3", "KHDRBS1", "KHDRBS2", "KHDRBS3", "KHSRP", "KIAA0430", "KRR1", "LARP1", "LARP1B", "LARP4", "LARP4B", "LARP6", "LARP7", "LENG9", "LIN28A", "LIN28B", "MATR3", "MBNL1", "MBNL2", "MBNL3", "MDM2", "MEX3B", "MEX3C", "MEX3D", "MKI67IP", "MKRN1", "MKRN2", "MKRN3", "MRPS28", "MSI1", "MSI2", "MTHFSD", "MYEF2", "NCBP2", "NCBP2L", "NCL", "NEIL3", "NOL8", "NONO", "NOVA1", "NOVA2", "NPLOC4", "NUP153", "NUPL2", "PABPC1", "PABPC1L", "PABPC1L2A", "PABPC1L2B", "PABPC3", "PABPC4", "PABPC5", "PABPN1", "PABPN1L", "PAN3", "PARP12", "PCBP1", "PCBP2", "PCBP3", "PCBP4", "PDCD11", "PEG10", "PNMA3", "PNO1", "PNPT1", "POLDIP3", "POLR2G", "PPARGC1A", "PPARGC1B", "PPIE", "PPIL4", "PPP1R10", "PPRC1", "PRR3", "PSPC1", "PTBP1", "PTBP2", "PUF60", "PUM1", "PUM2", "QKI", "RALY", "RANBP2", "RAVER1", "RAVER2", "RBBP6", "RBCK1", "RBFOX2", "RBFOX3", "RBM10", "RBM11", "RBM12", "RBM12B", "RBM14-RBM4", "RBM15", "RBM15B", "RBM17", "RBM18", "RBM19", "RBM22", "RBM23", "RBM24", "RBM25", "RBM26", "RBM27", "RBM28", "RBM3", "RBM33", "RBM34", "RBM38", "RBM39", "RBM4", "RBM41", "RBM42", "RBM44", "RBM45", "RBM46", "RBM47", "RBM4B", "RBM5", "RBM6", "RBM7", "RBM8A", "RBMS1", "RBMS2", "RBMS3", "RBMX", "RBMX2", "RBMXL1", "RBMXL2", "RBMXL3", "RBMY1A1", "RBMY1B", "RBMY1D", "RBMY1E", "RBMY1F", "RBMY1J", "RBPMS", "RBPMS2", "RBP_Name", "RC3H1", "RC3H2", "RCAN2", "RDBP", "RDM1", "RNF113A", "RNF113B", "RNF31", "RNPC3", "RNPS1", "ROD1", "RP11-1286E23.4", "RRP7A", "RYBP", "SAFB", "SAFB2", "SAMD4A", "SAMD4B", "SART3", "SCAF4", "SCAF8", "SETD1A", "SETD1B", "SF1", "SF3B4", "SFPQ", "SHARPIN", "SLTM", "SNRNP35", "SNRNP70", "SNRPA", "SNRPB2", "SOLH", "SPEN", "SRBD1", "SREK1", "SRRT", "SRSF1", "SRSF11", "SRSF12", "SRSF2", "SRSF3", "SRSF4", "SRSF5", "SRSF6", "SRSF7", "SRSF9", "SSB", "STAR-PAP", "SUPT6H", "SYNCRIP", "TAB2", "TAB3", "TAF15", "TARDBP", "TDRD10", "TDRKH", "TEX13A", "THOC4", "TIA1", "TIAL1", "TMEM63A", "TMEM63B", "TNRC6A", "TNRC6B", "TNRC6C", "TOE1", "TRA2A", "TRA2B", "TRMT1", "TRMT2A", "TRNAU1AP", "TTC14", "U2AF1", "U2AF1L4", "U2AF2", "U2SURP", "UHMK1", "UNK", "UNKL", "UPF3B", "YAF2", "YB-1", "YBX2", "YTHDC1", "YTHDC2", "YTHDF1", "YTHDF2", "ZC3H10", "ZC3H13", "ZC3H15", "ZC3H18", "ZC3H3", "ZC3H4", "ZC3H6", "ZC3H7A", "ZC3H7B", "ZC3H8", "ZCCHC11", "ZCCHC13", "ZCCHC14", "ZCCHC17", "ZCCHC2", "ZCCHC3", "ZCCHC5", "ZCCHC6", "ZCCHC7", "ZCCHC8", "ZCCHC9", "ZCRB1", "ZFP36", "ZFP36L1", "ZFP36L2", "ZGPAT", "ZMAT5", "ZNF638", "ZRANB1", "ZRANB2", "ZRANB3", "ZRSR1", "ZRSR2")

 # ============================================
 # 5. Extract all PWMs from background & filter to RBPs
 # ============================================
 #all_pwms <- bg_model@pwms  #2287
 #rbp_pwms <- all_pwms[
 #  vapply(names(all_pwms),
 #         function(nm) any(nm %in% rbp_genes),
 #         logical(1))
 #]

 # 如果你想做模糊匹配（基因名作为子串出现），可以用：
 rbp_pwms <- all_pwms[grepl(paste(rbp_genes, collapse="|"), names(all_pwms), ignore.case=TRUE)]

 cat("Number of RBP PWMs selected:", length(rbp_pwms), "\n")
 cat("Example PWM names:\n", head(names(rbp_pwms), 10), "\n")

 # ============================================
 # 6. Create a custom background model with only RBP PWMs
 # ============================================
 custom_bg <- bg_model
 custom_bg@pwms <- rbp_pwms

 # ============================================
 # 7. Run motif enrichment
 # ============================================
 enrichment_result <- motifEnrichment(utr_3_down_clean, custom_bg)

 # ============================================
 # 8. Generate report & convert to data.frame
 # ============================================
 report_list <- groupReport(enrichment_result)
 report_df   <- as.data.frame(report_list)

 # ============================================
 # 9. Apply BH correction and filter significant hits
 # ============================================
 report_df$adj_pval <- p.adjust(report_df$p.value, method = "BH")
 significant_hits <- subset(report_df, adj_pval < 0.05)

 # ============================================
 # 10. View results
 # ============================================
 # Display motif name, raw p-value, adjusted p-value, and enrichment score
 colnames(significant_hits)[
 match(c("target","id","raw.score"), colnames(significant_hits))
 ] <- c("Gene", "Motif_ID", "EnrichmentScore")

 print(
 head(
     significant_hits[, c("Gene","Motif_ID","EnrichmentScore","p.value","adj_pval")],
     6
 )
 )

 p <- ggplot(significant_hits,
             aes(x = reorder(Motif_ID, adj_pval),
                 y = -log10(adj_pval))) +
 geom_col() +
 coord_flip() +
 labs(
     x = "Motif ID",
     y = expression(-log[10]("FDR‑adjusted p‑value")),
     title = "Significant RBP Motif Enrichment"
 ) +
 theme_minimal()
 ggsave(
 filename = "RBP_motif_enrichment.png",  # or any path you like
 plot     = p,
 width    = 8,     # inches
 height   = 6,     # inches
 dpi      = 300    # resolution
 )

Perform similar RBP-mapping by myself

 # Load a set of RBP motifs from a database (e.g., RBPDB) RBPDB, ENCODE, JASPAR, or even UCL's RBP motifs.

 # Set path to your PWM directory
 pfm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PFMDir"

 # List all .pfm files
 pfm_files <- list.files(pfm_dir, pattern = "\\.pfm$", full.names = TRUE)

 # Create an empty list to store PWMs
 pfm_list <- list()

 # Loop through each file
 for (file in pfm_files) {
 # Read table
 pfm <- tryCatch({
     mat <- read.table(file, header = FALSE)
     mat <- as.matrix(mat)

     # Optionally add column names (assuming 4 columns for A, C, G, T)
     if (ncol(mat) == 4) {
     colnames(mat) <- c("A", "C", "G", "T")
     }

     mat
 }, error = function(e) {
     message("Failed to read: ", file)
     NULL
 })

 # Use the filename (without extension) as list name
 name <- tools::file_path_sans_ext(basename(file))
 pfm_list[[name]] <- pfm
 }

 # Check one example
 str(pfm_list[[1]])
 #saveRDS(pfm_list, file = "compiled_pfm_list.rds")

 #The error you're seeing (invalid PFM 'x': not an integer matrix) indicates that the PWM function expects the input matrix to represent raw position frequency matrices (PFMs), which are expected to be integer counts, but the matrices you have are likely log-odds matrices (floating point values), which represent the log-transformed scores instead of raw counts.

 # convert to PFMatrix
 pfm_named_list <- lapply(pfm_list, function(mat) {
 mat <- as.matrix(mat)
 if (!all(rownames(mat) %in% c("A", "C", "G", "T"))) {
     rownames(mat) <- c("A", "C", "G", "T")
 }
 PFMatrix(ID = "motif", name = "motif", profileMatrix = mat)
 })

 # Create an empty list to store results
 match_results_list <- list()

 # Iterate through each PFMatrix object in the list
 for (i in seq_along(pfm_named_list)) {
 # Get the current PFMatrix object
 current_pfm <- pfm_named_list[[i]]

 # Run matchMotifs for the current motif
 match_result <- tryCatch({
     #matchMotifs(pfm_obj, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
     matchMotifs(current_pfm, subject = utr_3_down, genome = NULL, p.cutoff = 1e-3)
 }, error = function(e) {
     message("Error with motif ", names(pfm_named_list)[i], ": ", e$message)
     return(NULL)  # Return NULL if error occurs
 })

 # Store the result if no error
 if (!is.null(match_result)) {
     match_results_list[[names(pfm_named_list)[i]]] <- match_result
 }
 }

 # Check the results
 match_results_list

 # Extract motif match results for one motif
 motif_matches_1004_8676391 <- assay(match_results_list[["1004_8676391"]], "motifMatches")

 # Convert sparse matrix to dense matrix
 dense_matrix <- as.matrix(motif_matches_1004_8676391)

 # Optional: Add rownames for better visualization (if available)
 if (!is.null(names(utr_3_down))) {
 rownames(dense_matrix) <- names(utr_3_down)
 }

 # Convert logical matrix to numeric (1 for TRUE, 0 for FALSE)
 dense_matrix_num <- matrix(as.numeric(dense_matrix),
                         nrow = nrow(dense_matrix),
                         dimnames = dimnames(dense_matrix))

 # Now plot the heatmap
 png("motif_match_1004_8676391.png", width = 800, height = 1600)
 pheatmap(dense_matrix_num,
         cluster_rows = FALSE,
         cluster_cols = FALSE,
         color = c("grey", "steelblue"),
         legend_breaks = c(0, 1),
         legend_labels = c("No match", "Match"),
         main = "Motif Match: 1004_8676391")
 dev.off()

 #✅ Output
 #You will see a binary heatmap, where:
 #   * Blue cells = motif matched
 #   * White cells = no match
 #Let me know if you want to loop this over all motifs or save the plot to a file.

Does hg19 vs hg38 Position Matter in Motif Enrichment?

 ✅ If you're doing motif enrichment analysis only (like PWMEnrich):

 Position DOES NOT matter.
 Here's why:
 Aspect  Explanation
 🔍 PWMEnrich Only scans sequences for motif matches based on nucleotide content (e.g., enrichment of AU-rich motifs in downregulated 3′UTRs).
 📏 Coordinates   Genomic coordinates (hg19 vs hg38) are not used in enrichment. Only the sequence itself matters.
 ⚖️ Statistical Test The enrichment is based on observed motif frequency vs background expectations (GC-aware or custom). No genome alignment is done.

 So:

     ✅ As long as your 3′ UTR sequences are correct and biologically relevant, using an hg19 background is safe — even if your gene models are from hg38.

 ❗️ When does genome version matter?

 If you're doing things like:
 Use case    Position-sensitive?
 🔬 Variant effect on motifs (e.g., SNPs in motifs)   ✅ Yes
 🧭 Mapping motifs to genome positions (e.g., ChIP-seq overlaps)  ✅ Yes
 📦 Loading sequences from coordinates (not FASTA)    ✅ Yes
 🧠 Expression + genomic track integration (e.g., motif + ATAC-seq)   ✅ Yes

 Then you must ensure genome versions match exactly.
 ✅ In Your Case: Motif Enrichment of Downregulated 3′UTRs

     If you are:

     Feeding real 3′UTR sequences (as DNAStringSet) for downregulated genes

     Using PWMEnrich purely for motif enrichment

 Then:

     ✅ hg19 vs hg38 does not matter
     ❗ Only sequence content matters

 Let me know if you want to later map enriched motifs back to genome positions — that’s when we’d switch to position-aware tools like FIMO, MOODS, or rtracklayer.

Custom RBP-database preparation (failed!)

 # Example: Create a PWM for 2 fake motifs (replace with your actual PWMs)
 #motif1 <- matrix(c(0.2, 0.3, 0.3, 0.2,
 #                   0.1, 0.4, 0.4, 0.1,
 #                   0.25, 0.25, 0.25, 0.25,
 #                   0.3, 0.2, 0.3, 0.2), nrow = 4, byrow = TRUE,
 #                 dimnames = list(c("A", "C", "G", "T"), NULL))
 #motif2 <- matrix(c(0.3, 0.2, 0.2, 0.3,
 #                   0.4, 0.1, 0.4, 0.1,
 #                   0.25, 0.25, 0.25, 0.25,
 #                   0.2, 0.3, 0.2, 0.3), nrow = 4, byrow = TRUE,
 #                 dimnames = list(c("A", "C", "G", "T"), NULL))
 #rbp_pwms <- list(RBP1 = motif1, RBP2 = motif2)

 pwm_dir <- "/media/jhuang/Elements/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/motif_analysis/PWMDir"
 pwm_files <- list.files(pwm_dir, pattern = "\\.pwm$", full.names = TRUE)
 rbp_pwms <- list()
 for (file in pwm_files) {
 # Read table
 pwm <- tryCatch({
     mat <- read.table(file, header = FALSE)
     mat <- as.matrix(mat)

     # Optionally add column names (assuming 4 columns for A, C, G, T)
     if (ncol(mat) == 4) {
     colnames(mat) <- c("A", "C", "G", "T")
     }

     mat
 }, error = function(e) {
     message("Failed to read: ", file)
     NULL
 })

 # Use the filename (without extension) as list name
 name <- tools::file_path_sans_ext(basename(file))
 rbp_pwms[[name]] <- pwm
 }

 # -- filtering (ERROR) --
 pwm_list <- motifs(bg_model)
 # Example: filter motifs containing 'ELAVL' or 'HNRNP'
 #rbp_pwms <- pwm_list[grep("ELAVL|HNRNP|IGF2BP|RBFOX|FMR1|PUM", names(pwm_list))]
 rbp_enrichment <- motifEnrichment(utr_3_down, bg_model, pwmList = rbp_pwms)
 rbp_report <- groupReport(rbp_enrichment)

 # -- from MotifDb --
 library(MotifDb)
 human_motifs <- query(MotifDb, "Hsapiens")
 rbp_motifs <- query(human_motifs, "RBP")

 # ---- try Build your background model directly from these PWMs ----

 # 1. List all PWM files
 files_all <- list.files("~/Downloads/cisbp-rna/pwms_all_motifs", pattern="\\.txt$", full.names=TRUE)

 # 2. Filter out empty files
 file_sizes <- file.info(files_all)$size
 files     <- files_all[file_sizes > 0]
 cat("Reading", length(files), "non‑empty PWM files out of", length(files_all), "\n")

 # 3. Define the reader (as before)
 readCisbpPfm <- function(path) {
 df <- read.table(path, header=TRUE, stringsAsFactors=FALSE)
 mat_rna <- as.matrix(df[, c("A","C","G","U")])
 mat_dna <- t(mat_rna)
 rownames(mat_dna) <- c("A","C","G","T")
 sweep(mat_dna, 2, colSums(mat_dna), FUN="/")
 }

 # 4. Read only the non‐empty files
 cisbp_mats <- lapply(files, readCisbpPfm)
 names(cisbp_mats) <- tools::file_path_sans_ext(basename(files))

 # 5. (Optional) Check the first few
 head(names(cisbp_mats))
 length(cisbp_mats)  #193

 # 6. Build your background model directly from these PWMs:
 bg_utrs <- utr_3_background_clean    #readDNAStringSet("run2/3utr_background.fasta")
 #bg_utrs <- bg_utrs[ width(bg_utrs)>=6 & !grepl("[^ACGT]", as.character(bg_utrs)) ]

 custom_bg <- makeBackground(
 seqs    = bg_utrs,
 motifs = cisbp_mats,
 type    = "logn",
 organism = "hg19",
 verbose  = TRUE
 )

 custom_bg <- makeBackground(
 sequences = bg_utrs,       # your DNAStringSet of background UTRs
 motifs    = cisbp_mats,    # your named list of PWM matrices
 type      = "logn",        # log‑normal GC‑aware model
 organism  = NULL,          # disable built‑in genomes
 verbose   = TRUE
 )

 custom_bg <- makeBackground(
 cisbp_mats,        # 1) motifs
 organism = "hg19",
 type    = "logn",  # 2) background model type
 bg.seq  = bg_utrs, # 3) override organism by supplying your own sequences
 quick   = FALSE    # (optional) fit to all sequences for best accuracy
 )

 # 1. Pick a library size to re‑scale your PWMs into PFMs
 library_size <- 1000   # you can also try 1000

 # 2. Convert each normalized matrix into integer counts
 cisbp_pfms <- lapply(cisbp_mats, function(pmat) {
 counts <- round(pmat * library_size)
 # ensure no column sums to zero (add pseudocount 1 if needed)
 zero_cols <- which(colSums(counts) == 0)
 if (length(zero_cols) > 0) {
     counts[, zero_cols] <- 1
 }
 counts
 })

 # 3. Build a custom background (PFMs + your bg UTRs)
 custom_bg <- makeBackground(
 cisbp_pfms,      # 1) a list of integer PFMs
 type   = "logn", # 2) log‑normal GC‑aware model
 bg.seq = bg_utrs,# 3) your background DNAStringSet
 quick  = FALSE   # fit on all sequences
 )

 # … then continue with motifEnrichment() on your down‑regulated UTRs as before.

 # 5) Read & clean your down‑regulated 3′UTRs
 down_utrs <- readDNAStringSet("downregulated_utrs.fasta")
 down_utrs <- down_utrs[ width(down_utrs)>=6 & !grepl("[^ACGT]", as.character(down_utrs)) ]

 # 6) Run the enrichment
 res    <- motifEnrichment(down_utrs, custom_bg)
 report <- groupReport(res)
 df     <- as.data.frame(report)
 df$adj_pval <- p.adjust(df$p.value, method="BH")

 # 7) Subset significant
 sig <- subset(df, adj_pval < 0.05)
 print(sig[, c("motif.name","p.value","adj_pval","enrichmentScore")])

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Analysis of the RNA binding protein motifs for RNA Seq

Leave a Reply Cancel reply