-
Download RBP motifs (PWM) from ATtRACT_DB: ATtRACT_db.txt and pwm.txt; Convert to MEME format (if needed) (Under ~/DATA/Data_Ute/RBP_enrichments/)
# Both generate_named_meme.py and generate_attract_human_meme.py generate the attract_human.meme inkl. Gene_name # Note that "grep + generate_named_meme.py" = generate_attract_human_meme.py #grep "Homo_sapiens" ATtRACT_db.txt > attract_human.txt #3256 #python ~/Scripts/generate_named_meme.py pwm.txt attract_human.txt #OUTPUT: attract_human_named.meme #python ~/Scripts/generate_attract_human_meme.py pwm.txt ATtRACT_db.txt #OUTPUT: attract_human.meme and it is the same to attract_human_named.meme, both are named motifs, in total 1583 MOTIFs, for example "MOTIF 904_HNRNPH2", only for human-reading, not for pipeline #cut -f12 attract_human.txt | sort | uniq > valid_ids.txt python ~/Scripts/convert_attract_pwm_to_meme.py #Input is "pwm.txt" #OUTPUT: "attract_human.meme", in total 1583 MOTIFs, for example "MOTIF 904"
-
Download GENCODE (Under ~/REFs/)
#Visit and Download: GENCODE FTP site https://www.gencodegenes.org/human/ * GTF annotation file (e.g., gencode.v48.annotation.gtf.gz) * Corresponding genome FASTA (e.g., GRCh38.primary_assembly.genome.fa.gz) wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz gunzip gencode.v48.annotation.gtf.gz gunzip GRCh38.primary_assembly.genome.fa.gz
-
Get 3UTR.fasta, 5UTR.fasta, CDS.fasta and transcripts.fasta based on GENCODE-files (Under ~/DATA/Data_Ute/RBP_enrichments/)
grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt > MKL-1_wt.EV_vs_parental-up_protein_coding.txt grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt > MKL-1_wt.EV_vs_parental-down_protein_coding.txt grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-all.txt > MKL-1_wt.EV_vs_parental-all_protein_coding.txt grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt > WaGa_wt.EV_vs_parental-up_protein_coding.txt grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt > WaGa_wt.EV_vs_parental-down_protein_coding.txt grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-all.txt > WaGa_wt.EV_vs_parental-all_protein_coding.txt #Usage: python extract_transcript_parts.py <gene_list.txt> <gencode.gtf> <genome.fa> <out_prefix>, the script generates <out_prefix>.[transcripts|CDS|5UTR|3UTR].fasta files. python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_down #112 python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_up #5988 python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_background #19239 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_down #93 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_up #6538 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_background #19239 python ~/Scripts/filter_short_fasta.py MKL-1_up.3UTR.fasta MKL-1_up.filtered.3UTR.fasta python ~/Scripts/filter_short_fasta.py MKL-1_down.3UTR.fasta MKL-1_down.filtered.3UTR.fasta python ~/Scripts/filter_short_fasta.py MKL-1_background.3UTR.fasta MKL-1_background.filtered.3UTR.fasta # 检查背景文件中有多少序列: #grep -c "^>" MKL-1_background.filtered.3UTR.fasta #68890 # 检查背景 FIMO 命中的总序列数: #cut -f3 fimo_background_MKL-1_background/fimo.tsv | sort | uniq | wc -l #67841 python ~/Scripts/filter_short_fasta.py WaGa_background.3UTR.fasta WaGa_background.filtered.3UTR.fasta python ~/Scripts/filter_short_fasta.py WaGa_up.3UTR.fasta WaGa_up.filtered.3UTR.fasta python ~/Scripts/filter_short_fasta.py WaGa_down.3UTR.fasta WaGa_down.filtered.3UTR.fasta
-
FIMO for motif scan on 3UTR sequences
# Generate fimo_foreground_MKL-1_down/fimo.tsv mamba activate meme_env fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_down attract_human_named.meme MKL-1_down.3UTR.fasta fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_up attract_human.meme MKL-1_up.3UTR.fasta fimo --thresh 1e-4 --oc fimo_background_MKL-1_background attract_human.meme MKL-1_background.3UTR.fasta fimo --thresh 1e-4 --oc fimo_foreground_WaGa_down attract_human.meme WaGa_down.3UTR.fasta fimo --thresh 1e-4 --oc fimo_foreground_WaGa_up attract_human.meme WaGa_up.3UTR.fasta fimo --thresh 1e-4 --oc fimo_background_WaGa_background attract_human.meme WaGa_background.3UTR.fasta #Keep only one match per gene (based on Ensembl Gene ID like ENSG00000134871) for each RBP motif, even if multiple transcripts have hits. #python ~/Scripts/filter_fimo_best_per_gene.py --input fimo_foreground/fimo.tsv --output fimo_foreground/fimo.filtered.tsv python ~/Scripts/convert_gtf_to_Gene_annotation_TSV_file.py ~/REFs/Homo_sapiens.GRCh38.114.gtf Homo_sapiens.GRCh38.gene_annotation.tsv python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_foreground_MKL-1_down/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_foreground_MKL-1_down/fimo.filtered.tsv \ --output_annotated fimo_foreground_MKL-1_down/fimo.filtered.annotated.tsv #21559 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_foreground_MKL-1_up/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_foreground_MKL-1_up/fimo.filtered.tsv \ --output_annotated fimo_foreground_MKL-1_up/fimo.filtered.annotated.tsv #(736661 rows) python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_background_MKL-1_background/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_background_MKL-1_background/fimo.filtered.tsv \ --output_annotated fimo_background_MKL-1_background/fimo.filtered.annotated.tsv #(1869075 rows) python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_foreground_WaGa_down/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_foreground_WaGa_down/fimo.filtered.tsv \ --output_annotated fimo_foreground_WaGa_down/fimo.filtered.annotated.tsv #(20364 rows) python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_foreground_WaGa_up/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_foreground_WaGa_up/fimo.filtered.tsv \ --output_annotated fimo_foreground_WaGa_up/fimo.filtered.annotated.tsv #(805634 rows) python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_background_WaGa_background/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_background_WaGa_background/fimo.filtered.tsv \ --output_annotated fimo_background_WaGa_background/fimo.filtered.annotated.tsv #(1811615 rows) python ~/Scripts/run_enrichment.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_foreground_MKL-1_up/fimo.filtered.tsv \ --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \ --output rbp_enrichment_MKL-1_up.csv \ --strategy inclusive python ~/Scripts/run_enrichment.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_foreground_MKL-1_down/fimo.filtered.tsv \ --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \ --output rbp_enrichment_MKL-1_down.csv python ~/Scripts/run_enrichment.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_foreground_WaGa_up/fimo.filtered.tsv \ --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \ --output rbp_enrichment_WaGa_up.csv python ~/Scripts/run_enrichment.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_foreground_WaGa_down/fimo.filtered.tsv \ --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \ --output rbp_enrichment_WaGa_down.csv #Column Meaning in rbp_enrichment_*.[csv|xlsx] #a Number of unique foreground UTRs hit by the RBP #b Number of unique background UTRs hit by the RBP #c Total number of foreground UTRs #d Total number of background UTRs (⬅️ this is the value you're asking about) #p_value, fdr From Fisher's exact test on enrichment # The following pdf-files are not used and sent! python ~/Scripts/plot_volcano.py --csv rbp_enrichment_MKL-1_up.csv --output MKL-1_volcano_up.pdf --title "Upregulated MKL-1" python ~/Scripts/plot_rbp_heatmap.py \ --csvs rbp_enrichment_MKL-1_up.csv rbp_enrichment_MKL-1_down.csv \ --labels Upregulated Downregulated \ --output MKL-1_rbp_enrichment_heatmap.pdf #-- Get all genes the number 1621 refers to -- #AGO2,1621,5050,5732,12987,1.0,1.0 #MKL-1_up #motif_ids are 414 and 399 grep "^414" fimo.filtered.annotated.tsv > AGO2.txt grep "^399" fimo.filtered.annotated.tsv >> AGO2.txt cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt wc -l AGO2_uniq.txt #1621 AGO2_uniq.txt
-
miRNAs motif analysis using ATtRACT + FIMO
#* Extract their sequences #* Generate a background set #* Run RBP enrichment (e.g., with RBPmap or FIMO) #* Get p-adjusted enrichment stats (e.g., Fisher + BH) #Input_1. DE results (differential expression file from smallRNA-seq) #Input: up- and down-, all-regulated files #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-up.txt #66 #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-down.txt #38 #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-all.txt #1304 #Format: 1st column = miRNA ID (e.g., hsa-miR-21-5p), optionally with other stats. cp ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-*.txt . #"hsa-miR-3180|hsa-miR-3180-3p" #>hsa-miR-3180 MIMAT0018178 Homo sapiens miR-3180 #UGGGGCGGAGCUUCCGGAG #>hsa-miR-3180-3p MIMAT0015058 Homo sapiens miR-3180-3p #UGGGGCGGAGCUUCCGGAGGCC #Input_2. Reference FASTA (Reference sequences from miRBase or GENCODE) #From miRBase: https://mirbase.org/download/ https://mirbase.org/download/CURRENT/ ##miRBase_v21 #mature.fa.gz → contains mature miRNA sequences #hairpin.fa.gz → for pre-miRNAs mv ../RBP_enrichments_OLD_DEL/mature_v21.fa . # -- Extract Sequences + Background Set -- #Inputs: # * up_miRNA.txt and down_miRNA.txt: DE results (first column = miRNA name, e.g., hsa-miR-21-5p) # * mature.fa or hairpin.fa from miRBase #Outputs: # * mirna_up.fa # * mirna_down.fa # * mirna_background.fa #Use all remaining miRNAs as background: python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna --full-background mv mirna_background.fa mirna_full-background.fa #Use random subset background. Note that the generated background has the number of maxsize(up, down), in the case is up (84 records): #python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna # grep ">" mature_v21.fa | wc -l #35828 # grep ">" mirna_full-background.fa | wc -l #35710-->35723 # grep ">" mirna_up.fa | wc -l #84 # grep ">" mirna_down.fa | wc -l #34 # grep ">" mirna_background.fa | wc -l #84-->67 # #35,710 + 84 + 34 = 35,828 fimo --thresh 1e-4 --oc fimo_mirna_down attract_human.meme mirna_down.fa fimo --thresh 1e-4 --oc fimo_mirna_up attract_human.meme mirna_up.fa fimo --thresh 1e-4 --oc fimo_mirna_full-background attract_human.meme mirna_full-background.fa #fimo --thresh 1e-4 --oc fimo_mirna_background attract_human.meme mirna_background.fa python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_mirna_down/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_mirna_down/fimo.filtered.tsv \ --output_annotated fimo_mirna_down/fimo.filtered.annotated.tsv #21 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_mirna_up/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_mirna_up/fimo.filtered.tsv \ --output_annotated fimo_mirna_up/fimo.filtered.annotated.tsv #48 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ --input fimo_mirna_full-background/fimo.tsv \ --annot Homo_sapiens.GRCh38.gene_annotation.tsv \ --output_filtered fimo_mirna_full-background/fimo.filtered.tsv \ --output_annotated fimo_mirna_full-background/fimo.filtered.annotated.tsv #896 #python ~/Scripts/filter_fimo_best_per_gene_annotated.py \ #--input fimo_mirna_background/fimo.tsv \ #--annot Homo_sapiens.GRCh38.gene_annotation.tsv \ #--output_filtered fimo_mirna_background/fimo.filtered.tsv \ #--output_annotated fimo_mirna_background/fimo.filtered.annotated.tsv #57 python ~/Scripts/run_enrichment_miRNAs.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_mirna_up/fimo.filtered.tsv \ --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \ --output rbp_enrichment_mirna_up.csv \ --strategy inclusive python ~/Scripts/run_enrichment_miRNAs.py \ --attract ATtRACT_db.txt \ --fimo_fg fimo_mirna_down/fimo.filtered.tsv \ --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \ --output rbp_enrichment_mirna_down.csv \ --strategy inclusive #python ~/Scripts/run_enrichment_miRNAs.py \ # --attract ATtRACT_db.txt \ # --fimo_fg fimo_mirna_up/fimo.filtered.tsv \ # --fimo_bg fimo_mirna_background/fimo.filtered.tsv \ # --output rbp_enrichment_mirna_up_on_subset-background.csv \ # --strategy inclusive #python ~/Scripts/run_enrichment_miRNAs.py \ # --attract ATtRACT_db.txt \ # --fimo_fg fimo_mirna_down/fimo.filtered.tsv \ # --fimo_bg fimo_mirna_background/fimo.filtered.tsv \ # --output rbp_enrichment_mirna_down_on_subset-background.csv \ # --strategy inclusive #FXR2 1 (hsa-miR-92b-5p) 1 1 118 0.0168067226890756 0.365546218487395 #ORB2 1 (hsa-miR-4748) 1 1 118 0.0168067226890756 0.365546218487395 #-- Get all genes the number 1621 refers to -- grep "^FXR2" ATtRACT_db.txt #motif_ids is M020_0.6 grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv > FXR2.txt grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv #cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt #wc -l AGO2_uniq.txt (1621 records) grep "^ORB2" ATtRACT_db.txt grep "^M120_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
-
Reports for RBP enrichment results
Please find attached the results of the RNA-binding protein (RBP) enrichment analysis using FIMO and the ATtRACT motif database, along with a brief description of the procedures used for both the 3′ UTR-based analysis (RNA-seq) and the miRNA-based analysis (small RNA-seq). 1. RBP Motif Enrichment from RNA-seq (3′ UTRs) We focused on 3′ UTRs, as they are key regulatory regions for RBPs. Sequences shorter than 16 nucleotides were excluded. Using FIMO (from the MEME suite) with motifs from the ATtRACT database, we scanned both foreground and background 3′ UTR sets to identify motif occurrences. Foreground: Differentially expressed transcripts (e.g., MKL-1 up/down, WaGa up/down) Background: All non-differentially expressed transcripts Analysis: Fisher’s exact test was used to assess motif enrichment; p-values were adjusted using the Benjamini–Hochberg method. Output files (RNA-seq): * rbp_enrichment_MKL-1_down.xlsx / .png * rbp_enrichment_MKL-1_up.xlsx / .png * rbp_enrichment_WaGa_down.xlsx / .png * rbp_enrichment_WaGa_up.xlsx / .png 2. RBP Motif Enrichment from Small RNA-seq (miRNAs) This analysis focused on differentially expressed miRNAs, using either mature miRNA sequences from miRBase. We scanned for RBP binding motifs within these sequences using FIMO and assessed motif enrichment relative to background sets. Foreground: DE miRNAs (up/down) from small RNA-seq comparisons Background: All other miRNAs from miRBase Analysis: FIMO was used with --thresh 1e-4, followed by annotation and filtering. Enrichment was assessed using Fisher’s test + BH correction. Output files (miRNAs): * rbp_enrichment_mirna_down.xlsx * rbp_enrichment_mirna_up.xlsx How to Interpret the Numbers Each row in the result tables represents one RBP and its enrichment statistics: a: foreground genes/sequences with the motif b: background genes/sequences with the motif c: total number of foreground genes/sequences d: total number of background genes/sequences These values are used to compute p-values and FDRs. For example, in rbp_enrichment_MKL-1_up.xlsx, AGO2 has a = 1621, meaning FIMO detected AGO2 motifs in 1,621 genes in the MKL-1 upregulated set. These genes are listed in AGO2_uniq.txt. Similarly, for the miRNA analysis (e.g., rbp_enrichment_mirna_up.xlsx and rbp_enrichment_mirna_down.xlsx), the numbers represent counts of unique miRNAs with at least one significant motif hit. As examples, I calculated the detailed membership for FXR2 and ORB2. diff RBP_enrichments/rbp_enrichment_MKL-1_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.csv #diff RBP_enrichments/rbp_enrichment_MKL-1_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.xlsx #diff RBP_enrichments/rbp_enrichment_MKL-1_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.png diff RBP_enrichments/rbp_enrichment_MKL-1_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.csv #diff RBP_enrichments/rbp_enrichment_MKL-1_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.xlsx #diff RBP_enrichments/rbp_enrichment_MKL-1_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.png diff RBP_enrichments/rbp_enrichment_WaGa_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.csv #diff RBP_enrichments/rbp_enrichment_WaGa_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.xlsx #diff RBP_enrichments/rbp_enrichment_WaGa_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.png diff RBP_enrichments/rbp_enrichment_WaGa_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.csv #diff RBP_enrichments/rbp_enrichment_WaGa_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.xlsx #diff RBP_enrichments/rbp_enrichment_WaGa_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.png okular RBP_enrichments/MKL-1_volcano_up.pdf okular RBP_enrichments_OLD_DEL/MKL-1_volcano_up.pdf okular RBP_enrichments/MKL-1_rbp_enrichment_heatmap.pdf okular RBP_enrichments_OLD_DEL/MKL-1_rbp_enrichment_heatmap.pdf diff RBP_enrichments/rbp_enrichment_mirna_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.csv #diff RBP_enrichments/rbp_enrichment_mirna_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up.xlsx #diff RBP_enrichments/rbp_enrichment_mirna_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.png diff RBP_enrichments/rbp_enrichment_mirna_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.csv #diff RBP_enrichments/rbp_enrichment_mirna_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down.xlsx #diff RBP_enrichments/rbp_enrichment_mirna_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.png
-
Generate the sequence logos for the enriched RBP motives
import os import pandas as pd import matplotlib.pyplot as plt import logomaker from pathlib import Path import re # -------------- # Config # -------------- motif_table_path = "ATtRACT_db.txt" pwm_file_path = "pwm.txt" input_files = [ "rbp_enrichment_MKL-1_up.csv", "rbp_enrichment_MKL-1_down.csv", "rbp_enrichment_WaGa_up.csv", "rbp_enrichment_WaGa_down.csv", "rbp_enrichment_mirna_up.csv", "rbp_enrichment_mirna_down.csv" ] output_dir = "sequence_logos" os.makedirs(output_dir, exist_ok=True) # -------------- # Helper Functions # -------------- def load_pwm_file(pwm_path): pwm_dict = {} current_id = None pwm_matrix = [] with open(pwm_path, 'r') as f: for line in f: line = line.strip() if not line: continue if line.startswith('>'): if current_id and pwm_matrix: pwm_dict[current_id] = pwm_matrix parts = line[1:].split() current_id = parts[0] pwm_matrix = [] else: row = [float(x) for x in line.split()] if len(row) == 4: pwm_matrix.append(row) if current_id and pwm_matrix: pwm_dict[current_id] = pwm_matrix return pwm_dict def sanitize_filename(text): return re.sub(r'[^\w\-_.]', '_', text) def plot_logo(pwm_df, title, output_path): fig, ax = plt.subplots(figsize=(len(pwm_df)*0.5, 2)) logo = logomaker.Logo(pwm_df, ax=ax, color_scheme='classic') logo.style_spines(visible=False) logo.style_spines(spines=['left', 'bottom'], visible=True) ax.set_ylabel("Information") ax.set_title(title) fig.tight_layout() fig.savefig(output_path, bbox_inches='tight') plt.close(fig) # -------------- # Load Motif Table and Filter per your strategy # -------------- motif_table = pd.read_csv(motif_table_path, sep='\t') # Filter rows where Score ends with '**' motif_table = motif_table[motif_table['Score'].str.endswith('**')].copy() # Remove '**' and convert Score to float motif_table['Score'] = motif_table['Score'].str.replace(r'\*\*$', '', regex=True).astype(float) # Function to keep Homo sapiens rows if exist, otherwise all def keep_human_if_exists(group): human_rows = group[group['Organism'] == 'Homo_sapiens'] return human_rows if not human_rows.empty else group motif_table = motif_table.groupby('Gene_name', group_keys=False).apply(keep_human_if_exists) # If multiple remain per RBP, pick one randomly (seeded for reproducibility) motif_table = motif_table.groupby('Gene_name').apply(lambda g: g.sample(1, random_state=42)).reset_index(drop=True) # Build motif map for quick lookup motif_map = motif_table[['Gene_name', 'Matrix_id']].drop_duplicates() # Load PWM dictionary pwm_dict = load_pwm_file(pwm_file_path) # -------------- # Process Each Enrichment File # -------------- for file in input_files: print(f"\n📂 Processing {file}") df = pd.read_csv(file) if 'fdr' not in df.columns: print(f"⚠️ Skipping {file}: no 'fdr' column") continue sig_df = df[df['fdr'] <= 0.05].copy() for _, row in sig_df.iterrows(): rbp_name = row['RBP'] matches = motif_map[motif_map['Gene_name'] == rbp_name] if matches.empty: print(f" ⚠️ No motif entry for RBP: {rbp_name}") continue # Should be exactly one row per RBP now motif_row = matches.iloc[0] matrix_id = motif_row['Matrix_id'] if matrix_id not in pwm_dict: print(f" ⚠️ PWM not found for matrix ID {matrix_id} (RBP: {rbp_name})") continue pwm = pwm_dict[matrix_id] pwm_df = pd.DataFrame(pwm, columns=list("ACGT")) title = f"{rbp_name} ({matrix_id})" safe_name = sanitize_filename(f"{Path(file).stem}_{rbp_name}_{matrix_id}.png") out_path = os.path.join(output_dir, safe_name) plot_logo(pwm_df, title, out_path) print("\n✅ Sequence logo generation complete.")
-
Plot pie-chart for RNA-seq results
mamba activate plot-numpy1 (plot-numpy1) jhuang@WS-2290C:/media/jhuang/Elements1/Data_Ute$ python rna_type_piecharts.py /mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_MKL-1_human.txt Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_WaGa_human.txt
-
Code of rna_type_piecharts.py
import sys import pandas as pd import matplotlib.pyplot as plt from matplotlib import cm import numpy as np def parse_gtf(gtf_file): biotype_dict = {} with open(gtf_file) as f: for line in f: if line.startswith('#'): continue fields = line.strip().split('\t') if fields[2] != 'gene': continue attr_field = fields[8] attrs = {} for attr in attr_field.split(';'): attr = attr.strip() if attr == '': continue key, val = attr.split(' ', 1) attrs[key] = val.strip('"') gene_id = attrs.get('gene_id') gene_biotype = attrs.get('gene_biotype') or attrs.get('gene_type') # some GTFs use gene_type if gene_id and gene_biotype: biotype_dict[gene_id] = gene_biotype return biotype_dict def load_counts(mkl_file, waga_file): df_mkl = pd.read_csv(mkl_file, sep='\t') df_waga = pd.read_csv(waga_file, sep='\t') # Check for Geneid/gene_name match if not (df_mkl['Geneid'].equals(df_waga['Geneid']) and df_mkl['gene_name'].equals(df_waga['gene_name'])): mismatch = df_mkl.loc[ (df_mkl['Geneid'] != df_waga['Geneid']) | (df_mkl['gene_name'] != df_waga['gene_name']), ['Geneid', 'gene_name'] ] print("⚠️ Mismatched rows found between Geneid/gene_name columns:") print(mismatch) raise ValueError("Mismatch in Geneid/gene_name columns between MKL-1 and WaGa count files.") # Drop duplicated Geneid/gene_name from WaGa counts before merge df_waga_samples = df_waga.drop(columns=['Geneid', 'gene_name']) # Merge horizontally df = pd.concat([df_mkl, df_waga_samples], axis=1) return df, df_mkl, df_waga def map_biotypes(df, biotype_dict): # Map gene biotypes by Geneid; if not found, assign 'unknown' df['gene_biotype'] = df['Geneid'].map(biotype_dict).fillna('unknown') return df def save_raw_data_for_pie(df, output_excel): # Sum counts by biotype # Sum all sample columns (exclude Geneid, gene_name, gene_biotype) sample_cols = [c for c in df.columns if c not in ['Geneid', 'gene_name', 'gene_biotype']] df['total_counts'] = df[sample_cols].sum(axis=1) # Aggregate counts per biotype pie_data = df.groupby('gene_biotype')['total_counts'].sum().reset_index() with pd.ExcelWriter(output_excel) as writer: df.to_excel(writer, sheet_name='All_gene_counts', index=False) pie_data.to_excel(writer, sheet_name='Pie_data', index=False) return pie_data def plot_pie(pie_data, output_png): pie_data = pie_data[pie_data['total_counts'] > 0].sort_values('total_counts', ascending=False) N = 10 # Top N biotypes if len(pie_data) > N: top_data = pie_data.nlargest(N, 'total_counts') others = pd.DataFrame([{ 'gene_biotype': 'Other', 'total_counts': pie_data['total_counts'].sum() - top_data['total_counts'].sum() }]) pie_data = pd.concat([top_data, others], ignore_index=True) sizes = pie_data['total_counts'] labels = pie_data['gene_biotype'] # Use updated color map syntax cmap = plt.colormaps['Pastel1'] colors = [cmap(i % cmap.N) for i in range(len(labels))] fig, ax = plt.subplots(figsize=(10, 10)) # Draw pie chart without labels/autopct wedges, _ = ax.pie( sizes, startangle=90, colors=colors, radius=1.2 ) total = sum(sizes) for i, wedge in enumerate(wedges): angle = (wedge.theta2 + wedge.theta1) / 2. x = np.cos(np.deg2rad(angle)) y = np.sin(np.deg2rad(angle)) pct = sizes.iloc[i] / total * 100 if pct < 1: continue # Skip small slices ha = 'left' if x > 0 else 'right' ax.annotate( f"{labels.iloc[i]} ({pct:.1f}%)", xy=(x, y), xytext=(1.4 * x, 1.4 * y), ha=ha, va='center', fontsize=9, #arrowprops=dict(arrowstyle='-', color='gray') arrowprops=dict(arrowstyle='-', connectionstyle='angle3,angleA=0,angleB=90', color='gray') ) # Build detailed legend: biotype (count, %) legend_labels = [ f"{lbl} ({int(cnt):,}, {cnt / total:.1%})" for lbl, cnt in zip(labels, sizes) ] ax.legend( wedges, legend_labels, title="Gene Biotype", loc="center left", bbox_to_anchor=(1, 0.5), fontsize=9, title_fontsize=10 ) ax.set_title("", fontsize=14) ax.axis('equal') plt.tight_layout() plt.savefig(output_png, bbox_inches='tight') plt.close() def main(): if len(sys.argv) != 4: print("Usage: python rna_type_piecharts.py <genes.gtf> <MKL-1_counts.txt> <WaGa_counts.txt>") sys.exit(1) gtf_file = sys.argv[1] mkl_counts_file = sys.argv[2] waga_counts_file = sys.argv[3] print("🔍 Parsing gene biotypes from GTF...") biotype_dict = parse_gtf(gtf_file) print("📊 Loading and merging count matrices...") df, df_mkl, df_waga = load_counts(mkl_counts_file, waga_counts_file) print("🧬 Mapping gene biotypes...") df = map_biotypes(df, biotype_dict) df_mkl = map_biotypes(df_mkl, biotype_dict) df_waga = map_biotypes(df_waga, biotype_dict) print("💾 Saving raw data for pie charts to Excel...") pie_data = save_raw_data_for_pie(df, 'rna_biotype_pie_data.xlsx') pie_mkl_data = save_raw_data_for_pie(df_mkl, 'rna_biotype_pie_data_MKL-1.xlsx') pie_waga_data = save_raw_data_for_pie(df_waga, 'rna_biotype_pie_data_WaGa.xlsx') print("📈 Plotting pie chart...") plot_pie(pie_data, 'rna_biotype_pie_chart.png') plot_pie(pie_mkl_data, 'rna_biotype_pie_chart_MKL-1.png') plot_pie(pie_waga_data, 'rna_biotype_pie_chart_WaGa.png') print("✅ Done! Excel data saved to 'rna_biotype_pie*_data.xlsx' and pie chart saved to 'rna_biotype_pie*_chart.png'.") if __name__ == "__main__": main()
Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)
Leave a reply