Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)

Download RBP motifs (PWM) from ATtRACT_DB: ATtRACT_db.txt and pwm.txt; Convert to MEME format (if needed) (Under ~/DATA/Data_Ute/RBP_enrichments/)

 # Both generate_named_meme.py and generate_attract_human_meme.py generate the attract_human.meme inkl. Gene_name
 # Note that "grep + generate_named_meme.py" = generate_attract_human_meme.py
 #grep "Homo_sapiens" ATtRACT_db.txt > attract_human.txt  #3256
 #python ~/Scripts/generate_named_meme.py pwm.txt attract_human.txt  #OUTPUT: attract_human_named.meme
 #python ~/Scripts/generate_attract_human_meme.py pwm.txt ATtRACT_db.txt #OUTPUT: attract_human.meme and it is the same to attract_human_named.meme, both are named motifs, in total 1583 MOTIFs, for example "MOTIF 904_HNRNPH2", only for human-reading, not for pipeline

 #cut -f12 attract_human.txt | sort | uniq > valid_ids.txt
 python ~/Scripts/convert_attract_pwm_to_meme.py  #Input is "pwm.txt" #OUTPUT: "attract_human.meme", in total 1583 MOTIFs, for example "MOTIF 904"

Download GENCODE (Under ~/REFs/)

 #Visit and Download: GENCODE FTP site https://www.gencodegenes.org/human/
     * GTF annotation file (e.g., gencode.v48.annotation.gtf.gz)
     * Corresponding genome FASTA (e.g., GRCh38.primary_assembly.genome.fa.gz)
 wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/gencode.v48.annotation.gtf.gz
 wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_48/GRCh38.primary_assembly.genome.fa.gz
 gunzip gencode.v48.annotation.gtf.gz
 gunzip GRCh38.primary_assembly.genome.fa.gz

Get 3UTR.fasta, 5UTR.fasta, CDS.fasta and transcripts.fasta based on GENCODE-files (Under ~/DATA/Data_Ute/RBP_enrichments/)

 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-up.txt > MKL-1_wt.EV_vs_parental-up_protein_coding.txt
 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-down.txt > MKL-1_wt.EV_vs_parental-down_protein_coding.txt
 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/MKL-1_wt.EV_vs_parental-all.txt > MKL-1_wt.EV_vs_parental-all_protein_coding.txt
 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-up.txt > WaGa_wt.EV_vs_parental-up_protein_coding.txt
 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-down.txt > WaGa_wt.EV_vs_parental-down_protein_coding.txt
 grep ",\"protein_coding\"," ~/DATA/Data_Ute/Data_RNA-Seq_MKL-1+WaGa/results_2025_1/degenes/WaGa_wt.EV_vs_parental-all.txt > WaGa_wt.EV_vs_parental-all_protein_coding.txt

 #Usage: python extract_transcript_parts.py <gene_list.txt> <gencode.gtf> <genome.fa> <out_prefix>, the script generates <out_prefix>.[transcripts|CDS|5UTR|3UTR].fasta files.
 python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_down  #112
 python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_up  #5988
 python ~/Scripts/extract_transcript_parts.py MKL-1_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa MKL-1_background  #19239
 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-down_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_down  #93
 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-up_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_up  #6538
 python ~/Scripts/extract_transcript_parts.py WaGa_wt.EV_vs_parental-all_protein_coding.txt ~/REFs/gencode.v48.annotation.gtf ~/REFs/GRCh38.primary_assembly.genome.fa WaGa_background  #19239

 python ~/Scripts/filter_short_fasta.py MKL-1_up.3UTR.fasta MKL-1_up.filtered.3UTR.fasta
 python ~/Scripts/filter_short_fasta.py MKL-1_down.3UTR.fasta MKL-1_down.filtered.3UTR.fasta
 python ~/Scripts/filter_short_fasta.py MKL-1_background.3UTR.fasta MKL-1_background.filtered.3UTR.fasta
 # 检查背景文件中有多少序列：
 #grep -c "^>" MKL-1_background.filtered.3UTR.fasta
 #68890
 # 检查背景 FIMO 命中的总序列数：
 #cut -f3 fimo_background_MKL-1_background/fimo.tsv | sort | uniq | wc -l
 #67841
 python ~/Scripts/filter_short_fasta.py WaGa_background.3UTR.fasta WaGa_background.filtered.3UTR.fasta
 python ~/Scripts/filter_short_fasta.py WaGa_up.3UTR.fasta WaGa_up.filtered.3UTR.fasta
 python ~/Scripts/filter_short_fasta.py WaGa_down.3UTR.fasta WaGa_down.filtered.3UTR.fasta

FIMO for motif scan on 3UTR sequences

 # Generate fimo_foreground_MKL-1_down/fimo.tsv
 mamba activate meme_env
 fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_down attract_human_named.meme MKL-1_down.3UTR.fasta
 fimo --thresh 1e-4 --oc fimo_foreground_MKL-1_up attract_human.meme MKL-1_up.3UTR.fasta
 fimo --thresh 1e-4 --oc fimo_background_MKL-1_background attract_human.meme MKL-1_background.3UTR.fasta
 fimo --thresh 1e-4 --oc fimo_foreground_WaGa_down attract_human.meme WaGa_down.3UTR.fasta
 fimo --thresh 1e-4 --oc fimo_foreground_WaGa_up attract_human.meme WaGa_up.3UTR.fasta
 fimo --thresh 1e-4 --oc fimo_background_WaGa_background attract_human.meme WaGa_background.3UTR.fasta

 #Keep only one match per gene (based on Ensembl Gene ID like ENSG00000134871) for each RBP motif, even if multiple transcripts have hits.
 #python ~/Scripts/filter_fimo_best_per_gene.py --input fimo_foreground/fimo.tsv --output fimo_foreground/fimo.filtered.tsv

 python ~/Scripts/convert_gtf_to_Gene_annotation_TSV_file.py ~/REFs/Homo_sapiens.GRCh38.114.gtf Homo_sapiens.GRCh38.gene_annotation.tsv

 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_foreground_MKL-1_down/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_foreground_MKL-1_down/fimo.filtered.tsv \
 --output_annotated fimo_foreground_MKL-1_down/fimo.filtered.annotated.tsv
 #21559
 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_foreground_MKL-1_up/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_foreground_MKL-1_up/fimo.filtered.tsv \
 --output_annotated fimo_foreground_MKL-1_up/fimo.filtered.annotated.tsv
 #(736661 rows)
 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_background_MKL-1_background/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_background_MKL-1_background/fimo.filtered.tsv \
 --output_annotated fimo_background_MKL-1_background/fimo.filtered.annotated.tsv
 #(1869075 rows)
 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_foreground_WaGa_down/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_foreground_WaGa_down/fimo.filtered.tsv \
 --output_annotated fimo_foreground_WaGa_down/fimo.filtered.annotated.tsv
 #(20364 rows)
 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_foreground_WaGa_up/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_foreground_WaGa_up/fimo.filtered.tsv \
 --output_annotated fimo_foreground_WaGa_up/fimo.filtered.annotated.tsv
 #(805634 rows)
 python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
 --input fimo_background_WaGa_background/fimo.tsv \
 --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
 --output_filtered fimo_background_WaGa_background/fimo.filtered.tsv \
 --output_annotated fimo_background_WaGa_background/fimo.filtered.annotated.tsv
 #(1811615 rows)

 python ~/Scripts/run_enrichment.py \
     --attract ATtRACT_db.txt \
     --fimo_fg fimo_foreground_MKL-1_up/fimo.filtered.tsv \
     --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
     --output rbp_enrichment_MKL-1_up.csv \
     --strategy inclusive
 python ~/Scripts/run_enrichment.py \
     --attract ATtRACT_db.txt \
     --fimo_fg fimo_foreground_MKL-1_down/fimo.filtered.tsv \
     --fimo_bg fimo_background_MKL-1_background/fimo.filtered.tsv \
     --output rbp_enrichment_MKL-1_down.csv
 python ~/Scripts/run_enrichment.py \
     --attract ATtRACT_db.txt \
     --fimo_fg fimo_foreground_WaGa_up/fimo.filtered.tsv \
     --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
     --output rbp_enrichment_WaGa_up.csv
 python ~/Scripts/run_enrichment.py \
     --attract ATtRACT_db.txt \
     --fimo_fg fimo_foreground_WaGa_down/fimo.filtered.tsv \
     --fimo_bg fimo_background_WaGa_background/fimo.filtered.tsv \
     --output rbp_enrichment_WaGa_down.csv

 #Column Meaning in rbp_enrichment_*.[csv|xlsx]
 #a  Number of unique foreground UTRs hit by the RBP
 #b  Number of unique background UTRs hit by the RBP
 #c  Total number of foreground UTRs
 #d  Total number of background UTRs (⬅️ this is the value you're asking about)
 #p_value, fdr   From Fisher's exact test on enrichment

 # The following pdf-files are not used and sent!
 python ~/Scripts/plot_volcano.py --csv rbp_enrichment_MKL-1_up.csv --output MKL-1_volcano_up.pdf --title "Upregulated MKL-1"
 python ~/Scripts/plot_rbp_heatmap.py \
 --csvs rbp_enrichment_MKL-1_up.csv rbp_enrichment_MKL-1_down.csv \
 --labels Upregulated Downregulated \
 --output MKL-1_rbp_enrichment_heatmap.pdf

 #-- Get all genes the number 1621 refers to --
 #AGO2,1621,5050,5732,12987,1.0,1.0   #MKL-1_up
 #motif_ids are 414 and 399
 grep "^414" fimo.filtered.annotated.tsv > AGO2.txt
 grep "^399" fimo.filtered.annotated.tsv >> AGO2.txt
 cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
 wc -l AGO2_uniq.txt
 #1621 AGO2_uniq.txt

miRNAs motif analysis using ATtRACT + FIMO

     #* Extract their sequences
     #* Generate a background set
     #* Run RBP enrichment (e.g., with RBPmap or FIMO)
     #* Get p-adjusted enrichment stats (e.g., Fisher + BH)

     #Input_1. DE results (differential expression file from smallRNA-seq)
         #Input: up- and down-, all-regulated files
         #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-up.txt  #66
         #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-down.txt  #38
         #~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-all.txt  #1304
         #Format: 1st column = miRNA ID (e.g., hsa-miR-21-5p), optionally with other stats.

         cp ~/DATA/Data_Ute/Data_Ute_smallRNA_7/summaries_exo7/miRNAs/untreated_vs_parental_cells-*.txt .
         #"hsa-miR-3180|hsa-miR-3180-3p"
         #>hsa-miR-3180 MIMAT0018178 Homo sapiens miR-3180
         #UGGGGCGGAGCUUCCGGAG
         #>hsa-miR-3180-3p MIMAT0015058 Homo sapiens miR-3180-3p
         #UGGGGCGGAGCUUCCGGAGGCC

     #Input_2. Reference FASTA (Reference sequences from miRBase or GENCODE)
         #From miRBase: https://mirbase.org/download/  https://mirbase.org/download/CURRENT/
         ##miRBase_v21
         #mature.fa.gz → contains mature miRNA sequences
         #hairpin.fa.gz → for pre-miRNAs
         mv ../RBP_enrichments_OLD_DEL/mature_v21.fa .

     # -- Extract Sequences + Background Set --

     #Inputs:
     #    * up_miRNA.txt and down_miRNA.txt: DE results (first column = miRNA name, e.g., hsa-miR-21-5p)
     #    * mature.fa or hairpin.fa from miRBase

     #Outputs:
     #    * mirna_up.fa
     #    * mirna_down.fa
     #    * mirna_background.fa

     #Use all remaining miRNAs as background:
     python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna --full-background
     mv mirna_background.fa mirna_full-background.fa

     #Use random subset background. Note that the generated background has the number of maxsize(up, down), in the case is up (84 records):
     #python ~/Scripts/prepare_miRNA_sets.py untreated_vs_parental_cells-up.txt untreated_vs_parental_cells-down.txt mature_v21.fa mirna
     # grep ">" mature_v21.fa | wc -l  #35828
     # grep ">" mirna_full-background.fa | wc -l  #35710-->35723
     # grep ">" mirna_up.fa | wc -l  #84
     # grep ">" mirna_down.fa | wc -l  #34
     # grep ">" mirna_background.fa | wc -l  #84-->67
     # #35,710 + 84 + 34 = 35,828

     fimo --thresh 1e-4 --oc fimo_mirna_down attract_human.meme mirna_down.fa
     fimo --thresh 1e-4 --oc fimo_mirna_up attract_human.meme mirna_up.fa
     fimo --thresh 1e-4 --oc fimo_mirna_full-background attract_human.meme mirna_full-background.fa
     #fimo --thresh 1e-4 --oc fimo_mirna_background attract_human.meme mirna_background.fa

     python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
     --input fimo_mirna_down/fimo.tsv \
     --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
     --output_filtered fimo_mirna_down/fimo.filtered.tsv \
     --output_annotated fimo_mirna_down/fimo.filtered.annotated.tsv  #21
     python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
     --input fimo_mirna_up/fimo.tsv \
     --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
     --output_filtered fimo_mirna_up/fimo.filtered.tsv \
     --output_annotated fimo_mirna_up/fimo.filtered.annotated.tsv  #48
     python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
     --input fimo_mirna_full-background/fimo.tsv \
     --annot Homo_sapiens.GRCh38.gene_annotation.tsv \
     --output_filtered fimo_mirna_full-background/fimo.filtered.tsv \
     --output_annotated fimo_mirna_full-background/fimo.filtered.annotated.tsv  #896
     #python ~/Scripts/filter_fimo_best_per_gene_annotated.py \
     #--input fimo_mirna_background/fimo.tsv \
     #--annot Homo_sapiens.GRCh38.gene_annotation.tsv \
     #--output_filtered fimo_mirna_background/fimo.filtered.tsv \
     #--output_annotated fimo_mirna_background/fimo.filtered.annotated.tsv  #57

     python ~/Scripts/run_enrichment_miRNAs.py \
         --attract ATtRACT_db.txt \
         --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
         --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
         --output rbp_enrichment_mirna_up.csv \
         --strategy inclusive
     python ~/Scripts/run_enrichment_miRNAs.py \
         --attract ATtRACT_db.txt \
         --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
         --fimo_bg fimo_mirna_full-background/fimo.filtered.tsv \
         --output rbp_enrichment_mirna_down.csv \
         --strategy inclusive
     #python ~/Scripts/run_enrichment_miRNAs.py \
     #    --attract ATtRACT_db.txt \
     #    --fimo_fg fimo_mirna_up/fimo.filtered.tsv \
     #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
     #    --output rbp_enrichment_mirna_up_on_subset-background.csv \
     #    --strategy inclusive
     #python ~/Scripts/run_enrichment_miRNAs.py \
     #    --attract ATtRACT_db.txt \
     #    --fimo_fg fimo_mirna_down/fimo.filtered.tsv \
     #    --fimo_bg fimo_mirna_background/fimo.filtered.tsv \
     #    --output rbp_enrichment_mirna_down_on_subset-background.csv \
     #    --strategy inclusive

     #FXR2   1 (hsa-miR-92b-5p)  1   1   118 0.0168067226890756  0.365546218487395
     #ORB2   1 (hsa-miR-4748)    1   1   118 0.0168067226890756  0.365546218487395

     #-- Get all genes the number 1621 refers to --
     grep "^FXR2" ATtRACT_db.txt
     #motif_ids is M020_0.6
     grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv > FXR2.txt
     grep "^M020_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv
     #cut -d$'\t' -f11 AGO2.txt | sort -u > AGO2_uniq.txt
     #wc -l AGO2_uniq.txt (1621 records)

     grep "^ORB2" ATtRACT_db.txt
     grep "^M120_0.6" fimo_mirna_up/fimo.filtered.annotated.tsv

Reports for RBP enrichment results

 Please find attached the results of the RNA-binding protein (RBP) enrichment analysis using FIMO and the ATtRACT motif database, along with a brief description of the procedures used for both the 3′ UTR-based analysis (RNA-seq) and the miRNA-based analysis (small RNA-seq).

 1. RBP Motif Enrichment from RNA-seq (3′ UTRs)

 We focused on 3′ UTRs, as they are key regulatory regions for RBPs. Sequences shorter than 16 nucleotides were excluded. Using FIMO (from the MEME suite) with motifs from the ATtRACT database, we scanned both foreground and background 3′ UTR sets to identify motif occurrences.

 Foreground: Differentially expressed transcripts (e.g., MKL-1 up/down, WaGa up/down)
 Background: All non-differentially expressed transcripts

 Analysis: Fisher’s exact test was used to assess motif enrichment; p-values were adjusted using the Benjamini–Hochberg method.

 Output files (RNA-seq):

     * rbp_enrichment_MKL-1_down.xlsx / .png
     * rbp_enrichment_MKL-1_up.xlsx / .png
     * rbp_enrichment_WaGa_down.xlsx / .png
     * rbp_enrichment_WaGa_up.xlsx / .png

 2. RBP Motif Enrichment from Small RNA-seq (miRNAs)

 This analysis focused on differentially expressed miRNAs, using either mature miRNA sequences from miRBase. We scanned for RBP binding motifs within these sequences using FIMO and assessed motif enrichment relative to background sets.

 Foreground: DE miRNAs (up/down) from small RNA-seq comparisons
 Background: All other miRNAs from miRBase

 Analysis: FIMO was used with --thresh 1e-4, followed by annotation and filtering. Enrichment was assessed using Fisher’s test + BH correction.

 Output files (miRNAs):

     * rbp_enrichment_mirna_down.xlsx
     * rbp_enrichment_mirna_up.xlsx

 How to Interpret the Numbers
 Each row in the result tables represents one RBP and its enrichment statistics:

 a: foreground genes/sequences with the motif
 b: background genes/sequences with the motif
 c: total number of foreground genes/sequences
 d: total number of background genes/sequences

 These values are used to compute p-values and FDRs.

 For example, in rbp_enrichment_MKL-1_up.xlsx, AGO2 has a = 1621, meaning FIMO detected AGO2 motifs in 1,621 genes in the MKL-1 upregulated set. These genes are listed in AGO2_uniq.txt.

 Similarly, for the miRNA analysis (e.g., rbp_enrichment_mirna_up.xlsx and rbp_enrichment_mirna_down.xlsx), the numbers represent counts of unique miRNAs with at least one significant motif hit. As examples, I calculated the detailed membership for FXR2 and ORB2.

 diff RBP_enrichments/rbp_enrichment_MKL-1_up.csv  RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.csv
 #diff RBP_enrichments/rbp_enrichment_MKL-1_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.xlsx
 #diff RBP_enrichments/rbp_enrichment_MKL-1_up.png   RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_up.png
 diff RBP_enrichments/rbp_enrichment_MKL-1_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.csv
 #diff RBP_enrichments/rbp_enrichment_MKL-1_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.xlsx
 #diff RBP_enrichments/rbp_enrichment_MKL-1_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_MKL-1_down.png
 diff RBP_enrichments/rbp_enrichment_WaGa_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.csv
 #diff RBP_enrichments/rbp_enrichment_WaGa_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.xlsx
 #diff RBP_enrichments/rbp_enrichment_WaGa_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_up.png
 diff RBP_enrichments/rbp_enrichment_WaGa_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.csv
 #diff RBP_enrichments/rbp_enrichment_WaGa_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.xlsx
 #diff RBP_enrichments/rbp_enrichment_WaGa_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_WaGa_down.png
 okular RBP_enrichments/MKL-1_volcano_up.pdf
 okular RBP_enrichments_OLD_DEL/MKL-1_volcano_up.pdf
 okular RBP_enrichments/MKL-1_rbp_enrichment_heatmap.pdf
 okular RBP_enrichments_OLD_DEL/MKL-1_rbp_enrichment_heatmap.pdf

 diff RBP_enrichments/rbp_enrichment_mirna_up.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.csv
 #diff RBP_enrichments/rbp_enrichment_mirna_up.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up.xlsx
 #diff RBP_enrichments/rbp_enrichment_mirna_up.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_up_on_full-background.png
 diff RBP_enrichments/rbp_enrichment_mirna_down.csv RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.csv
 #diff RBP_enrichments/rbp_enrichment_mirna_down.xlsx RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down.xlsx
 #diff RBP_enrichments/rbp_enrichment_mirna_down.png RBP_enrichments_OLD_DEL/rbp_enrichment_mirna_down_on_full-background.png

Generate the sequence logos for the enriched RBP motives

 import os
 import pandas as pd
 import matplotlib.pyplot as plt
 import logomaker
 from pathlib import Path
 import re

 # --------------
 # Config
 # --------------
 motif_table_path = "ATtRACT_db.txt"
 pwm_file_path = "pwm.txt"
 input_files = [
     "rbp_enrichment_MKL-1_up.csv",
     "rbp_enrichment_MKL-1_down.csv",
     "rbp_enrichment_WaGa_up.csv",
     "rbp_enrichment_WaGa_down.csv",
     "rbp_enrichment_mirna_up.csv",
     "rbp_enrichment_mirna_down.csv"
 ]
 output_dir = "sequence_logos"
 os.makedirs(output_dir, exist_ok=True)

 # --------------
 # Helper Functions
 # --------------
 def load_pwm_file(pwm_path):
     pwm_dict = {}
     current_id = None
     pwm_matrix = []
     with open(pwm_path, 'r') as f:
         for line in f:
             line = line.strip()
             if not line:
                 continue
             if line.startswith('>'):
                 if current_id and pwm_matrix:
                     pwm_dict[current_id] = pwm_matrix
                 parts = line[1:].split()
                 current_id = parts[0]
                 pwm_matrix = []
             else:
                 row = [float(x) for x in line.split()]
                 if len(row) == 4:
                     pwm_matrix.append(row)
     if current_id and pwm_matrix:
         pwm_dict[current_id] = pwm_matrix
     return pwm_dict

 def sanitize_filename(text):
     return re.sub(r'[^\w\-_.]', '_', text)

 def plot_logo(pwm_df, title, output_path):
     fig, ax = plt.subplots(figsize=(len(pwm_df)*0.5, 2))
     logo = logomaker.Logo(pwm_df, ax=ax, color_scheme='classic')
     logo.style_spines(visible=False)
     logo.style_spines(spines=['left', 'bottom'], visible=True)
     ax.set_ylabel("Information")
     ax.set_title(title)
     fig.tight_layout()
     fig.savefig(output_path, bbox_inches='tight')
     plt.close(fig)

 # --------------
 # Load Motif Table and Filter per your strategy
 # --------------
 motif_table = pd.read_csv(motif_table_path, sep='\t')

 # Filter rows where Score ends with '**'
 motif_table = motif_table[motif_table['Score'].str.endswith('**')].copy()

 # Remove '**' and convert Score to float
 motif_table['Score'] = motif_table['Score'].str.replace(r'\*\*$', '', regex=True).astype(float)

 # Function to keep Homo sapiens rows if exist, otherwise all
 def keep_human_if_exists(group):
     human_rows = group[group['Organism'] == 'Homo_sapiens']
     return human_rows if not human_rows.empty else group

 motif_table = motif_table.groupby('Gene_name', group_keys=False).apply(keep_human_if_exists)

 # If multiple remain per RBP, pick one randomly (seeded for reproducibility)
 motif_table = motif_table.groupby('Gene_name').apply(lambda g: g.sample(1, random_state=42)).reset_index(drop=True)

 # Build motif map for quick lookup
 motif_map = motif_table[['Gene_name', 'Matrix_id']].drop_duplicates()

 # Load PWM dictionary
 pwm_dict = load_pwm_file(pwm_file_path)

 # --------------
 # Process Each Enrichment File
 # --------------
 for file in input_files:
     print(f"\n📂 Processing {file}")
     df = pd.read_csv(file)

     if 'fdr' not in df.columns:
         print(f"⚠️ Skipping {file}: no 'fdr' column")
         continue

     sig_df = df[df['fdr'] <= 0.05].copy()

     for _, row in sig_df.iterrows():
         rbp_name = row['RBP']
         matches = motif_map[motif_map['Gene_name'] == rbp_name]

         if matches.empty:
             print(f"  ⚠️ No motif entry for RBP: {rbp_name}")
             continue

         # Should be exactly one row per RBP now
         motif_row = matches.iloc[0]
         matrix_id = motif_row['Matrix_id']

         if matrix_id not in pwm_dict:
             print(f"  ⚠️ PWM not found for matrix ID {matrix_id} (RBP: {rbp_name})")
             continue

         pwm = pwm_dict[matrix_id]
         pwm_df = pd.DataFrame(pwm, columns=list("ACGT"))

         title = f"{rbp_name} ({matrix_id})"
         safe_name = sanitize_filename(f"{Path(file).stem}_{rbp_name}_{matrix_id}.png")
         out_path = os.path.join(output_dir, safe_name)

         plot_logo(pwm_df, title, out_path)

 print("\n✅ Sequence logo generation complete.")

Plot pie-chart for RNA-seq results

 mamba activate plot-numpy1
 (plot-numpy1) jhuang@WS-2290C:/media/jhuang/Elements1/Data_Ute$ python rna_type_piecharts.py /mnt/nvme1n1p1/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_MKL-1_human.txt Data_RNA-Seq_MKL-1+WaGa/merged_gene_counts_WaGa_human.txt

Code of rna_type_piecharts.py

 import sys
 import pandas as pd
 import matplotlib.pyplot as plt
 from matplotlib import cm
 import numpy as np

 def parse_gtf(gtf_file):
     biotype_dict = {}
     with open(gtf_file) as f:
         for line in f:
             if line.startswith('#'):
                 continue
             fields = line.strip().split('\t')
             if fields[2] != 'gene':
                 continue
             attr_field = fields[8]
             attrs = {}
             for attr in attr_field.split(';'):
                 attr = attr.strip()
                 if attr == '':
                     continue
                 key, val = attr.split(' ', 1)
                 attrs[key] = val.strip('"')
             gene_id = attrs.get('gene_id')
             gene_biotype = attrs.get('gene_biotype') or attrs.get('gene_type')  # some GTFs use gene_type
             if gene_id and gene_biotype:
                 biotype_dict[gene_id] = gene_biotype
     return biotype_dict

 def load_counts(mkl_file, waga_file):
     df_mkl = pd.read_csv(mkl_file, sep='\t')
     df_waga = pd.read_csv(waga_file, sep='\t')
     # Check for Geneid/gene_name match
     if not (df_mkl['Geneid'].equals(df_waga['Geneid']) and df_mkl['gene_name'].equals(df_waga['gene_name'])):
         mismatch = df_mkl.loc[
             (df_mkl['Geneid'] != df_waga['Geneid']) | (df_mkl['gene_name'] != df_waga['gene_name']),
             ['Geneid', 'gene_name']
         ]
         print("⚠️ Mismatched rows found between Geneid/gene_name columns:")
         print(mismatch)
         raise ValueError("Mismatch in Geneid/gene_name columns between MKL-1 and WaGa count files.")

     # Drop duplicated Geneid/gene_name from WaGa counts before merge
     df_waga_samples = df_waga.drop(columns=['Geneid', 'gene_name'])

     # Merge horizontally
     df = pd.concat([df_mkl, df_waga_samples], axis=1)
     return df, df_mkl, df_waga

 def map_biotypes(df, biotype_dict):
     # Map gene biotypes by Geneid; if not found, assign 'unknown'
     df['gene_biotype'] = df['Geneid'].map(biotype_dict).fillna('unknown')
     return df

 def save_raw_data_for_pie(df, output_excel):
     # Sum counts by biotype
     # Sum all sample columns (exclude Geneid, gene_name, gene_biotype)
     sample_cols = [c for c in df.columns if c not in ['Geneid', 'gene_name', 'gene_biotype']]
     df['total_counts'] = df[sample_cols].sum(axis=1)

     # Aggregate counts per biotype
     pie_data = df.groupby('gene_biotype')['total_counts'].sum().reset_index()

     with pd.ExcelWriter(output_excel) as writer:
         df.to_excel(writer, sheet_name='All_gene_counts', index=False)
         pie_data.to_excel(writer, sheet_name='Pie_data', index=False)

     return pie_data

 def plot_pie(pie_data, output_png):
     pie_data = pie_data[pie_data['total_counts'] > 0].sort_values('total_counts', ascending=False)

     N = 10  # Top N biotypes
     if len(pie_data) > N:
         top_data = pie_data.nlargest(N, 'total_counts')
         others = pd.DataFrame([{
             'gene_biotype': 'Other',
             'total_counts': pie_data['total_counts'].sum() - top_data['total_counts'].sum()
         }])
         pie_data = pd.concat([top_data, others], ignore_index=True)

     sizes = pie_data['total_counts']
     labels = pie_data['gene_biotype']

     # Use updated color map syntax
     cmap = plt.colormaps['Pastel1']
     colors = [cmap(i % cmap.N) for i in range(len(labels))]

     fig, ax = plt.subplots(figsize=(10, 10))

     # Draw pie chart without labels/autopct
     wedges, _ = ax.pie(
         sizes,
         startangle=90,
         colors=colors,
         radius=1.2
     )

     total = sum(sizes)

     for i, wedge in enumerate(wedges):
         angle = (wedge.theta2 + wedge.theta1) / 2.
         x = np.cos(np.deg2rad(angle))
         y = np.sin(np.deg2rad(angle))

         pct = sizes.iloc[i] / total * 100
         if pct < 1:
             continue  # Skip small slices

         ha = 'left' if x > 0 else 'right'
         ax.annotate(
             f"{labels.iloc[i]} ({pct:.1f}%)",
             xy=(x, y),
             xytext=(1.4 * x, 1.4 * y),
             ha=ha, va='center',
             fontsize=9,
             #arrowprops=dict(arrowstyle='-', color='gray')
             arrowprops=dict(arrowstyle='-', connectionstyle='angle3,angleA=0,angleB=90', color='gray')
         )

     # Build detailed legend: biotype (count, %)
     legend_labels = [
         f"{lbl} ({int(cnt):,}, {cnt / total:.1%})"
         for lbl, cnt in zip(labels, sizes)
     ]

     ax.legend(
         wedges,
         legend_labels,
         title="Gene Biotype",
         loc="center left",
         bbox_to_anchor=(1, 0.5),
         fontsize=9,
         title_fontsize=10
     )

     ax.set_title("", fontsize=14)
     ax.axis('equal')
     plt.tight_layout()
     plt.savefig(output_png, bbox_inches='tight')
     plt.close()

 def main():
     if len(sys.argv) != 4:
         print("Usage: python rna_type_piecharts.py <genes.gtf> <MKL-1_counts.txt> <WaGa_counts.txt>")
         sys.exit(1)

     gtf_file = sys.argv[1]
     mkl_counts_file = sys.argv[2]
     waga_counts_file = sys.argv[3]

     print("🔍 Parsing gene biotypes from GTF...")
     biotype_dict = parse_gtf(gtf_file)

     print("📊 Loading and merging count matrices...")
     df, df_mkl, df_waga = load_counts(mkl_counts_file, waga_counts_file)

     print("🧬 Mapping gene biotypes...")
     df = map_biotypes(df, biotype_dict)
     df_mkl = map_biotypes(df_mkl, biotype_dict)
     df_waga = map_biotypes(df_waga, biotype_dict)

     print("💾 Saving raw data for pie charts to Excel...")
     pie_data = save_raw_data_for_pie(df, 'rna_biotype_pie_data.xlsx')
     pie_mkl_data = save_raw_data_for_pie(df_mkl, 'rna_biotype_pie_data_MKL-1.xlsx')
     pie_waga_data = save_raw_data_for_pie(df_waga, 'rna_biotype_pie_data_WaGa.xlsx')

     print("📈 Plotting pie chart...")
     plot_pie(pie_data, 'rna_biotype_pie_chart.png')
     plot_pie(pie_mkl_data, 'rna_biotype_pie_chart_MKL-1.png')
     plot_pie(pie_waga_data, 'rna_biotype_pie_chart_WaGa.png')

     print("✅ Done! Excel data saved to 'rna_biotype_pie*_data.xlsx' and pie chart saved to 'rna_biotype_pie*_chart.png'.")

 if __name__ == "__main__":
     main()

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Workflow for RNA-Binding Protein Enrichment and RNA Type Distribution Analysis (Ute’s Project)

Leave a Reply Cancel reply