Comprehensive smallRNA-7 profiling using exceRpt pipeline with full reference databases (v2)

Input data

 # name                         condition
 # ----------------------------------------------
 # 0403_WaGa_wt                 parental_cells_1.fastq.gz
 # #0505_WaGa_wt_EV_RNA         untreated_1.fastq.gz
 # #0505_WaGa_sT_DMSO_EV_RNA    DMSO_control_1.fastq.gz
 # #0505_WaGa_sT_Dox_EV_RNA     sT_knockdown_1.fastq.gz
 # #0505_WaGa_scr_DMSO_EV_RNA   scr_DMSO_control_1.fastq.gz
 # #0505_WaGa_scr_Dox_EV_RNA    scr_control_1.fastq.gz
 # #1905_WaGa_wt_EV_RNA         untreated_2.fastq.gz
 # #1905_WaGa_sT_DMSO_EV_RNA    DMSO_control_2.fastq.gz
 # #1905_WaGa_sT_Dox_EV_RNA     sT_knockdown_2.fastq.gz
 # #1905_WaGa_scr_DMSO_EV_RNA   scr_DMSO_control_2.fastq.gz
 # #1905_WaGa_scr_Dox_EV_RNA    scr_control_2.fastq.gz
 #
 # WaGa_wt_cells_1              parental_cells_2.fastq.gz
 # WaGa_wt_cells_2              parental_cells_3.fastq.gz
 # #2001_WaGa_sT_DMSO           DMSO_control_3.fastq.gz
 # #2001_WaGa_sT_Dox            sT_knockdown_3.fastq.gz
 # #2001_WaGa_scr_DMSO          scr_DMSO_control_3.fastq.gz
 # #2001_WaGa_scr_Dox           scr_control_3.fastq.gz
 #
 # WaGa_wt_cells_1              parental_cells_2_R2.fastq.gz
 # WaGa_wt_cells_2              parental_cells_3_R2.fastq.gz
 # #2001_WaGa_sT_DMSO           DMSO_control_3_R2.fastq.gz
 # #2001_WaGa_sT_Dox            sT_knockdown_3_R2.fastq.gz
 # #2001_WaGa_scr_DMSO          scr_DMSO_control_3_R2.fastq.gz
 # #2001_WaGa_scr_Dox           scr_control_3_R2.fastq.gz

 mkdir ~/DATA/Data_Ute/Data_Ute_smallRNA_7/raw_data
 cd raw_data
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_3/220617_NB501882_0371_AH7572BGXM/nf774/0403_WaGa_wt_S20_R1_001.fastq.gz parental_cells_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf930/01_0505_WaGa_wt_EV_RNA_S1_R1_001.fastq.gz untreated_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf931/02_0505_WaGa_sT_DMSO_EV_RNA_S2_R1_001.fastq.gz DMSO_control_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf932/03_0505_WaGa_sT_Dox_EV_RNA_S3_R1_001.fastq.gz sT_knockdown_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf933/04_0505_WaGa_scr_DMSO_EV_RNA_S4_R1_001.fastq.gz scr_DMSO_control_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf934/05_0505_WaGa_scr_Dox_EV_RNA_S5_R1_001.fastq.gz scr_control_1.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf935/06_1905_WaGa_wt_EV_RNA_S6_R1_001.fastq.gz untreated_2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf936/07_1905_WaGa_sT_DMSO_EV_RNA_S7_R1_001.fastq.gz DMSO_control_2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf937/08_1905_WaGa_sT_Dox_EV_RNA_S8_R1_001.fastq.gz sT_knockdown_2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf938/09_1905_WaGa_scr_DMSO_EV_RNA_S9_R1_001.fastq.gz scr_DMSO_control_2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/231016_NB501882_0435_AHG7HMBGXV/nf939/10_1905_WaGa_scr_Dox_EV_RNA_S10_R1_001.fastq.gz scr_control_2.fastq.gz

 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf961/WaGaWTcells_1_S1_R1_001.fastq.gz parental_cells_2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf962/WaGaWTcells_2_S2_R1_001.fastq.gz parental_cells_3.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf971/2001_WaGa_sT_DMSO_S3_R1_001.fastq.gz DMSO_control_3.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf972/2001_WaGa_sT_Dox_S4_R1_001.fastq.gz sT_knockdown_3.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf973/2001_WaGa_scr_DMSO_S5_R1_001.fastq.gz scr_DMSO_control_3.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf974/2001_WaGa_scr_Dox_S6_R1_001.fastq.gz scr_control_3.fastq.gz

 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf961/WaGaWTcells_1_S1_R2_001.fastq.gz parental_cells_2_R2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf962/WaGaWTcells_2_S2_R2_001.fastq.gz parental_cells_3_R2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf971/2001_WaGa_sT_DMSO_S3_R2_001.fastq.gz DMSO_control_3_R2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf972/2001_WaGa_sT_Dox_S4_R2_001.fastq.gz sT_knockdown_3_R2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf973/2001_WaGa_scr_DMSO_S5_R2_001.fastq.gz scr_DMSO_control_3_R2.fastq.gz
 ln -s ~/DATA/Data_Ute/Data_Ute_smallRNA_7/250411_VH00358_135_AAGKGLHM5/nf974/2001_WaGa_scr_Dox_S6_R2_001.fastq.gz scr_control_3_R2.fastq.gz

 #awk '{print $2}' temp3

Adapter trimming

 #some common adapter sequences from different kits for reference:
 #    - TruSeq Small RNA (Illumina): TGGAATTCTCGGGTGCCAAGG
 #    - Small RNA Kits V1 (Illumina): TCGTATGCCGTCTTCTGCTTGT
 #    - Small RNA Kits V1.5 (Illumina): ATCTCGTATGCCGTCTTCTGCTTG
 #    - NEXTflex Small RNA Sequencing Kit v3 for Illumina Platforms (Bioo Scientific): TGGAATTCTCGGGTGCCAAGG
 #    - LEXOGEN Small RNA-Seq Library Prep Kit (Illumina): TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC *
 mkdir trimmed; cd trimmed
 for sample in parental_cells_1 untreated_1 DMSO_control_1 sT_knockdown_1 scr_DMSO_control_1 scr_control_1 untreated_2 DMSO_control_2 sT_knockdown_2 scr_DMSO_control_2 scr_control_2 parental_cells_2 parental_cells_3 DMSO_control_3 sT_knockdown_3 scr_DMSO_control_3 scr_control_3 parental_cells_2_R2 parental_cells_3_R2 DMSO_control_3_R2 sT_knockdown_3_R2 scr_DMSO_control_3_R2 scr_control_3_R2; do
   echo "------------------------------------ cutadapting the ${sample} -----------------------------------" >> LOG
   cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 5 --trim-n -o ${sample}.fastq.gz ../raw_data/${sample}.fastq.gz >> LOG
 done

 # In LOG file to look the differences of the R1 and R2 reads based on the statistics of trimming.

 #Reads with adapters:                10,114,799 (79.9%)
 #Reads with adapters:                   240,366 (1.9%)
 #Reads with adapters:                   233,380 (1.6%)
 #Reads with adapters:                   230,664 (1.3%)
 #Reads with adapters:                   207,717 (1.3%)
 #Reads with adapters:                   186,080 (1.2%)
 #Reads with adapters:                   577,429 (1.5%)
 #Reads with adapters:                   268,867 (1.7%)
 #Reads with adapters:                   325,300 (1.4%)
 #Reads with adapters:                   314,540 (1.5%)
 #Reads with adapters:                   264,349 (1.5%)

 #Reads with adapters:                   299,677 (0.7%)
 #Reads with adapters:                   108,801 (0.6%)
 #Reads with adapters:                     5,095 (0.0%)
 #Reads with adapters:                     6,989 (0.0%)
 #Reads with adapters:                     3,868 (0.0%)
 #Reads with adapters:                     2,173 (0.0%)

 #Reads with adapters:                   615,334 (1.4%)
 #Reads with adapters:                   258,388 (1.5%)
 #Reads with adapters:                   294,325 (1.4%)
 #Reads with adapters:                   336,932 (1.8%)
 #Reads with adapters:                   239,288 (2.0%)
 #Reads with adapters:                   117,544 (1.5%)

 #Alternatively, we can also cut adapter in the exceRpt built-in functions since 'grep "TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC" /mnt/nvme0n1p1/MyexceRptDatabase/adapters/adapters.fa | wc -l' results in 48 records. However, explicitly cut adapter before is more ensured.

 #TODO: check if the R1 and R2 has the similar data distribution? Then decide if only R1 or both used for the downstream analysis?
 cat parental_cells_2.fastq.gz parental_cells_2_R2.fastq.gz > parental_cells_2_merged.fastq.gz
 cat parental_cells_3.fastq.gz parental_cells_3_R2.fastq.gz > parental_cells_3_merged.fastq.gz
 cat DMSO_control_3.fastq.gz DMSO_control_3_R2.fastq.gz > DMSO_control_3_merged.fastq.gz
 cat sT_knockdown_3.fastq.gz sT_knockdown_3_R2.fastq.gz > sT_knockdown_3_merged.fastq.gz
 cat scr_DMSO_control_3.fastq.gz scr_DMSO_control_3_R2.fastq.gz > scr_DMSO_control_3_merged.fastq.gz
 cat scr_control_3.fastq.gz scr_control_3_R2.fastq.gz > scr_control_3_merged.fastq.gz

 #Scenario   Option to use
 #-----------------------------
 #Trimming Read 1 only   -a
 #Trimming Read 2 only   -a
 #Trimming paired-end together   -a and -A
 #cutadapt -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 5 --trim-n -o ${sample}_R2_trimmed.fastq.gz ../raw_data/${sample}_R2.fastq.gz
 cutadapt \
 -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC \
 -A TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC \
 -q 20 --minimum-length 5 --trim-n \
 -o ${sample}_R1_trimmed.fastq.gz -p ${sample}_R2_trimmed.fastq.gz \
 ../raw_data/${sample}_R1.fastq.gz ../raw_data/${sample}_R2.fastq.gz

 # -- check if it is necessary to remove adapter from 5'-end --
 #(Option_1) cutadapt -g TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -o /dev/null --report=minimal 0505_WaGa_wt_cutadapted.fastq.gz --> The trimming statistics in the output will show how often 5'-end adapters were removed.
 #(Option 2) zcat your_sample.fastq.gz | grep 'TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC' | head -n 20
 #(Option 3) fastqc your_sample.fastq.gz
 #Open the generated HTML report and check:
 #    The "Overrepresented sequences" section for adapter sequences.
 #    The "Per base sequence content" plot to see if there are unexpected sequences at the start of reads.
 #(If check results shows both ends contain adapter) cutadapt -g TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC -q 20 --minimum-length 10 -o ${sample}_trimmed.fastq.gz ${sample}.fastq.gz >> LOG2
 #    -g → Trims 5'-end adapters
 #    -a → Trims 3'-end adapters; -a TGGAATTCTCGGGTGCCAAGGAACTCCAGTCAC → Specifies the adapter sequence to be removed from the 3' end of the reads. The sequence provided is common in RNA-seq libraries (e.g., Illumina small RNA sequencing).
 #    -q 20 → Performs quality trimming at both read ends, removing bases with a Phred quality score below 20.

Install exceRpt (https://github.gersteinlab.org/exceRpt/)

 docker pull rkitchen/excerpt
 mkdir MyexceRptDatabase
 cd /mnt/nvme0n1p1/MyexceRptDatabase
 wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg38_lowmem.tgz
 tar -xvf exceRptDB_v4_hg38_lowmem.tgz
 #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg19_lowmem.tgz
 #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_hg38_lowmem.tgz
 #http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_mm10_lowmem.tgz
 wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_EXOmiRNArRNA.tgz
 tar -xvf exceRptDB_v4_EXOmiRNArRNA.tgz
 wget http://org.gersteinlab.excerpt.s3-website-us-east-1.amazonaws.com/exceRptDB_v4_EXOGenomes.tgz
 tar -xvf exceRptDB_v4_EXOGenomes.tgz

Run exceRpt

 #[---- REAL_RUNNING_COMPLETE_DB ---->]
 #NOTE that if not renamed in the input files, then have to RENAME all files recursively by removing "_cutadapted.fastq" in all names in _CORE_RESULTS_v4.6.3.tgz (first unzip, removing, then zip, mv to ../results_g).
 cd trimmed
 #for file in *_cutadapted.fastq.gz; do
 #    echo "mv \"$file\" \"${file/_cutadapted.fastq/}\""
 #done
 for file in *.fastq.gz; do
     echo "mv \"$file\" \"${file/.fastq/}\""
 done

 mkdir results_exo6
 for sample in parental_cells_2 parental_cells_3 DMSO_control_3 sT_knockdown_3 scr_DMSO_control_3 scr_control_3    parental_cells_2_R2 parental_cells_3_R2 DMSO_control_3_R2 sT_knockdown_3_R2 scr_DMSO_control_3_R2 scr_control_3_R2    parental_cells_2_merged parental_cells_3_merged DMSO_control_3_merged sT_knockdown_3_merged scr_DMSO_control_3_merged scr_control_3_merged    parental_cells_1 untreated_1 DMSO_control_1 sT_knockdown_1 scr_DMSO_control_1 scr_control_1 untreated_2 DMSO_control_2 sT_knockdown_2 scr_DMSO_control_2 scr_control_2; do
     docker run -v ~/DATA/Data_Ute/Data_Ute_smallRNA_7/trimmed:/exceRptInput \
                -v ~/DATA/Data_Ute/Data_Ute_smallRNA_7/results_exo6:/exceRptOutput \
               -v /mnt/nvme0n1p1/MyexceRptDatabase:/exceRpt_DB \
               -t rkitchen/excerpt \
               INPUT_FILE_PATH=/exceRptInput/${sample}.gz MAIN_ORGANISM_GENOME_ID=hg38 N_THREADS=50 JAVA_RAM='200G' MAP_EXOGENOUS=on
 done

 #DEBUG the excerpt env
 docker inspect rkitchen/excerpt:latest
 # Without /bin/bash → May run and exit immediately
 #docker run -it rkitchen/excerpt
 # With /bin/bash → Stays open for interaction
 docker run -it --entrypoint /bin/bash rkitchen/excerpt

 #TODO: In the read2 exists the following adapter2, to test if the adapter can be identified and removed with the pipeline!

Processing exceRpt output from multiple samples

   mkdir summaries_exo6
   cd ~/DATA/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master
   (r_env) jhuang@WS-2290C:~/DATA/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master$ R
   #WARNING: need to reload the R-script after each change of the script.
   source("mergePipelineRuns_functions.R")

   getwd()
   #[1] "/media/jhuang/Elements/Data_Ute/Data_Ute_smallRNA_7/exceRpt-master"
   processSamplesInDir("../results_exo6/", "../summaries_exo6")

   #~/Tools/csv2xls-0.4/csv_to_xls.py exceRpt_miRNA_ReadsPerMillion.txt exceRpt_tRNA_ReadsPerMillion.txt exceRpt_piRNA_ReadsPerMillion.txt -d$'\t' -o exceRpt_results_detailed.xls

Re-draw the heatmap plots

   #genome   97.9%   98.3%   21.3%   44.9%   81.4%   78.3%   78.5%   79.3%   73.3%   69.2%   65.6%   71.9%
   #miRNA_sense  84.7%   85.6%   3.5%    7.1%    16.2%   14.7%   15.8%   15.3%   7.5%    7.0%    12.9%   14.6%
   #miRNA_antisense  0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #
   #miRNAprecursor_sense 0.1%    0.1%    0.0%    0.0%    0.1%    0.1%    0.0%    0.1%    0.0%    0.0%    0.0%    0.0%
   #miRNAprecursor_antisense 0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #
   #tRNA_sense   3.4%    1.8%    8.4%    25.3%   45.3%   41.4%   48.8%   47.3%   52.1%   49.0%   41.2%   33.9%
   #tRNA_antisense   0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #
   #piRNA_sense  0.6%    0.5%    0.1%    0.4%    0.3%    0.4%    0.5%    0.4%    0.4%    0.5%    0.4%    0.6%
   #piRNA_antisense  0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #
   #gencode_sense    7.0%    8.5%    6.7%    8.6%    15.7%   16.6%   10.8%   12.9%   11.2%   10.8%   8.5%    18.3%
   #gencode_antisense    0.1%    0.1%    0.7%    0.3%    0.2%    0.3%    0.2%    0.2%    0.2%    0.2%    0.2%    0.3%
   #gencode  7.10%   8.60%   7.40%   8.90%   15.90%  16.90%  11.00%  13.10%  11.40%  11.00%  8.70%   18.60%
   #
   #circularRNA_sense    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #circularRNA_antisense    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%    0.0%
   #
   #not_mapped_to_genome_or_libs 2.1%    1.7%    78.7%   55.1%   18.6%   21.7%   21.5%   20.7%   26.7%   30.8%   34.4%   28.1%

   import pandas as pd
   import numpy as np
   import seaborn as sns
   import matplotlib.pyplot as plt

   # Define data
   samples = [
       "control MKL1", "control WaGa", "WaGa wildtype 0505", "WaGa wildtype 1905",
       "WaGa sT DMSO 0505", "WaGa sT DMSO 1905", "WaGa sT Dox 0505", "WaGa sT Dox 1905",
       "WaGa scr DMSO 0505", "WaGa scr DMSO 1905", "WaGa scr Dox 0505", "WaGa scr Dox 1905"
   ]

   #TODO_2: genome --> human_genome, not_mapped_to_genome_or_libs --> not_mapped_to_human_genome
   #        send the new results including exogenous alignments to Ute!
   #categories = [
   #    "reads_used_for_alignment", "genome", "miRNA", "miRNAprecursor", "tRNA", "piRNA",
   #    "gencode", "circularRNA", "not_mapped_to_genome_or_libs"
   #]
   categories = [
       "reads_used_for_alignment", "human_genome", "miRNA", "miRNAprecursor", "tRNA", "piRNA",
       "gencode", "circularRNA", "not_mapped_to_human_genome"
   ]

   data = np.array([
       [100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0],
       [97.9, 98.3, 44.9, 21.3, 65.6, 71.9, 78.5, 81.4, 73.3, 79.3, 69.2, 78.3],
       [84.7, 85.6, 7.1, 3.5, 12.9, 14.6, 15.8, 16.2, 7.5, 15.3, 7.0, 14.7],
       [0.1, 0.1, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.1, 0.0, 0.1],
       [3.4, 1.8, 25.3, 8.4, 41.2, 33.9, 48.8, 45.3, 52.1, 47.3, 49.0, 41.4],
       [0.6, 0.5, 0.4, 0.1, 0.4, 0.6, 0.5, 0.3, 0.4, 0.4, 0.5, 0.4],
       [7.1, 8.6, 8.9, 7.4, 8.7, 18.6, 11.0, 15.9, 11.4, 13.1, 11.0, 16.9],
       [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
       [2.1, 1.7, 55.1, 78.7, 34.4, 28.1, 21.5, 18.6, 26.7, 20.7, 30.8, 21.7]
   ])

   ## Load data from Excel file
   #file_path = "mapping_heatmap.xlsx"
   #
   ## Read Excel file, assuming first column is index (row labels)
   #df = pd.read_excel(file_path, index_col=0)

   # Convert percentages to decimals
   data = data / 100.0

   # Create DataFrame
   df = pd.DataFrame(data, index=categories, columns=samples)

   # Plot heatmap
   plt.figure(figsize=(14, 6))
   sns.heatmap(df, annot=True, cmap="coolwarm", fmt=".3f", linewidths=0.5, cbar_kws={'label': 'Fraction Aligned Reads'})

   # Improve layout
   plt.title("Heatmap of Read Alignments by Category and Sample", fontsize=14)
   plt.xlabel("Sample", fontsize=12)
   plt.ylabel("Read Category", fontsize=12)
   plt.xticks(rotation=15, ha="right", fontsize=10)
   plt.yticks(rotation=0, fontsize=10)
   plt.tight_layout()

   # Save as PNG
   plt.savefig("mapping_heatmap.png", dpi=300, bbox_inches="tight")

   # Show plot
   plt.show()

Key steps of log: This log details the execution of a small RNA sequencing data analysis pipeline using the exceRpt tool (version 4.6.3) in a Docker container. The pipeline processes a human small RNA-seq dataset (testData_human.fastq.gz) with the following key steps:
- Initial Setup
  - Docker container launched with mounted volumes for input/output and reference databases.
  - Parameters: hg38 genome, 50 threads, 200GB Java memory, exogenous mapping enabled.
  - Docker container launched with input/output volume mounts
  - 50 threads allocated with 200GB Java memory
  - hg38 reference genome specified
- Preprocessing
  - Adapter detection and trimming using known adapter sequences.
  - Quality filtering (Phred score ≥20, length ≥18nt).
  - Removal of homopolymer-rich reads and low-quality sequences.
  - Input FASTQ file decompressed (testData_human.fastq.gz)
  - Adapter sequences identified using adapters.fa
  - Quality encoding determined (Phred+33/64)
  - Adapter clipping performed (TCGTATGCCGTCTTCTGCTTG)
  - Quality filtering (Q20, p<80%)
  - Homopolymer repeats filtered (max 66% single nt)
- Contaminant Filtering
  - Alignment against UniVec contaminants and ribosomal RNA (rRNA) databases.
  - 322 reads processed, with statistics tracked at each step.
- Endogenous RNA Analysis
  - Alignment to human genome (hg38) and transcriptome.
  - Quantification of small RNA types:
    - miRNA (mature/precursor): Sense strands detected (antisense absent).
    - tRNA, piRNA, gencode transcripts: Only sense strands reported.
    - circRNA: Not detected in this dataset.
  - Coverage and complexity metrics calculated.
- Exogenous RNA Analysis
  - Screened for microbial/viral RNAs:
    - miRNA databases (miRBase).
    - Ribosomal RNA databases.
    - Comprehensive genomic databases (bacteria, plants, metazoa, fungi, viruses).
  - Taxonomic classification of exogenous hits performed.
- QC & Results
  - QC Result: PASS (based on transcriptome/genome ratio >0.5 and >100k transcriptome reads).
  - Key Metrics:
    - Input Reads: ~1.5 million (exact count not shown in log).
    - Genome Mapped: Majority of reads.
    - Transcriptome Complexity: Calculated ratio.
  - Core results compressed into testData_human.fastq_CORE_RESULTS_v4.6.3.tgz.
- Notable Observations:
  - Antisense Reads: Absent for miRNA, tRNA, and piRNA (common in small RNA-seq).
  - Potential Issues: Some files (e.g., antisense counts) were missing but did not disrupt pipeline.
  - Resource Usage: High RAM (200GB) and multi-threading (50 cores) employed for efficiency.
- Output Files:
  - Quantified counts for endogenous RNAs (miRNA, tRNA, etc.).
  - Exogenous RNA alignments with taxonomic annotations.
  - QC report, adapter sequences, and alignment statistics.

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Comprehensive smallRNA-7 profiling using exceRpt pipeline with full reference databases (v2)

Leave a Reply Cancel reply