RNAseq running with umi_tools

  1. install conda environment

     #conda config --set auto_activate_base false
    
     conda create --name rnaseq python=3.7
    
     #NOTE: mamba 确实快多了,以后都用 mamba❕
     #install packages
     conda activate rnaseq
     pip3 install deeptools
     pip3 install multiqc
     conda install -c bioconda stringtie subread gffread
     conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
     conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
     conda install nextflow=23.04
    
     conda install fq
     conda install -c bioconda umi_tools
     conda install -c bioconda rsem
     conda install -c bioconda salmon
    
     #conda install some tools
     #install R-packages, 
     conda install -c bioconda ucsc-bedclip
     conda install -c bioconda ucsc-bedgraphtobigwig
     conda install -c bioconda bioconductor-matrixgenerics
     #conda install -c bioconda bioconductor-deseq2
     conda install -c bioconda r-pheatmap
     conda install -c anaconda gawk
    
     conda install mamba -n base -c conda-forge
     conda config --add channels conda-forge
     mamba install -c bioconda salmon=1.10
     #salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
    
     conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
     mamba install -c bioconda samtools=1.9  
     mamba install -c conda-forge r-optparse r-vctrs=0.5.0
     conda install nextflow=23.04
     mamba install -c bioconda qualimap
     mamba install -c bioconda rseqc
     mamba install -c conda-forge openssl
     conda install -c bioconda ucsc-bedclip
     conda install -c bioconda bedtools
     conda update -c bioconda ucsc-bedclip
     #for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
     conda update -c bioconda ucsc-bedgraphtobigwig
     # samtools should be >= 1.9 as only those have the option @
     #samtools sort \
     #      -@ 6 \
     #      -o HSV.d2_r1.sorted.bam \
     #      -T HSV.d2_r1.sorted \
     #      HSV.d2_r1.Aligned.out.bam
  2. run UMItools without –umitools_dedup_stats, otherwise it cannot be finished in hamm.

    • Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (–edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.

    • Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.

        #https://github.com/nf-core/rnaseq/issues/827
        #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
        #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
        #https://github.com/CGATOxford/UMI-tools/issues/173
        # excessive dedup memory usage with output-stats #409 
        #https://github.com/CGATOxford/UMI-tools/issues/409
        #umi_tools 1.0.1
        #I am aware of previously closed issues:
        #excessive dedup memory usage #173
        #speed up stats #184
        #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
        #Using --method=unique
        #There still seems to be a memory problem with --output-stats
        #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
        #Running without output-stats, job completes in about 3 minutes, with no problems.
      
        #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
        #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100
      
        #umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
        #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.
      
        #--Human--
        #hamm
        /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P
      .{12}).*” -profile docker -resume –max_cpus 54 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ –umitools_grouping_method ‘unique’ #sage nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ #–Virus– /usr/local/bin/nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_virus –fasta “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta” –gtf “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf” –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” –umitools_dedup_stats –skip_rseqc –skip_dupradar –skip_preseq -profile test_full -resume –max_cpus 55 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘hisat2’ –gtf_extra_attributes ‘gene_name’ –gtf_group_features ‘gene_id’ –featurecounts_group_type ‘gene_name’ –featurecounts_feature_type ‘exon’ –umitools_grouping_method ‘unique’
  3. R-code for evaluation of nextflow outputs

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
    
     library(tximport)
     library(DESeq2)
    
     setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")
    
     # Define paths to your Salmon output quantification files
     files <- c("control_r1" = "./control_r1/quant.sf",
               "control_r2" = "./control_r2/quant.sf",
               "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
               "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
               "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
               "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
               "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
               "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
               "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
               "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
    
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    
     # Define the replicates and condition of the samples
     replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
     condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    
     # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
     # Filter data to retain only genes with more than 2 counts > 3 across all samples
     # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    
     # Run DESeq2
     dds <- DESeq(dds)
    
     # Perform rlog transformation
     rld <- rlogTransformation(dds)
    
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
    
     # -- gene-level count data --
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
    
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
     dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    
     #TODO: why a lot of reads were removed due to the too_short?
     #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    
     dim(counts(dds))
     head(counts(dds), 10)

Leave a Reply

Your email address will not be published. Required fields are marked *