RNAseq running with umi_tools

gene_x 0 like s 1075 view s

Tags: processing, pipeline, RNA-seq

  1. install conda environment

    #conda config --set auto_activate_base false
    
    conda create --name rnaseq python=3.7
    
    #NOTE: mamba 确实快多了,以后都用 mamba❕
    #install packages
    conda activate rnaseq
    pip3 install deeptools
    pip3 install multiqc
    conda install -c bioconda stringtie subread gffread
    conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
    conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
    conda install nextflow=23.04
    
    conda install fq
    conda install -c bioconda umi_tools
    conda install -c bioconda rsem
    conda install -c bioconda salmon
    
    #conda install some tools
    #install R-packages, 
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda ucsc-bedgraphtobigwig
    conda install -c bioconda bioconductor-matrixgenerics
    #conda install -c bioconda bioconductor-deseq2
    conda install -c bioconda r-pheatmap
    conda install -c anaconda gawk
    
    conda install mamba -n base -c conda-forge
    conda config --add channels conda-forge
    mamba install -c bioconda salmon=1.10
    #salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
    
    conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
    mamba install -c bioconda samtools=1.9  
    mamba install -c conda-forge r-optparse r-vctrs=0.5.0
    conda install nextflow=23.04
    mamba install -c bioconda qualimap
    mamba install -c bioconda rseqc
    mamba install -c conda-forge openssl
    conda install -c bioconda ucsc-bedclip
    conda install -c bioconda bedtools
    conda update -c bioconda ucsc-bedclip
    #for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
    conda update -c bioconda ucsc-bedgraphtobigwig
    # samtools should be >= 1.9 as only those have the option @
    #samtools sort \
    #      -@ 6 \
    #      -o HSV.d2_r1.sorted.bam \
    #      -T HSV.d2_r1.sorted \
    #      HSV.d2_r1.Aligned.out.bam
    
  2. run UMItools without --umitools_dedup_stats, otherwise it cannot be finished in hamm.

    • Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (--edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.

    • Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.

      #https://github.com/nf-core/rnaseq/issues/827
      #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
      #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
      #https://github.com/CGATOxford/UMI-tools/issues/173
      # excessive dedup memory usage with output-stats #409 
      #https://github.com/CGATOxford/UMI-tools/issues/409
      #umi_tools 1.0.1
      #I am aware of previously closed issues:
      #excessive dedup memory usage #173
      #speed up stats #184
      #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
      #Using --method=unique
      #There still seems to be a memory problem with --output-stats
      #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
      #Running without output-stats, job completes in about 3 minutes, with no problems.
      
      #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
      #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100
      
      #umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
      #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.
      
      #--Human--
      #hamm
      /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"         -profile docker     -resume  --max_cpus 54 --max_memory 120.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon' --pseudo_aligner 'salmon'    --umitools_grouping_method 'unique'
      
      #sage
      nextflow run rnaseq/main.nf                --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*"         -profile test_full  -resume                --max_memory 256.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'star_salmon' --pseudo_aligner 'salmon'
      
      #--Virus--
      /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus    --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf"   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats    --skip_rseqc --skip_dupradar --skip_preseq    -profile test_full -resume  --max_cpus 55 --max_memory 120.GB --max_time 2400.h    --save_align_intermeds --save_unaligned --save_reference    --aligner 'hisat2'    --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon'    --umitools_grouping_method 'unique'
      
  3. R-code for evaluation of nextflow outputs

    # Import the required libraries
    library("AnnotationDbi")
    library("clusterProfiler")
    library("ReactomePA")
    library(gplots)
    
    library(tximport)
    library(DESeq2)
    
    setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")
    
    # Define paths to your Salmon output quantification files
    files <- c("control_r1" = "./control_r1/quant.sf",
              "control_r2" = "./control_r2/quant.sf",
              "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
              "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
              "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
              "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
              "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
              "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
              "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
              "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
    
    # Import the transcript abundance data with tximport
    txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    
    # Define the replicates and condition of the samples
    replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
    condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
    
    # Define the colData for DESeq2
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    
    # Create DESeqDataSet object
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    
    # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
    # Filter data to retain only genes with more than 2 counts > 3 across all samples
    # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    
    # Run DESeq2
    dds <- DESeq(dds)
    
    # Perform rlog transformation
    rld <- rlogTransformation(dds)
    
    # Output raw count data to a CSV file
    write.csv(counts(dds), file="transcript_counts.csv")
    
    # -- gene-level count data --
    # Read in the tx2gene map from salmon_tx2gene.tsv
    #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
    tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    
    # Set the column names
    colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
    # Remove the gene_name column if not needed
    tx2gene <- tx2gene[,1:2]
    
    # Import and summarize the Salmon data with tximport
    txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    
    # Continue with the DESeq2 workflow as before...
    colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
    dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
    dds <- DESeq(dds)
    rld <- rlogTransformation(dds)
    write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    
    #TODO: why a lot of reads were removed due to the too_short?
    #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    
    dim(counts(dds))
    head(counts(dds), 10)
    

like unlike

点赞本文的读者

还没有人对此文章表态


本文有评论

没有评论

看文章,发评论,不要沉默


© 2023 XGenes.com Impressum