RNAseq running with umi_tools

install conda environment

#conda config --set auto_activate_base false

conda create --name rnaseq python=3.7

#NOTE: mamba 确实快多了,以后都用 mamba❕
#install packages
conda activate rnaseq
pip3 install deeptools
pip3 install multiqc
conda install -c bioconda stringtie subread gffread
conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
conda install nextflow=23.04

conda install fq
conda install -c bioconda umi_tools
conda install -c bioconda rsem
conda install -c bioconda salmon

#conda install some tools
#install R-packages, 
conda install -c bioconda ucsc-bedclip
conda install -c bioconda ucsc-bedgraphtobigwig
conda install -c bioconda bioconductor-matrixgenerics
#conda install -c bioconda bioconductor-deseq2
conda install -c bioconda r-pheatmap
conda install -c anaconda gawk

conda install mamba -n base -c conda-forge
conda config --add channels conda-forge
mamba install -c bioconda salmon=1.10
#salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.

conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
mamba install -c bioconda samtools=1.9  
mamba install -c conda-forge r-optparse r-vctrs=0.5.0
conda install nextflow=23.04
mamba install -c bioconda qualimap
mamba install -c bioconda rseqc
mamba install -c conda-forge openssl
conda install -c bioconda ucsc-bedclip
conda install -c bioconda bedtools
conda update -c bioconda ucsc-bedclip
#for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
conda update -c bioconda ucsc-bedgraphtobigwig
# samtools should be >= 1.9 as only those have the option @
#samtools sort \
#      -@ 6 \
#      -o HSV.d2_r1.sorted.bam \
#      -T HSV.d2_r1.sorted \
#      HSV.d2_r1.Aligned.out.bam

run UMItools without –umitools_dedup_stats, otherwise it cannot be finished in hamm.
- Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (–edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.
- Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.
```
#https://github.com/nf-core/rnaseq/issues/827
#INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
#INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
#https://github.com/CGATOxford/UMI-tools/issues/173
# excessive dedup memory usage with output-stats #409 
#https://github.com/CGATOxford/UMI-tools/issues/409
#umi_tools 1.0.1
#I am aware of previously closed issues:
#excessive dedup memory usage #173
#speed up stats #184
#Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
#Using --method=unique
#There still seems to be a memory problem with --output-stats
#Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
#Running without output-stats, job completes in about 3 minutes, with no problems.

#TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 
#/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100

#umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
#I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.

#--Human--
#hamm
/usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38   --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P
```
  .{12}).*” -profile docker -resume –max_cpus 54 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ –umitools_grouping_method ‘unique’ #sage nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ #–Virus– /usr/local/bin/nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_virus –fasta “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta” –gtf “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf” –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” –umitools_dedup_stats –skip_rseqc –skip_dupradar –skip_preseq -profile test_full -resume –max_cpus 55 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘hisat2’ –gtf_extra_attributes ‘gene_name’ –gtf_group_features ‘gene_id’ –featurecounts_group_type ‘gene_name’ –featurecounts_feature_type ‘exon’ –umitools_grouping_method ‘unique’

R-code for evaluation of nextflow outputs

# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)

library(tximport)
library(DESeq2)

setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")

# Define paths to your Salmon output quantification files
files <- c("control_r1" = "./control_r1/quant.sf",
          "control_r2" = "./control_r2/quant.sf",
          "HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
          "HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
          "HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
          "HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
          "HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
          "HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
          "HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
          "HSV.d8_r2" = "./HSV.d8_r2/quant.sf")

# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)

# Define the replicates and condition of the samples
replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))

# Define the colData for DESeq2
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))

# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)

# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
# dds <- dds[rowSums(counts(dds) > 3) > 2, ]

# Run DESeq2
dds <- DESeq(dds)

# Perform rlog transformation
rld <- rlogTransformation(dds)

# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")

# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)

# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")

# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]

# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)

# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")

#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output

dim(counts(dds))
head(counts(dds), 10)

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Leave a Reply Cancel reply