-
install conda environment
#conda config --set auto_activate_base false conda create --name rnaseq python=3.7 #NOTE: mamba 确实快多了,以后都用 mamba❕ #install packages conda activate rnaseq pip3 install deeptools pip3 install multiqc conda install -c bioconda stringtie subread gffread conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger conda install nextflow=23.04 conda install fq conda install -c bioconda umi_tools conda install -c bioconda rsem conda install -c bioconda salmon #conda install some tools #install R-packages, conda install -c bioconda ucsc-bedclip conda install -c bioconda ucsc-bedgraphtobigwig conda install -c bioconda bioconductor-matrixgenerics #conda install -c bioconda bioconductor-deseq2 conda install -c bioconda r-pheatmap conda install -c anaconda gawk conda install mamba -n base -c conda-forge conda config --add channels conda-forge mamba install -c bioconda salmon=1.10 #salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default. conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2 mamba install -c bioconda samtools=1.9 mamba install -c conda-forge r-optparse r-vctrs=0.5.0 conda install nextflow=23.04 mamba install -c bioconda qualimap mamba install -c bioconda rseqc mamba install -c conda-forge openssl conda install -c bioconda ucsc-bedclip conda install -c bioconda bedtools conda update -c bioconda ucsc-bedclip #for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory conda update -c bioconda ucsc-bedgraphtobigwig # samtools should be >= 1.9 as only those have the option @ #samtools sort \ # -@ 6 \ # -o HSV.d2_r1.sorted.bam \ # -T HSV.d2_r1.sorted \ # HSV.d2_r1.Aligned.out.bam
-
run UMItools without –umitools_dedup_stats, otherwise it cannot be finished in hamm.
-
Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (–edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.
-
Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.
#https://github.com/nf-core/rnaseq/issues/827 #INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html #INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/ #https://github.com/CGATOxford/UMI-tools/issues/173 # excessive dedup memory usage with output-stats #409 #https://github.com/CGATOxford/UMI-tools/issues/409 #umi_tools 1.0.1 #I am aware of previously closed issues: #excessive dedup memory usage #173 #speed up stats #184 #Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI. #Using --method=unique #There still seems to be a memory problem with --output-stats #Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError". #Running without output-stats, job completes in about 3 minutes, with no problems. #TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100 #/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100 #umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4. #I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats. #--Human-- #hamm /usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P
.{12}).*” -profile docker -resume –max_cpus 54 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ –umitools_grouping_method ‘unique’ #sage nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ #–Virus– /usr/local/bin/nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_virus –fasta “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta” –gtf “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf” –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” –umitools_dedup_stats –skip_rseqc –skip_dupradar –skip_preseq -profile test_full -resume –max_cpus 55 –max_memory 120.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘hisat2’ –gtf_extra_attributes ‘gene_name’ –gtf_group_features ‘gene_id’ –featurecounts_group_type ‘gene_name’ –featurecounts_feature_type ‘exon’ –umitools_grouping_method ‘unique’
-
-
R-code for evaluation of nextflow outputs
# Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon") # Define paths to your Salmon output quantification files files <- c("control_r1" = "./control_r1/quant.sf", "control_r2" = "./control_r2/quant.sf", "HSV.d2_r1" = "./HSV.d2_r1/quant.sf", "HSV.d2_r2" = "./HSV.d2_r2/quant.sf", "HSV.d4_r1" = "./HSV.d4_r1/quant.sf", "HSV.d4_r2" = "./HSV.d4_r2/quant.sf", "HSV.d6_r1" = "./HSV.d6_r1/quant.sf", "HSV.d6_r2" = "./HSV.d6_r2/quant.sf", "HSV.d8_r1" = "./HSV.d8_r1/quant.sf", "HSV.d8_r2" = "./HSV.d8_r2/quant.sf") # Import the transcript abundance data with tximport txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE) # Define the replicates and condition of the samples replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2")) condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8")) # Define the colData for DESeq2 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) # Create DESeqDataSet object dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps. # Filter data to retain only genes with more than 2 counts > 3 across all samples # dds <- dds[rowSums(counts(dds) > 3) > 2, ] # Run DESeq2 dds <- DESeq(dds) # Perform rlog transformation rld <- rlogTransformation(dds) # Output raw count data to a CSV file write.csv(counts(dds), file="transcript_counts.csv") # -- gene-level count data -- # Read in the tx2gene map from salmon_tx2gene.tsv #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE) tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE) # Set the column names colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name") # Remove the gene_name column if not needed tx2gene <- tx2gene[,1:2] # Import and summarize the Salmon data with tximport txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE) # Continue with the DESeq2 workflow as before... colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543 dds <- DESeq(dds) rld <- rlogTransformation(dds) write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv") #TODO: why a lot of reads were removed due to the too_short? #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output dim(counts(dds)) head(counts(dds), 10)
RNAseq running with umi_tools
Leave a reply