gene_x 0 like s 912 view s
Tags: processing, pipeline, RNA-seq
install conda environment
#conda config --set auto_activate_base false
conda create --name rnaseq python=3.7
#NOTE: mamba 确实快多了,以后都用 mamba❕
#install packages
conda activate rnaseq
pip3 install deeptools
pip3 install multiqc
conda install -c bioconda stringtie subread gffread
conda install -c conda-forge -c bioconda -c defaults -c r r-data.table r-gplots
conda install -c conda-forge -c bioconda -c defaults -c r bioconductor-dupradar bioconductor-edger
conda install nextflow=23.04
conda install fq
conda install -c bioconda umi_tools
conda install -c bioconda rsem
conda install -c bioconda salmon
#conda install some tools
#install R-packages,
conda install -c bioconda ucsc-bedclip
conda install -c bioconda ucsc-bedgraphtobigwig
conda install -c bioconda bioconductor-matrixgenerics
#conda install -c bioconda bioconductor-deseq2
conda install -c bioconda r-pheatmap
conda install -c anaconda gawk
conda install mamba -n base -c conda-forge
conda config --add channels conda-forge
mamba install -c bioconda salmon=1.10
#salmon should be >= 1.10 since in those version salmon set `--validateMappings` as default.
conda install -c bioconda trim-galore star=2.6.1d bioconductor-summarizedexperiment bioconductor-tximport bioconductor-tximeta bioconductor-deseq2
mamba install -c bioconda samtools=1.9
mamba install -c conda-forge r-optparse r-vctrs=0.5.0
conda install nextflow=23.04
mamba install -c bioconda qualimap
mamba install -c bioconda rseqc
mamba install -c conda-forge openssl
conda install -c bioconda ucsc-bedclip
conda install -c bioconda bedtools
conda update -c bioconda ucsc-bedclip
#for DEBUG: bedClip: error while loading shared libraries: libssl.so.1.0.0: cannot open shared object file: No such file or directory
conda update -c bioconda ucsc-bedgraphtobigwig
# samtools should be >= 1.9 as only those have the option @
#samtools sort \
# -@ 6 \
# -o HSV.d2_r1.sorted.bam \
# -T HSV.d2_r1.sorted \
# HSV.d2_r1.Aligned.out.bam
run UMItools without --umitools_dedup_stats, otherwise it cannot be finished in hamm.
Optimize UMItools parameters: Some parameters might influence the memory usage of UMItools. For example, you can try to reduce the number of allowed mismatches in the UMI sequence (--edit-distance-threshold). This will make the deduplication process less memory intensive but might also impact the results.
Use other deduplication tools: If the problem persists, you might need to use alternative tools for UMI deduplication which are less memory-intensive. Tools such as fgbio have a grouping and deduplication method similar to UMItools but have been reported to require less memory.
#https://github.com/nf-core/rnaseq/issues/827
#INFO for DEBUG: https://umi-tools.readthedocs.io/en/latest/faq.html
#INFO for DEBUG: https://readthedocs.org/projects/umi-tools/downloads/pdf/stable/
#https://github.com/CGATOxford/UMI-tools/issues/173
# excessive dedup memory usage with output-stats #409
#https://github.com/CGATOxford/UMI-tools/issues/409
#umi_tools 1.0.1
#I am aware of previously closed issues:
#excessive dedup memory usage #173
#speed up stats #184
#Running a single-end bam file with 3.13M reads and a 10bp (fully random) UMI.
#Using --method=unique
#There still seems to be a memory problem with --output-stats
#Running with output-stats, memory usage climbs over 100GB and eventually crashes with "MemoryError".
#Running without output-stats, job completes in about 3 minutes, with no problems.
#TRY STANDALONE RUNNING: /usr/local/bin/python /usr/local/bin/umi_tools dedup -I HSV.d8_r1.transcriptome.sorted.bam -S HSV.d8_r1.umi_dedup.transcriptome.sorted.bam --method=unique --random-seed=100
#/home/jhuang/miniconda3/envs/rnaseq/bin/python /home/jhuang/miniconda3/envs/rnaseq/bin/umi_tools dedup -I star_salmon/HSV.d8_r1.sorted.bam -S HSV.d8_r1.umi_dedup.sorted.bam --output-stats HSV.d8_r1.umi_dedup.sorted --method=unique --random-seed=100
#umitools dedup uses large amounts of memory and runs slowly. To speed it up it is recommended to only run it on a single chromosome, see the FAQ point number 4.
#I suggest either making the --output-stats optional, or running a second round of deduplication on a single chromosome to generate the output stats.
#--Human--
#hamm
/usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile docker -resume --max_cpus 54 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --pseudo_aligner 'salmon' --umitools_grouping_method 'unique'
#sage
nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'star_salmon' --pseudo_aligner 'salmon'
#--Virus--
/usr/local/bin/nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_virus --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_v4.gtf" --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P<umi_1>.{12}).*" --umitools_dedup_stats --skip_rseqc --skip_dupradar --skip_preseq -profile test_full -resume --max_cpus 55 --max_memory 120.GB --max_time 2400.h --save_align_intermeds --save_unaligned --save_reference --aligner 'hisat2' --gtf_extra_attributes 'gene_name' --gtf_group_features 'gene_id' --featurecounts_group_type 'gene_name' --featurecounts_feature_type 'exon' --umitools_grouping_method 'unique'
R-code for evaluation of nextflow outputs
# Import the required libraries
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library(gplots)
library(tximport)
library(DESeq2)
setwd("~/DATA/Data_Manja_RNAseq_Organoids/results_GRCh38_unique/star_salmon")
# Define paths to your Salmon output quantification files
files <- c("control_r1" = "./control_r1/quant.sf",
"control_r2" = "./control_r2/quant.sf",
"HSV.d2_r1" = "./HSV.d2_r1/quant.sf",
"HSV.d2_r2" = "./HSV.d2_r2/quant.sf",
"HSV.d4_r1" = "./HSV.d4_r1/quant.sf",
"HSV.d4_r2" = "./HSV.d4_r2/quant.sf",
"HSV.d6_r1" = "./HSV.d6_r1/quant.sf",
"HSV.d6_r2" = "./HSV.d6_r2/quant.sf",
"HSV.d8_r1" = "./HSV.d8_r1/quant.sf",
"HSV.d8_r2" = "./HSV.d8_r2/quant.sf")
# Import the transcript abundance data with tximport
txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
# Define the replicates and condition of the samples
replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8"))
# Define the colData for DESeq2
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
# Create DESeqDataSet object
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
# In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
# Filter data to retain only genes with more than 2 counts > 3 across all samples
# dds <- dds[rowSums(counts(dds) > 3) > 2, ]
# Run DESeq2
dds <- DESeq(dds)
# Perform rlog transformation
rld <- rlogTransformation(dds)
# Output raw count data to a CSV file
write.csv(counts(dds), file="transcript_counts.csv")
# -- gene-level count data --
# Read in the tx2gene map from salmon_tx2gene.tsv
#tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
# Set the column names
colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
# Remove the gene_name column if not needed
tx2gene <- tx2gene[,1:2]
# Import and summarize the Salmon data with tximport
txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
# Continue with the DESeq2 workflow as before...
colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files))
dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
#dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543
dds <- DESeq(dds)
rld <- rlogTransformation(dds)
write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
#TODO: why a lot of reads were removed due to the too_short?
#STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
dim(counts(dds))
head(counts(dds), 10)
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq 2024 Ute from raw counts
RNA-seq data analysis of Yersinia on GRCh38
Variant Calling for Herpes Simplex Virus 1 from Patient Sample Using Capture Probe Sequencing
© 2023 XGenes.com Impressum