Processing Data_Julia_RNASeq_SARS-CoV-2

  1. Preparing the directory trimmed. Note that the extension of input files is fastq.gz, the extension of output file is fq.gz.

     mkdir trimmed trimmed_unpaired;
    
     for sample_id in 01_PBS_1 02_PBS_2 03_PBS_3 04_rluc_1 05_rluc_2 06_rluc_3 07_ralpha_1 08_ralpha_2 09_ralpha_3 10_rBA2_1 11_rBA2_2 12_rBA2_3 13_rBA5_1 14_rBA5_2 15_rBA5_3 16_rdelta_1 17_rdelta_2 18_rdelta_3 19_rpirola_1 20_rpirola_2 21_rpirola_3; do
             java -jar /home/jhuang/Tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 100 20250423_AV243904_0006_B/${sample_id}/${sample_id}_R1.fastq.gz 20250423_AV243904_0006_B/${sample_id}/${sample_id}_R2.fastq.gz trimmed/${sample_id}_R1.fq.gz trimmed_unpaired/${sample_id}_R1.fq.gz trimmed/${sample_id}_R2.fq.gz trimmed_unpaired/${sample_id}_R2.fq.gz ILLUMINACLIP:/home/jhuang/Tools/Trimmomatic-0.36/adapters/TruSeq3-PE-2.fa:2:30:10:8:TRUE LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 AVGQUAL:20; done 2> trimmomatic_pe.log;
     done
  2. Preparing samplesheet.csv

     sample,fastq_1,fastq_2,strandedness
     PBS_1,01_PBS_1_R1.fq.gz,01_PBS_1_R2.fq.gz,auto
     PBS_2,02_PBS_2_R1.fq.gz,02_PBS_2_R2.fq.gz,auto
     PBS_3,03_PBS_3_R1.fq.gz,03_PBS_3_R2.fq.gz,auto
     rLUC_1,04_rluc_1_R1.fq.gz,04_rluc_1_R2.fq.gz,auto
     rLUC_2,05_rluc_2_R1.fq.gz,05_rluc_2_R2.fq.gz,auto
     rLUC_3,06_rluc_3_R1.fq.gz,06_rluc_3_R2.fq.gz,auto
     rAlpha-N_1,07_ralpha_1_R1.fq.gz,07_ralpha_1_R2.fq.gz,auto
     rAlpha-N_2,08_ralpha_2_R1.fq.gz,08_ralpha_2_R2.fq.gz,auto
     rAlpha-N_3,09_ralpha_3_R1.fq.gz,09_ralpha_3_R2.fq.gz,auto
     rBA.2-N_1,10_rBA2_1_R1.fq.gz,10_rBA2_1_R2.fq.gz,auto
     rBA.2-N_2,11_rBA2_2_R1.fq.gz,11_rBA2_2_R2.fq.gz,auto
     rBA.2-N_3,12_rBA2_3_R1.fq.gz,12_rBA2_3_R2.fq.gz,auto
     rBA.5-N_1,13_rBA5_1_R1.fq.gz,13_rBA5_1_R2.fq.gz,auto
     rBA.5-N_2,14_rBA5_2_R1.fq.gz,14_rBA5_2_R2.fq.gz,auto
     rBA.5-N_3,15_rBA5_3_R1.fq.gz,15_rBA5_3_R2.fq.gz,auto
     rDelta-N_1,16_rdelta_1_R1.fq.gz,16_rdelta_1_R2.fq.gz,auto
     rDelta-N_2,17_rdelta_2_R1.fq.gz,17_rdelta_2_R2.fq.gz,auto
     rDelta-N_3,18_rdelta_3_R1.fq.gz,18_rdelta_3_R2.fq.gz,auto
     rPirola-N_1,19_rpirola_1_R1.fq.gz,19_rpirola_1_R2.fq.gz,auto
     rPirola-N_2,20_rpirola_2_R1.fq.gz,20_rpirola_2_R2.fq.gz,auto
     rPirola-N_3,21_rpirola_3_R1.fq.gz,21_rpirola_3_R2.fq.gz,auto
    
     mv trimmed/* .
  3. nextflow run

     'GRCh38' {
         fasta       = "/home/jhuang/Homo_sapiens/Ensembl/GRCh38/Sequence/WholeGenomeFasta/genome.fa"
         bwa         = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh38/Sequence/BWAIndex/version0.6.0/"
         bowtie2     = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh38/Sequence/Bowtie2Index/"
         star        = "/home/jhuang/Homo_sapiens/Ensembl/GRCh38/Sequence/STARIndex/"
         bismark     = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh38/Sequence/BismarkIndex/"
         gtf         = "/home/jhuang/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf"
         bed12       = "/home/jhuang/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.bed"
         mito_name   = "chrM"
         macs_gsize  = "2.7e9"
         blacklist   = "/home/jhuang/Homo_sapiens/UCSC/hg38/blacklists/hg38-blacklist.bed"
     }
    
     #Example1: http://xgenes.com/article/article-content/157/prepare-virus-gtf-for-nextflow-run/
     #docker pull nfcore/rnaseq
     ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq
    
     # ---- SUCCESSFUL with directly downloaded gff3 and fasta from NCBI using docker after replacing 'CDS' with 'exon' ----
     #For Tam's bacteria: --fasta "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040.fasta" --gff "/home/jhuang/DATA/Data_Tam_RNAseq_2024_AUM_MHB_Urine/CP059040_m.gff"                    --gtf_group_features 'gene_id'  --gtf_extra_attributes 'gene_name' --featurecounts_group_type 'gene_biotype' --featurecounts_feature_type 'transcript'
     (host_env) /usr/local/bin/nextflow run rnaseq/main.nf -profile docker --input samplesheet.csv --genome GRCh38 --aligner 'star_salmon' --outdir results   --max_cpus 60 --max_memory 600.GB --max_time 2400.h   -resume
     #
     #--save_align_intermeds --save_unaligned --save_reference
  4. samplesheet.csv

     sample,fastq_1,fastq_2,strandedness
     PBS_1,01_PBS_1_R1.fq.gz,01_PBS_1_R2.fq.gz,auto
     PBS_2,02_PBS_2_R1.fq.gz,02_PBS_2_R2.fq.gz,auto
     PBS_3,03_PBS_3_R1.fq.gz,03_PBS_3_R2.fq.gz,auto
     rLUC_1,04_rluc_1_R1.fq.gz,04_rluc_1_R2.fq.gz,auto
     rLUC_2,05_rluc_2_R1.fq.gz,05_rluc_2_R2.fq.gz,auto
     rLUC_3,06_rluc_3_R1.fq.gz,06_rluc_3_R2.fq.gz,auto
     rAlpha-N_1,07_ralpha_1_R1.fq.gz,07_ralpha_1_R2.fq.gz,auto
     rAlpha-N_2,08_ralpha_2_R1.fq.gz,08_ralpha_2_R2.fq.gz,auto
     rAlpha-N_3,09_ralpha_3_R1.fq.gz,09_ralpha_3_R2.fq.gz,auto
     rBA.2-N_1,10_rBA2_1_R1.fq.gz,10_rBA2_1_R2.fq.gz,auto
     rBA.2-N_2,11_rBA2_2_R1.fq.gz,11_rBA2_2_R2.fq.gz,auto
     rBA.2-N_3,12_rBA2_3_R1.fq.gz,12_rBA2_3_R2.fq.gz,auto
     rBA.5-N_1,13_rBA5_1_R1.fq.gz,13_rBA5_1_R2.fq.gz,auto
     rBA.5-N_2,14_rBA5_2_R1.fq.gz,14_rBA5_2_R2.fq.gz,auto
     rBA.5-N_3,15_rBA5_3_R1.fq.gz,15_rBA5_3_R2.fq.gz,auto
     rDelta-N_1,16_rdelta_1_R1.fq.gz,16_rdelta_1_R2.fq.gz,auto
     rDelta-N_2,17_rdelta_2_R1.fq.gz,17_rdelta_2_R2.fq.gz,auto
     rDelta-N_3,18_rdelta_3_R1.fq.gz,18_rdelta_3_R2.fq.gz,auto
     rPirola-N_1,19_rpirola_1_R1.fq.gz,19_rpirola_1_R2.fq.gz,auto
     rPirola-N_2,20_rpirola_2_R1.fq.gz,20_rpirola_2_R2.fq.gz,auto
     rPirola-N_3,21_rpirola_3_R1.fq.gz,21_rpirola_3_R2.fq.gz,auto
  5. R-code for evaluation

     # Import the required libraries
     library("AnnotationDbi")
     library("clusterProfiler")
     library("ReactomePA")
     library(gplots)
    
     library(tximport)
     library(DESeq2)
     library(RColorBrewer)
     library(openxlsx)
    
     setwd("~/DATA/Data_Julia_RNASeq_SARS-CoV-2/results/star_salmon")
    
     # Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts.
     files <- c("PBS_r1" = "./PBS_1/quant.sf",
                 "PBS_r2" = "./PBS_2/quant.sf",
                 "PBS_r3" = "./PBS_3/quant.sf",
                 "rLUC_r1" = "./rLUC_1/quant.sf",
                 "rLUC_r2" = "./rLUC_2/quant.sf",
                 "rLUC_r3" = "./rLUC_3/quant.sf",
                 "rAlpha.N_r1" = "./rAlpha-N_1/quant.sf",
                 "rAlpha.N_r2" = "./rAlpha-N_2/quant.sf",
                 "rAlpha.N_r3" = "./rAlpha-N_3/quant.sf",
                 "rBA.2.N_r1" = "./rBA.2-N_1/quant.sf",
                 "rBA.2.N_r2" = "./rBA.2-N_2/quant.sf",
                 "rBA.2.N_r3" = "./rBA.2-N_3/quant.sf",
                 "rBA.5.N_r1" = "./rBA.5-N_1/quant.sf",
                 "rBA.5.N_r2" = "./rBA.5-N_2/quant.sf",
                 "rBA.5.N_r3" = "./rBA.5-N_3/quant.sf",
                 "rDelta.N_r1" = "./rDelta-N_1/quant.sf",
                 "rDelta.N_r2" = "./rDelta-N_2/quant.sf",
                 "rDelta.N_r3" = "./rDelta-N_3/quant.sf",
                 "rPirola.N_r1" = "./rPirola-N_1/quant.sf",
                 "rPirola.N_r2" = "./rPirola-N_2/quant.sf",
                 "rPirola.N_r3" = "./rPirola-N_3/quant.sf")
    
     # ---- tx-level count data ----
     # Import the transcript abundance data with tximport
     txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE)
    
     # Define the replicates and condition of the samples
     #replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2"))
     #condition <- factor(c("PBS", "PBS", "PBS",  "rLUC", "rLUC", "rLUC",  "rAlpha.N", "rAlpha.N", "rAlpha.N",  "rBA.2.N", "rBA.2.N", "rBA.2.N",  "rBA.5.N", "rBA.5.N", "rBA.5.N",  "rDelta.N", "rDelta.N", "rDelta.N",  "rPirola.N", "rPirola.N", "rPirola.N"))
     condition <- factor(c("PBS", "PBS", "PBS",  "rLUC", "rLUC", "rLUC",  "rAlpha-N", "rAlpha-N", "rAlpha-N",  "rBA.2-N", "rBA.2-N", "rBA.2-N",  "rBA.5-N", "rBA.5-N", "rBA.5-N",  "rDelta-N", "rDelta-N", "rDelta-N",  "rPirola-N", "rPirola-N", "rPirola-N"))
    
     # Define the colData for DESeq2
     colData <- data.frame(condition=condition, row.names=names(files))
    
     # Create DESeqDataSet object
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
    
     # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps.
     # Filter data to retain only genes with more than 2 counts > 3 across all samples
     # dds <- dds[rowSums(counts(dds) > 3) > 2, ]
    
     # Output raw count data to a CSV file
     write.csv(counts(dds), file="transcript_counts.csv")
    
     # ---- gene-level count data ----
     # Read in the tx2gene map from salmon_tx2gene.tsv
     #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE)
     tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE)
    
     # Set the column names
     colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name")
    
     # Remove the gene_name column if not needed
     tx2gene <- tx2gene[,1:2]
    
     # Import and summarize the Salmon data with tximport
     txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE)
    
     # Continue with the DESeq2 workflow as before...
     colData <- data.frame(condition=condition, row.names=names(files))
     dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition)
     #dds <- dds[rowSums(counts(dds) > 3) > 2, ]    #60605-->26543
     dds <- DESeq(dds)
     rld <- rlogTransformation(dds)
     write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv")
    
     #TODO: why a lot of reads were removed due to the too_short?
     #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output
    
     dim(counts(dds))
     head(counts(dds), 10)
  6. (Optional) draw 3D PCA plots.

     library(gplots)
     library("RColorBrewer")
    
     library(ggplot2)
     data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE)
     write.csv(data, file="plotPCA_data.csv")
     #calculate all PCs including PC3 with the following codes
     library(genefilter)
     ntop <- 500
     rv <- rowVars(assay(rld))
     select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
     mat <- t( assay(rld)[select, ] )
     pc <- prcomp(mat)
     pc$x[,1:3]
     #df_pc <- data.frame(pc$x[,1:3])
     df_pc <- data.frame(pc$x)
     identical(rownames(data), rownames(df_pc)) #-->TRUE
    
     data$PC1 <- NULL
     data$PC2 <- NULL
     merged_df <- merge(data, df_pc, by = "row.names")
     #merged_df <- merged_df[, -1]
     row.names(merged_df) <- merged_df$Row.names
     merged_df$Row.names <- NULL  # remove the "name" column
     merged_df$name <- NULL
     merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")]
     write.csv(merged_df, file="merged_df_10PCs.csv")
     summary(pc)
     #0.5333  0.2125 0.06852
    
     draw_3D.py
  7. Draw 2D PCA plots.

     # -- pca --
     png("pca.png", 1200, 800)
     plotPCA(rld, intgroup=c("condition"))
     dev.off()
    
     # -- heatmap --
     png("heatmap.png", 1200, 800)
     distsRL <- dist(t(assay(rld)))
     mat <- as.matrix(distsRL)
     hc <- hclust(distsRL)
     hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100)
     heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13))
     dev.off()
  8. (Optional) estimate size factors

     > head(dds)
     class: DESeqDataSet
     dim: 6 10
     metadata(1): version
     assays(6): counts avgTxLength ... H cooks
     rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460
       ENSG00000000938
     rowData names(34): baseMean baseVar ... deviance maxCooks
     colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2
     colData names(2): condition replicate
    
     #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor
     sizeFactors(dds)
     #NULL
     dds <- estimateSizeFactors(dds)
     sizeFactors(dds)
    
     normalized_counts <- counts(dds, normalized=TRUE)
     #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA)
    
     # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ----
     nm <- assays(dds)[["avgTxLength"]]
     sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm)
    
     assays(dds)$counts  # for count data
     assays(dds)$avgTxLength  # for average transcript length, etc.
     assays(dds)$normalizationFactors
    
     In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet.
    
     To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
     > assays(dds)
     List of length 6
     names(6): counts avgTxLength normalizationFactors mu H cooks
    
     To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually:
    
     geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0]))))
     sizeFactors(dds) <- median(assays(dds)$counts / geoMeans, na.rm = TRUE)
    
     # ---- DEBUG END ----
    
     #unter konsole
     #  control_r1  ...
     # 1/0.9978755  ...
    
     > sizeFactors(dds)
                         HeLa_TO_r1                      HeLa_TO_r2
                           0.9978755                       1.1092227
    
     1/0.9978755=1.002129023
     1/1.1092227=
    
     #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor  --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023     --effectiveGenomeSize 2864785220
     bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor  0.901532217        --effectiveGenomeSize 2864785220
  9. Differential expressed gene analyses

     #A central method for exploring differences between groups of segments or samples is to perform differential gene expression analysis.
    
     dds$condition <- relevel(dds$condition, "PBS")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rLUC_vs_PBS","rAlpha.N_vs_PBS","rBA.2.N_vs_PBS","rBA.5.N_vs_PBS","rDelta.N_vs_PBS","rPirola.N_vs_PBS")
    
     dds$condition <- relevel(dds$condition, "rLUC")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rAlpha.N_vs_rLUC","rBA.2.N_vs_rLUC","rBA.5.N_vs_rLUC","rDelta.N_vs_rLUC","rPirola.N_vs_rLUC")
    
     dds$condition <- relevel(dds$condition, "rAlpha-N")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rBA.2.N_vs_rAlpha.N","rBA.5.N_vs_rAlpha.N","rDelta.N_vs_rAlpha.N","rPirola.N_vs_rAlpha.N")
    
     dds$condition <- relevel(dds$condition, "rBA.2-N")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rBA.5.N_vs_rBA.2.N","rDelta.N_vs_rBA.2.N","rPirola.N_vs_rBA.2.N")
    
     dds$condition <- relevel(dds$condition, "rBA.5-N")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rDelta.N_vs_rBA.5.N","rPirola.N_vs_rBA.5.N")
    
     dds$condition <- relevel(dds$condition, "rDelta-N")
     dds = DESeq(dds, betaPrior=FALSE)
     resultsNames(dds)
     clist <- c("rPirola.N_vs_rDelta.N")
    
     ##https://bioconductor.statistik.tu-dortmund.de/packages/3.7/data/annotation/
     #BiocManager::install("EnsDb.Mmusculus.v79")
     #library(EnsDb.Mmusculus.v79)
     #edb <- EnsDb.Mmusculus.v79
    
     #https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
     #https://bioconductor.org/packages/release/bioc/vignettes/biomaRt/inst/doc/accessing_ensembl.html#selecting-an-ensembl-biomart-database-and-dataset
     library(biomaRt)
     listEnsembl()
     listMarts()
     #ensembl <- useEnsembl(biomart = "genes", mirror="asia")  # default is Mouse strains 104
     #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", mirror = "www")
     #ensembl = useMart("ensembl_mart_44", dataset="hsapiens_gene_ensembl",archive=TRUE, mysql=TRUE)
     #ensembl <- useEnsembl(biomart = "ensembl", dataset = "mmusculus_gene_ensembl", version="104")
     #ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="86")
     #--> total 69, 27  GRCh38.p7 and 39  GRCm38.p4; we should take 104, since rnaseq-pipeline is also using annotation of 104!
     ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="114")
     datasets <- listDatasets(ensembl)
     #--> total 202   80                         GRCh38.p13         107                            GRCm39
     #80                         GRCh38.p14
     #107                            GRCm39
    
     listEnsemblArchives()
     #            name     date                                 url version
     #1  Ensembl GRCh37 Feb 2014          https://grch37.ensembl.org  GRCh37
     #2     Ensembl 114 May 2025 https://may2025.archive.ensembl.org     114
     #3     Ensembl 113 Oct 2024 https://oct2024.archive.ensembl.org     113
     #4     Ensembl 112 May 2024 https://may2024.archive.ensembl.org     112
     #5     Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org     111
     #6     Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org     110
     #7     Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org     109
    
     attributes = listAttributes(ensembl)
     attributes[1:25,]
    
     #https://www.ncbi.nlm.nih.gov/grc/human
     #BiocManager::install("org.Mmu.eg.db")
     #library("org.Mmu.eg.db")
     #edb <- org.Mmu.eg.db
     #
     #https://bioconductor.statistik.tu-dortmund.de/packages/3.6/data/annotation/
     #EnsDb.Mmusculus.v79
     #> query(hub, c("EnsDb", "apiens", "98"))
     #columns(edb)
    
     #searchAttributes(mart = ensembl, pattern = "symbol")
    
     ##https://www.geeksforgeeks.org/remove-duplicate-rows-based-on-multiple-columns-using-dplyr-in-r/
     library(dplyr)
     library(tidyverse)
     #df <- data.frame (lang =c ('Java','C','Python','GO','RUST','Javascript',
                           'Cpp','Java','Julia','Typescript','Python','GO'),
                           value = c (21,21,3,5,180,9,12,20,6,0,3,6),
                           usage =c(21,21,0,99,44,48,53,16,6,8,0,6))
     #distinct(df, lang, .keep_all= TRUE)
    
     for (i in clist) {
     #i<-clist[1]
       contrast = paste("condition", i, sep="_")
       res = results(dds, name=contrast)
       res <- res[!is.na(res$log2FoldChange),]
       #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID"))
       #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID"))
       # In the ENSEMBL-database, GENEID is ENSEMBL-ID.
       #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE"))  #  "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND"
       #geness <- geness[!duplicated(geness$GENEID), ]
    
       #using getBM replacing AnnotationDbi::select
       #filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids.
       geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
           filters = 'ensembl_gene_id',
           values = rownames(res),
           mart = ensembl)
       geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE)
    
       #merge by column by common colunmn name, in the case "GENEID"
       res$ENSEMBL = rownames(res)
       identical(rownames(res), geness_uniq$ensembl_gene_id)
       res_df <- as.data.frame(res)
       geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL")
       dim(geness_res)
       rownames(geness_res) <- geness_res$ensembl_gene_id
       geness_res$ensembl_gene_id <- NULL
       write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
       up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
       down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
       write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
       write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
     }
    
     #-- show methods of class DESeq2 --
     #x=capture.output(showMethods(class="DESeq2"))
     #unlist(lapply(strsplit(x[grep("Function: ",x,)]," "),function(x) x[2]))
  10. Volcano plots with automatically finding top_g

     #A canonical visualization for interpreting differential gene expression results is the volcano plot.
     library(ggrepel)
    
     #for i in rLUC_vs_PBS rAlpha.N_vs_PBS rBA.2.N_vs_PBS rBA.5.N_vs_PBS rDelta.N_vs_PBS rPirola.N_vs_PBS; do
     #for i in rAlpha.N_vs_rLUC rBA.2.N_vs_rLUC rBA.5.N_vs_rLUC rDelta.N_vs_rLUC rPirola.N_vs_rLUC; do
     #for i in rBA.2.N_vs_rAlpha.N rBA.5.N_vs_rAlpha.N rDelta.N_vs_rAlpha.N rPirola.N_vs_rAlpha.N; do
     #for i in rBA.5.N_vs_rBA.2.N rDelta.N_vs_rBA.2.N rPirola.N_vs_rBA.2.N; do
     #for i in rDelta.N_vs_rBA.5.N rPirola.N_vs_rBA.5.N; do
     for i in rPirola.N_vs_rDelta.N; do
         echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)"
    
         echo "geness_res\$Color <- \"NS or log2FC < 2.0\""
         echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\""
         echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\""
         echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\""
         echo "geness_res\$Color <- factor(geness_res\$Color, levels = c(\"NS or log2FC < 2.0\", \"P < 0.05\", \"P-adj < 0.05\"))"
         echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
    
         echo "write.csv(geness_res, \"${i}_with_Category.csv\")"
         echo "openxlsx::write.xlsx(geness_res, file = \"${i}_with_Category.xlsx\")"
    
         echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)"
         echo "top_g <- c()"
         echo "top_g <- c(top_g, \
             geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \
             geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])"
         echo "top_g <- unique(top_g)"
    
         # Save filtered subset for optional export/debug
         echo "highlight_genes <- subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & abs(geness_res\$log2FoldChange) >= 2.0)"
    
         echo "geness_res <- geness_res[, -1*ncol(geness_res)]"  # remove invert_P column
    
         echo "png(\"${i}.png\",width=1200, height=2000)"
         echo "ggplot(geness_res, \
             aes(x = log2FoldChange, y = -log10(pvalue), \
                 color = Color, label = external_gene_name)) + \
             geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \
             geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \
             geom_point() + \
             labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \
             scale_color_manual(values = c(\"P-adj < 0.05\"=\"darkblue\",\"P < 0.05\"=\"lightblue\",\"NS or log2FC < 2.0\"=\"darkgray\"), \
                                 guide = guide_legend(override.aes = list(size = 4))) + \
             scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \
             geom_text_repel(data = highlight_genes, size = 4, point.padding = 0.15, color = \"black\", \
                             min.segment.length = .1, box.padding = .2, lwd = 2) + \
             theme_bw(base_size = 16) + \
             theme(legend.position = \"bottom\")"
         echo "dev.off()"
     done
    
     sed -i -e 's/Color/Category/g' *_Category.csv
    
     for i in rLUC_vs_PBS rAlpha.N_vs_PBS rBA.2.N_vs_PBS rBA.5.N_vs_PBS rDelta.N_vs_PBS rPirola.N_vs_PBS    rAlpha.N_vs_rLUC rBA.2.N_vs_rLUC rBA.5.N_vs_rLUC rDelta.N_vs_rLUC rPirola.N_vs_rLUC    rBA.2.N_vs_rAlpha.N rBA.5.N_vs_rAlpha.N rDelta.N_vs_rAlpha.N rPirola.N_vs_rAlpha.N    rBA.5.N_vs_rBA.2.N rDelta.N_vs_rBA.2.N rPirola.N_vs_rBA.2.N    rDelta.N_vs_rBA.5.N rPirola.N_vs_rBA.5.N    rPirola.N_vs_rDelta.N; do
       echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"
     done

It looks as if the viruses are indeed different, especially in terms of “response to virus/IFN induction” (yellow cluster in your heat map).

To get this even clearer, could you possibly now compare the viruses with each other? Initially, I think it would be best to compare each virus with rLUC,

  1. The question above has already been addressed in the previous two steps.

     e.g. rLUC vs rAlpha.N,    rLUC vs rDelta,N,    rLUC vs rBA2.N,    rLUC vs rBA5.N,    rLUC vs rPirola.N?
     # rAlpha.N_vs_rLUC.xls
     # rBA.2.N_vs_rLUC.xls
     # rBA.5.N_vs_rLUC.xls
     # rDelta.N_vs_rLUC.xls
     # rPirola.N_vs_rLUC.xls
  2. Clustering the genes and draw heatmap

     install.packages("gplots")
     library("gplots")
    
     for i in rLUC_vs_PBS rAlpha.N_vs_PBS rBA.2.N_vs_PBS rBA.5.N_vs_PBS rDelta.N_vs_PBS rPirola.N_vs_PBS    rAlpha.N_vs_rLUC rBA.2.N_vs_rLUC rBA.5.N_vs_rLUC rDelta.N_vs_rLUC rPirola.N_vs_rLUC    rBA.2.N_vs_rAlpha.N rBA.5.N_vs_rAlpha.N rDelta.N_vs_rAlpha.N rPirola.N_vs_rAlpha.N    rBA.5.N_vs_rBA.2.N rDelta.N_vs_rBA.2.N rPirola.N_vs_rBA.2.N    rDelta.N_vs_rBA.5.N rPirola.N_vs_rBA.5.N    rPirola.N_vs_rDelta.N; do
       echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
       echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
     done
    
     #    1 1457.2h_vs_1457.M10.2h-down.id
     #    1 1457.2h_vs_1457.M10.2h-up.id
     #   23 1457.2h_vs_uninfected.2h-down.id
     #   74 1457.2h_vs_uninfected.2h-up.id
     #  126 1457.3d_vs_1457.2h-down.id
     #   61 1457.3d_vs_1457.2h-up.id
     #    2 1457.3d_vs_1457.M10.3d-down.id
     #    2 1457.3d_vs_1457.M10.3d-up.id
     #   97 1457.3d_vs_uninfected.3d-down.id
     #   79 1457.3d_vs_uninfected.3d-up.id
     #   17 1457.M10.2h_vs_uninfected.2h-down.id
     #   82 1457.M10.2h_vs_uninfected.2h-up.id
     #  162 1457.M10.3d_vs_1457.M10.2h-down.id
     #   69 1457.M10.3d_vs_1457.M10.2h-up.id
     #  171 1457.M10.3d_vs_uninfected.3d-down.id
     #  124 1457.M10.3d_vs_uninfected.3d-up.id
    
     cat *.id | sort -u > ids
     #add Gene_Id in the first line, delete the ""
     GOI <- read.csv("ids")$Gene_Id  #739 genes
     RNASeq.NoCellLine <- assay(rld)
    
     # Defining the custom order
     #column_order <- c(
     #  "PBS_r1","PBS_r2","PBS_r3","rLUC_r1","rLUC_r2","rLUC_r3","rAlpha.N_r1","rAlpha.N_r2","rAlpha.N_r3","rBA.2.N_r1","rBA.2.N_r2","rBA.2.N_r3","rBA.5.N_r1","rBA.5.N_r2","rBA.5.N_r3","rDelta.N_r1","rDelta.N_r2","rDelta.N_r3","rPirola.N_r1","rPirola.N_r2","rPirola.N_r3"
     #)
     #RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order]
     #head(RNASeq.NoCellLine_reordered)
    
     # Update column names
     new_colnames <- c("PBS 1","PBS 2","PBS 3","rLUC 1","rLUC 2","rLUC 3","rAlpha-N 1","rAlpha-N 2","rAlpha-N 3","rBA.2-N 1","rBA.2-N 2","rBA.2-N 3","rBA.5-N 1","rBA.5-N 2","rBA.5-N 3","rDelta-N 1","rDelta-N 2","rDelta-N 3","rPirola-N 1","rPirola-N 2","rPirola-N 3")
     colnames(RNASeq.NoCellLine) <- new_colnames
    
     #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC).  pearson or spearman
     datamat = RNASeq.NoCellLine[GOI, ]
     write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv")
    
     hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
     hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
     mycl = cutree(hr, h=max(hr$height)/1.1)
     mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED",  "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
     mycol = mycol[as.vector(mycl)]
     #png("DEGs_heatmap.png", width=800, height=1000)
     png("DEGs_heatmap.png", width = 1200, height = 1500, res = 150)  # Higher res
     heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
                 scale='row',trace='none',col=bluered(75),
                 RowSideColors = mycol, labRow="", srtCol=30, keysize=0.72, cexRow = 2, cexCol = 1.4)
     dev.off()
    
     # Extract rows from datamat where the row names match the identifiers in subset_1
    
     #### cluster members #####
     subset_1<-names(subset(mycl, mycl == '1'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ])  #402
    
     subset_2<-names(subset(mycl, mycl == '2'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ])  #137
    
     subset_3<-names(subset(mycl, mycl == '3'))
     data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ])  #200
    
     # Initialize an empty data frame for the annotated data
     annotated_data <- data.frame()
     # Determine total number of genes
     total_genes <- length(rownames(data))
     # Loop through each gene to annotate
     for (i in 1:total_genes) {
         gene <- rownames(data)[i]
         result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'),
                         filters = 'ensembl_gene_id',
                         values = gene,
                         mart = ensembl)
         # If multiple rows are returned, take the first one
         if (nrow(result) > 1) {
             result <- result[1, ]
         }
         # Check if the result is empty
         if (nrow(result) == 0) {
             result <- data.frame(ensembl_gene_id = gene,
                                 external_gene_name = NA,
                                 gene_biotype = NA,
                                 entrezgene_id = NA,
                                 chromosome_name = NA,
                                 start_position = NA,
                                 end_position = NA,
                                 strand = NA,
                                 description = NA)
         }
         # Transpose expression values
         expression_values <- t(data.frame(t(data[gene, ])))
         colnames(expression_values) <- colnames(data)
         # Combine gene information and expression data
         combined_result <- cbind(result, expression_values)
         # Append to the final dataframe
         annotated_data <- rbind(annotated_data, combined_result)
         # Print progress every 100 genes
         if (i %% 100 == 0) {
             cat(sprintf("Processed gene %d out of %d\n", i, total_genes))
         }
     }
    
     # Save the annotated data to a new CSV file
     write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE)
     write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE)
     #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o gene_clusters.xls

Leave a Reply

Your email address will not be published. Required fields are marked *