-
install conda environment rnaseq_2021
conda create -n rnaseq_2021 python=3.6.7 conda activate rnaseq_2021 conda install -c conda-forge -c bioconda -c defaults nextflow=21.04 fastqc=0.11.8 trim-galore=0.5.0 star=2.6.1d hisat2=2.1.0 conda install -c conda-forge -c bioconda -c defaults picard=2.18.27 csvtk=0.17.0 preseq=2.0.3 conda install -c conda-forge -c bioconda -c defaults samtools=1.9 conda install -c conda-forge -c bioconda -c defaults gffread=0.9.12 conda install -c conda-forge -c bioconda -c defaults subread=1.6.4 conda install -c conda-forge -c bioconda -c defaults deeptools=3.2.0 conda install -c conda-forge -c bioconda -c defaults multiqc=1.7 #* conda install -c conda-forge -c bioconda -c defaults conda-forge::r-data.table=1.12.0 conda-forge::r-gplots=3.0.1.1 bioconductor-dupradar=1.12.1 bioconductor-edger=3.24.1 conda install -c conda-forge -c bioconda -c defaults stringtie=1.3.5 conda install -c conda-forge -c bioconda -c defaults rseqc=3.0.0
-
prepare the virus gtf-file
# -- processing for virus gtf -- # #gffread X14112.1_gene.gff -T -o X14112.1_gene.gtf # cp X14112.1.gff X14112.1.gff3 # #gffread -E -F --bed X14112.1.gff3 -o X14112.1.bed (change the name errors in 1 intron and 2 genes) # grep "^##" X14112.1.gff3 > X14112.1_gene.gff3 # # # --try to filter the file with genes --> failed -- # grep "ID=gene" X14112.1.gff3 >> X14112.1_gene.gff # cp X14112.1.gff3 X14112.1.gff # -- generating *_gene.gtf file containing only gene records -- python3 gff2gtf.py # -- check if gene_id is unique -- cut -f9- -d$'\t' X14112.1_gene.gtf > temp cut -f1 -d';' temp > temp_ #111 sort temp_ > temp_1 sort temp_ -u > temp_2 diff temp_1 temp_2 #39d38 #< ID=gene-UL29 #59d57 #< ID=gene-UL43 #--> delete short ones of the repeated records --> 109 records python3 extends.py #generating the file X14112.1_gene_extended.gtf #Then replace 'transcript_id "exon' --> 'transcript_id "rna' in X14112.1_gene_extended.gtf gffread -E -F --bed X14112.1_gene_extended.gtf -o X14112.1_gene_extended.bed #-->bed contains 109 transcript-name for example "rna-gene-RS1-2" ##!!!!OPTIONAL!!!!: don't need to change type '\tgene\t' to '\texon\t', since X14112.1_gene_extended.gtf contains exon-records. nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "K14112.1.fasta" --gtf "K14112.1_gene_extended.gtf" --bed12 "K14112.1_gene_extendced.bed" --singleEnd -profile standard --aligner star --fcGroupFeaturesType gene_biotype --skip_genebody_coverage -resume --saveReference # -- correct some special records (optional, as the processing above didn't genrate the records) -- # delete the lines starting with "#" # replace "X14112.1:146805..151063" to IE175 # replace "X14112.1:133941..146107" to IE68 # add ;gene=IE68 ;gene=IE175 to the corresponding lines # -- python code for convert gff to gtf -- # open the input file for reading and the output file for writing # -- scripts choose gene or exon or mRNA -- # python3 gff2gtf.py #X14112.1.gff-->X14112.1.gtf # replace '; transcript_id "gene' to '; transcript_id "tx' # !!!!VERY_IMPORTANT!!!!: change type '\tgene\t' to '\texon\t'! # sed -i -e "s/\tgene\t/\texon\t/g" X14112.1_gene.gff # since default is --featurecounts_feature_type 'exon'.
-
nextflow command: the input should be *.umi_extract.fastq.gz.
#SUCCESSFUL (rnaseq) jhuang@hamburg:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ /home/jhuang/anaconda3/bin/nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner hisat2 --fcGroupFeaturesType gene_biotype --skip_genebody_coverage --skip_preseq -resume --saveReference #NOT_TESTED (rnaseq_2021) jhuang@hamm:~/DATA/Data_Manja_RNAseq_Organoids_Virus$ nextflow run rnaseq_old/main.nf --reads "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/Raw_Data/*.fastq.gz" --fasta "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1.fasta" --gtf "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.gtf" --bed12 "/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/X14112.1_gene_extended.bed" --singleEnd -profile standard --aligner star -resume --saveReference
-
snippet of the human hg38 gtf served as a pattern
1 ensembl_havana gene 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 1 ensembl_havana transcript 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana exon 685679 686673 . - . gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; exon_id "ENSE00002324228"; exon_version "3"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana CDS 685719 686654 . - 0 gene_id "ENSG00000284662"; gene_version "1"; transcript_id "ENST00000332831"; transcript_version "4"; exon_number "1"; gene_name "OR4F16"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "OR4F16-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS41221"; protein_id "ENSP00000329982"; protein_version "2"; tag "basic"; transcript_support_level "NA (assigned to previous version 3)"; 1 ensembl_havana gene 1211340 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; 1 havana transcript 1211340 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; 1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2"; 1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2"; 1 ensembl_havana exon 1211704 1211832 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001333051"; exon_v ersion "1"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana CDS 1211704 1211832 . - 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "6"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana exon 1211340 1211625 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; exon_id "ENSE00001915458"; exon_version "2"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana CDS 1211558 1211625 . - 2 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; protein_id "ENSP00000368538"; protein_version "3"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana stop_codon 1211555 1211557 . - 0 gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; exon_number "7"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana five_prime_utr 1214128 1214153 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 ensembl_havana three_prime_utr 1211340 1211554 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000379236"; transcript_version "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-201"; transcript_source "ensembl_havana"; transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS11"; tag "basic"; transcript_support_level "1 (assigned to previous version 3)"; 1 havana transcript 1211340 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; transcript_support_level "2"; 1 havana exon 1213983 1214138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001480264"; exon_version "3"; transcript_support_level "2"; 1 havana exon 1212992 1213785 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203 "; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001906619"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211942 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00003604411"; exon_version "1"; transcript_support_level "2"; 1 havana exon 1211340 1211832 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000497869"; transcript_version "5"; exon_number "5"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-203"; transcript_source "havana"; transcript_biotype "retained_intron"; exon_id "ENSE00001923078"; exon_version "1"; transcript_support_level "2"; 1 havana transcript 1212019 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; transcript_support_level "3"; 1 havana exon 1213395 1213498 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "1"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001680308"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212992 1213093 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "2"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003495433"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212638 1212704 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "3"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00003550137"; exon_version "1"; transcript_support_level "3"; 1 havana exon 1212019 1212138 . - . gene_id "ENSG00000186827"; gene_version "11"; transcript_id "ENST00000453580"; transcript_version "1"; exon_number "4"; gene_name "TNFRSF4"; gene_source "ensembl_havana"; gene_biotype "protein_coding"; transcript_name "TNFRSF4-202"; transcript_source "havana"; transcript_biotype "processed_transcript"; exon_id "ENSE00001723250"; exon_version "1"; transcript_support_level "3";
-
generate wig files
samtools faidx X14112.1.fasta cut -f1,2 X14112.1.fasta.fai > results/markDuplicates/chrom.sizes cd results/markDuplicates/ for sample in control_r1 control_r2 HSV.d2_r1 HSV.d2_r2 HSV.d4_r1 HSV.d4_r2 HSV.d6_r1 HSV.d6_r2 HSV.d8_r1 HSV.d8_r2; do #bamCoverage -b ${sample}.umi_extract.sorted.markDups.bam -o ${sample}.bw #bedtools genomecov -ibam ${sample}.umi_extract.sorted.markDups.bam -bg > ${sample}.bedgraph bedGraphToBigWig ${sample}.bedgraph chrom.sizes ${sample}.bw bigWigToWig ${sample}.bw ${sample}.wig done
-
input and clean data using R
#BiocManager::install(c("DESeq2")) requiredPackages1 <-c("AnnotationDbi","clusterProfiler","ReactomePA","org.Hs.eg.db","DESeq2", "gplots", "RColorBrewer") ipak <- function(pkg){ new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])] if (length(new.pkg)) install.packages(new.pkg, dependencies = TRUE) sapply(pkg, require, character.only = TRUE) } ipak(requiredPackages1) #requiredPackages2 <- c("tidyverse") #ipak(requiredPackages2) #cut -f1-1 merged_gene_counts.txt > col1.txt #paste -d$'\t' col1.txt merged_gene_counts2.txt > merged_gene_counts3.txt #sed -i 's/gene-//g' merged_gene_counts3.txt #replace "X14112.1" to "X14112"; delete "rna-gene-"; get X14112.1_gene_extended2.gtf; using it in IGV library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library("org.Hs.eg.db") library(DESeq2) library(gplots) setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts") #---- dataset (27) samples (firstly import all samples, then spring to 27-3-1-2) ---- d.raw<- read.delim2("merged_gene_counts3.txt",sep="\t", header=TRUE, row.names=1) #> head(d.raw,0) # [1] HSV.d4_r2.umi_extract.sorted.bam HSV.d6_r1.umi_extract.sorted.bam # [3] HSV.d4_r1.umi_extract.sorted.bam control_r1.umi_extract.sorted.bam # [5] HSV.d2_r2.umi_extract.sorted.bam control_r2.umi_extract.sorted.bam # [7] HSV.d2_r1.umi_extract.sorted.bam HSV.d8_r2.umi_extract.sorted.bam # [9] HSV.d8_r1.umi_extract.sorted.bam HSV.d6_r2.umi_extract.sorted.bam col.order <- c("control_r1.umi_extract.sorted.bam","control_r2.umi_extract.sorted.bam", "HSV.d2_r1.umi_extract.sorted.bam","HSV.d2_r2.umi_extract.sorted.bam", "HSV.d4_r1.umi_extract.sorted.bam","HSV.d4_r2.umi_extract.sorted.bam", "HSV.d6_r1.umi_extract.sorted.bam","HSV.d6_r2.umi_extract.sorted.bam", "HSV.d8_r1.umi_extract.sorted.bam","HSV.d8_r2.umi_extract.sorted.bam") d <- d.raw[,col.order] #reordered.raw #d <- reordered.raw[rowSums(reordered.raw>3)>2,] colnames(d) = c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2") # Define the replicates and condition of the samples ids <- factor(c("control_r1","control_r2", "HSV.d2_r1","HSV.d2_r2", "HSV.d4_r1","HSV.d4_r2", "HSV.d6_r1","HSV.d6_r2", "HSV.d8_r1","HSV.d8_r2")) replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2")) condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8")) # Construct the DESeqDataSet cData = data.frame(row.names=colnames(d), replicate=replicate, condition=condition, ids=ids) dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~condition) # Run DESeq (early without the step, WRONG?) dds <- DESeq(dds) # Apply the rlog transformation rld <- rlogTransformation(dds) #rld <- vst(dds) #vsd #-- save raw_data as xls -- write.csv(d, file="d.csv") #~/Tools/csv2xls-0.4/csv_to_xls.py d.csv -d$',' -o d.xls
-
plotting pca and heatmap and remove batchEffect
# -- pca -- png("pca.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) #plotPCA(rld, intgroup = c("condition", "batch")) #plotPCA(rld, intgroup = c("condition", "ids")) #plotPCA(rld, "batch") dev.off() # -- heatmap -- ## generate the pairwise comparison between samples library(gplots) library("RColorBrewer") png("heatmap.png", 1200, 800) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) #paste( rld$dex, rld$cell, sep="-" ) #rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,batch, sep=":")) rownames(mat) <- colnames(mat) <- with(colData(dds),paste(condition,ids, sep=":")) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13)) dev.off()
-
select the differentially expressed genes
#https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/ #https://www.biostars.org/p/282295/ #https://www.biostars.org/p/335751/ #> condition # [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8 #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8 #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results/featureCounts/degenes") #---- relevel to control ---- dds$condition <- relevel(dds$condition, "control") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control") dds$condition <- relevel(dds$condition, "HSV.d2") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2") dds$condition <- relevel(dds$condition, "HSV.d4") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4") dds$condition <- relevel(dds$condition, "HSV.d6") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d8_vs_HSV.d6") for (i in clist) { contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] res_df <- as.data.frame(res) write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(res_df, padj<=0.05 & log2FoldChange>=1.2) down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1.2) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) } ##https://github.com/kevinblighe/EnhancedVolcano #BiocManager::install("EnhancedVolcano") library("EnhancedVolcano") #for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control; do #for i in HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2; do #for i in HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4; do for i in HSV.d8_vs_HSV.d6; do echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")" echo "res = results(dds, name=contrast)" #echo "res <- res[!is.na(res$log2FoldChange),]" echo "res <- na.omit(res)" echo "res_df <- as.data.frame(res)" #selectLab = selectLab_italics, echo "png(\"${i}.png\",width=1200, height=1000)" #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE, echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))" echo "dev.off()" done #under DIR degenes under KONSOLE for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;" done
9 (optional). clustering the genes and draw heatmap
install.packages("gplots")
library("gplots")
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do
echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"
echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"
done
cat *.id | sort -u > ids
#add Gene_Id in the first line, delete the ""
GOI <- read.csv("ids")$Gene_Id
RNASeq.NoCellLine <- assay(rld)
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine[GOI, ]
write.csv(as.data.frame(datamat), file ="significant_gene_expressions.txt")
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.5)
mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN");
mycol = mycol[as.vector(mycl)]
#png("DEGs_heatmap.png", width=900, height=800)
#cex.lab=10, labRow="",
png("DEGs_heatmap.png", width=900, height=1000)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat)
#heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2))
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
#~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_expression_data.xls;