-
import data and pca-plot
# Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) setwd("~/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon") # Define paths to your Salmon output quantification files files <- c("control_r1" = "./control_r1/quant.sf", "control_r2" = "./control_r2/quant.sf", "HSV.d2_r1" = "./HSV.d2_r1/quant.sf", "HSV.d2_r2" = "./HSV.d2_r2/quant.sf", "HSV.d4_r1" = "./HSV.d4_r1/quant.sf", "HSV.d4_r2" = "./HSV.d4_r2/quant.sf", "HSV.d6_r1" = "./HSV.d6_r1/quant.sf", "HSV.d6_r2" = "./HSV.d6_r2/quant.sf", "HSV.d8_r1" = "./HSV.d8_r1/quant.sf", "HSV.d8_r2" = "./HSV.d8_r2/quant.sf") # Import the transcript abundance data with tximport txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE) # Define the replicates and condition of the samples replicate <- factor(c("r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2", "r1", "r2")) condition <- factor(c("control", "control", "HSV.d2", "HSV.d2", "HSV.d4", "HSV.d4", "HSV.d6", "HSV.d6", "HSV.d8", "HSV.d8")) # Define the colData for DESeq2 colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) # Create DESeqDataSet object dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) # In the context of your new code which is using tximport and DESeq2, you don't necessarily need this step. The reason is that DESeq2 performs its own filtering of low-count genes during the normalization and differential expression steps. # Filter data to retain only genes with more than 2 counts > 3 across all samples # dds <- dds[rowSums(counts(dds) > 3) > 2, ] # Output raw count data to a CSV file write.csv(counts(dds), file="transcript_counts.csv") # -- gene-level count data -- # Read in the tx2gene map from salmon_tx2gene.tsv #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE) tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE) # Set the column names colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name") # Remove the gene_name column if not needed tx2gene <- tx2gene[,1:2] # Import and summarize the Salmon data with tximport txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE) # Continue with the DESeq2 workflow as before... colData <- data.frame(condition=condition, replicate=replicate, row.names=names(files)) dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~condition) #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543 write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv") #~/Tools/csv2xls-0.4/csv_to_xls.py gene_counts.csv -d',' -o gene_counts.xls #TODO: why a lot of reads were removed due to the too_short? #STAR --runThreadN 4 --genomeDir /path/to/GenomeDir --readFilesIn /path/to/read1.fastq /path/to/read2.fastq --outFilterMatchNmin 50 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /path/to/output dim(counts(dds)) head(counts(dds), 10) #DEBUG: DESeq should not used here!? #TODO_NEXT_WEEK: rerun without fistly DESeq(dds) to compare if the results is the same to process_1 #dds <- DESeq(dds) rld <- rlogTransformation(dds) # draw simple pca and heatmap library(gplots) library("RColorBrewer") #mat <- assay(rld) #mm <- model.matrix(~condition, colData(rld)) #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm) #assay(rld) <- mat # -- pca -- png("pca.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) dev.off() # -- heatmap -- png("heatmap.png", 1200, 800) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13)) dev.off()
-
draw 3D PCA plots.
library(gplots) library("RColorBrewer") library(ggplot2) data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE) write.csv(data, file="plotPCA_data.csv") #calculate all PCs including PC3 with the following codes library(genefilter) ntop <- 500 rv <- rowVars(assay(rld)) select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] mat <- t( assay(rld)[select, ] ) pc <- prcomp(mat) pc$x[,1:3] #df_pc <- data.frame(pc$x[,1:3]) df_pc <- data.frame(pc$x) identical(rownames(data), rownames(df_pc)) #-->TRUE data$PC1 <- NULL data$PC2 <- NULL merged_df <- merge(data, df_pc, by = "row.names") #merged_df <- merged_df[, -1] row.names(merged_df) <- merged_df$Row.names merged_df$Row.names <- NULL # remove the "name" column merged_df$name <- NULL merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","PC9","PC10","group","condition","replicate")] write.csv(merged_df, file="merged_df_10PCs.csv") summary(pc) #0.5333 0.2125 0.06852 #0.8026 0.09042 0.06578 draw_3D.py # adjust proportion to real values in the following plot import plotly.graph_objects as go import pandas as pd from sklearn.decomposition import PCA import numpy as np # Read in data as a pandas dataframe #df = pd.DataFrame({ # 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215], # 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993], # 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358], # 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'], # 'replicate': ['DI', 'DII', 'DI', 'DII', 'DI'] #}) df = pd.read_csv('merged_df_10PCs.csv', index_col=0, header=0) df['condition'] = df['condition'].replace("control", "control") df['condition'] = df['condition'].replace("HSV.d2", "day 2") df['condition'] = df['condition'].replace("HSV.d4", "day 4") df['condition'] = df['condition'].replace("HSV.d6", "day 6") df['condition'] = df['condition'].replace("HSV.d8", "day 8") # Fit PCA model to reduce data dimensions to 3 pca = PCA(n_components=3) pca.fit(df.iloc[:, :-3]) X_reduced = pca.transform(df.iloc[:, :-3]) # Get variance ratios explained_variance_ratio = pca.explained_variance_ratio_ # Add reduced data back to dataframe df['PC1'] = X_reduced[:, 0] df['PC2'] = X_reduced[:, 1] df['PC3'] = X_reduced[:, 2] # Create PCA plot with 3D scatter fig = go.Figure() ##ff7f00 condition_color_map = { 'control': 'rgb(100, 100, 100)', 'day 2': '#33a02c', 'day 4': '#1f78b4', 'day 6': '#e31a1c', 'day 8': 'magenta' } replicate_symbol_map = {'r1': 'circle', 'r2': 'diamond'} for replicate, replicate_symbol in replicate_symbol_map.items(): for condition, condition_color in condition_color_map.items(): mask = (df['condition'] == condition) & (df['replicate'] == replicate) fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'], mode='markers', name=f'{condition}' if replicate == 'r1' else None, legendgroup=f'{condition}', showlegend=True if replicate == 'r1' else False, marker=dict(size=6 if replicate_symbol in ['diamond'] else 10, opacity=0.8, color=condition_color, symbol=replicate_symbol))) for replicate, replicate_symbol in replicate_symbol_map.items(): fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', name=replicate, legendgroup=f'{replicate}', showlegend=True, marker=dict(size=10, opacity=1, color='black', symbol=replicate_symbol), hoverinfo='none')) # Annotations for the legend blocks #TODO: calculate the PC values. #TODO: adjust the axis length according to the actual size of axis! fig.update_layout( annotations=[ dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False, text='Condition', font=dict(size=15)), dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False, text='Replicate', font=dict(size=15)) ], scene=dict( #aspectmode='cube', #xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', scaleratio=0.53), #yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', scaleratio=0.21), #zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', scaleratio=0.07), aspectmode='manual', aspectratio=dict(x=explained_variance_ratio[0], y=explained_variance_ratio[1], z=explained_variance_ratio[2]), xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC1: 53% v.', range=[min(df['PC1']), max(df['PC1'])]), yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC2: 21% v.', range=[min(df['PC2']), max(df['PC2'])]), zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title='PC3: 7% variance', range=[min(df['PC3']), max(df['PC3'])]), bgcolor='white' ), margin=dict(l=5, r=5, b=5, t=0) # Adjust the margins to prevent clipping of axis titles ) #fig.show() fig.write_image("fig1.svg") fig.update_layout( annotations=[ dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False, text='Condition', font=dict(size=15)), dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False, text='Donor', font=dict(size=15)), dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False, text=f'PC3: {explained_variance_ratio[2]*100:.2f}% v.', font=dict(size=15), textangle=-90) ], scene=dict( aspectmode='manual', aspectratio=dict(x=explained_variance_ratio[0]*2, y=explained_variance_ratio[1]*2, z=explained_variance_ratio[2]*2), #, range=[min(df['PC1']), max(df['PC1'])] #, range=[min(df['PC2']), max(df['PC2'])] #, range=[min(df['PC3']), max(df['PC3'])] xaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC1: {explained_variance_ratio[0]*100:.2f}% variance'), yaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=f'PC2: {explained_variance_ratio[1]*100:.2f}% v.'), zaxis=dict(gridcolor='black', backgroundcolor='white', zerolinecolor='black', title=''), bgcolor='white' ), margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles ) fig.write_image("PCA_3D.svg") #/usr/bin/convert g235.png -crop 3250x1680+1+750 PCA_3D_.png
-
(optional) estimate size factors
> head(dds) class: DESeqDataSet dim: 6 10 metadata(1): version assays(6): counts avgTxLength ... H cooks rownames(6): ENSG00000000003 ENSG00000000005 ... ENSG00000000460 ENSG00000000938 rowData names(34): baseMean baseVar ... deviance maxCooks colnames(10): control_r1 control_r2 ... HSV.d8_r1 HSV.d8_r2 colData names(2): condition replicate #convert bam to bigwig using deepTools by feeding inverse of DESeq’s size Factor sizeFactors(dds) #NULL dds <- estimateSizeFactors(dds) > sizeFactors(dds) raw_counts <- counts(dds) normalized_counts <- counts(dds, normalized=TRUE) #write.table(raw_counts, file="raw_counts.txt", sep="\t", quote=F, col.names=NA) #write.table(normalized_counts, file="normalized_counts.txt", sep="\t", quote=F, col.names=NA) # ---- DEBUG sizeFactors(dds) always NULL, see https://support.bioconductor.org/p/97676/ ---- nm <- assays(dds)[["avgTxLength"]] sf <- estimateSizeFactorsForMatrix(counts(dds), normMatrix=nm) assays(dds)$counts # for count data assays(dds)$avgTxLength # for average transcript length, etc. assays(dds)$normalizationFactors In normal circumstances, the size factors should be stored in the DESeqDataSet object itself and not in the assays, so they are typically not retrievable via the assays() function. However, due to the issues you're experiencing, you might be able to manually compute the size factors and assign them back to the DESeqDataSet. To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually: > assays(dds) List of length 6 names(6): counts avgTxLength normalizationFactors mu H cooks To calculate size factors manually, DESeq2 uses the median ratio method. Here's a very simplified version of how you could compute this manually: geoMeans <- apply(assays(dds)$counts, 1, function(row) if (all(row == 0)) 0 else exp(mean(log(row[row != 0])))) sizeFactors(dds) <- median(assays(dds)$counts / geoMeans, na.rm = TRUE) # ---- DEBUG END ---- #unter konsole # control_r1 ... # 1/0.9978755 ... > sizeFactors(dds) HeLa_TO_r1 HeLa_TO_r2 0.9978755 1.1092227 1/0.9978755=1.002129023 1/1.1092227= #bamCoverage --bam ../markDuplicates/${sample}Aligned.sortedByCoord.out.bam -o ${sample}_norm.bw --binSize 10 --scaleFactor --effectiveGenomeSize 2864785220 bamCoverage --bam ../markDuplicates/HeLa_TO_r1Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r1.bw --binSize 10 --scaleFactor 1.002129023 --effectiveGenomeSize 2864785220 bamCoverage --bam ../markDuplicates/HeLa_TO_r2Aligned.sortedByCoord.out.markDups.bam -o HeLa_TO_r2.bw --binSize 10 --scaleFactor 0.901532217 --effectiveGenomeSize 2864785220
-
compare the normalization methods
# ---- draw normalization before and after ---- ### Let's implement such a function ### cds is a countDataset estimSf <- function (cds){ # Get the count matrix cts <- counts(cds) # Compute the geometric mean geomMean <- function(x) prod(x)^(1/length(x)) # Compute the geometric mean over the line gm.mean <- apply(cts, 1, geomMean) # Zero values are set to NA (avoid subsequentcdsdivision by 0) gm.mean[gm.mean == 0] <- NA # Divide each line by its corresponding geometric mean # sweep(x, MARGIN, STATS, FUN = "-", check.margin = TRUE, ...) # MARGIN: 1 or 2 (line or columns) # STATS: a vector of length nrow(x) or ncol(x), depending on MARGIN # FUN: the function to be applied cts <- sweep(cts, 1, gm.mean, FUN="/") # Compute the median over the columns med <- apply(cts, 2, median, na.rm=TRUE) # Return the scaling factor return(med) } #https://dputhier.github.io/ASG/practicals/rnaseq_diff_Snf2/rnaseq_diff_Snf2.html #http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#data-transformations-and-visualization #https://hbctraining.github.io/DGE_workshop/lessons/02_DGE_count_normalization.html #https://hbctraining.github.io/DGE_workshop/lessons/04_DGE_DESeq2_analysis.html #https://genviz.org/module-04-expression/0004/02/01/DifferentialExpression/ #DESeq2’s median of ratios [1] #EdgeR’s trimmed mean of M values (TMM) [2] #http://www.nathalievialaneix.eu/doc/html/TP1_normalization.html #very good website! test_normcount <- sweep(raw_counts, 2, sizeFactors(dds), "/") sum(test_normcount != normalized_counts) head(estimSf(dds)) all(round(estimSf(dds),6) == round(sizeFactors(dds), 6)) ## Checking the normalization png("normalization.png", width=800, height=600) epsilon <- 1 # pseudo-count to avoid problems with log(0) par(mfrow=c(1,2),cex.lab=0.7) boxplot(log2(raw_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(raw counts)", horizontal=TRUE, main="Raw counts") boxplot(log2(normalized_counts+epsilon), cex.axis=0.7, las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts") #boxplot(log10(assays(dds)[["cooks"]]), range=0, las=2) #plotDensity(log2(counts(dds.norm)+epsilon), col=col.pheno.selected, # xlab="log2(counts)", cex.lab=0.7, panel.first=grid()) #plotDensity(log2(counts(dds.norm, normalized=TRUE)+epsilon), col=col.pheno.selected, # xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) dev.off() # since we Gene-level differential expression analysis with DESeq2, the splicing plays no role in the analysis! # 用nanopore 可以 compare transcript length distribution. 有可能Cellline很长,Extracellular vesicles (EVs)很短! library(ggplot2) library(gridExtra) library(reshape2) library(mixOmics) library(RColorBrewer) library(DESeq) library(edgeR) library(VennDiagram) library(devtools) raw_counts_wn <- raw_counts[rowSums(raw_counts) > 0, ] dim(raw_counts_wn) #--Raw counts-- pseudo_counts <- log2(raw_counts_wn + 1) head(pseudo_counts) df_raw <- melt(pseudo_counts, id = rownames(raw_counts_wn)) names(df_raw)[1:2]<- c("id", "sample") df_raw$method <- rep("Raw counts", nrow(df_raw)) head(df_raw) #--DESeq-- cData = data.frame(row.names=colnames(raw_counts_wn), replicates=replicates, ids=ids) dge<-DESeqDataSetFromMatrix(countData=raw_counts_wn, colData=cData, design=~replicates) dge <- estimateSizeFactors(dge) sizeFactors(dge) deseq_normcount <- counts(dge, normalized = TRUE) test_normcount <- sweep(raw_counts_wn, 2, sizeFactors(dge), "/") sum(test_normcount != deseq_normcount) pseudo_deseq <- log2(deseq_normcount + 1) df_deseq <- melt(pseudo_deseq, id = rownames(raw_counts_wn)) names(df_deseq)[1:2]<- c("id", "sample") df_deseq$method <- rep("DESeq (RLE)", nrow(df_raw)) #--edgeR-- dge2 <- DGEList(raw_counts_wn) dge2 dge2$samples #--Total count-- pseudo_TC <- log2(cpm(dge2) + 1) df_TC <- melt(pseudo_TC, id = rownames(raw_counts_wn)) names(df_TC)[1:2] <- c ("id", "sample") df_TC$method <- rep("TC", nrow(df_TC)) ##--RPKM-- #gene_lengths_wn <- gene_lengths[rowSums(raw_counts) > 0] #pseudo_RPKM <- log2(rpkm(dge2, gene.length = gene_lengths_wn) + 1) #df_RPKM <- melt(pseudo_RPKM, id = rownames(raw_counts_wn)) #names(df_RPKM)[1:2] <- c ("id", "sample") #df_RPKM$method <- rep("RPKM", nrow(df_RPKM)) #--Upper quartile-- dge2 <- calcNormFactors(dge2, method = "upperquartile") dge2$samples test_normcount <- sweep(dge2$counts, 2, dge2$samples$lib.size*dge2$samples$norm.factors / 10^6, "/") range(as.vector(test_normcount - cpm(dge2))) pseudo_UQ <- log2(cpm(dge2) + 1) df_UQ <- melt(pseudo_UQ, id = rownames(raw_counts_wn)) names(df_UQ)[1:2] <- c ("id", "sample") df_UQ$method <- rep("UQ", nrow(df_UQ)) #--TMM-- dge2 <- calcNormFactors(dge2, method = "TMM") dge2$samples pseudo_TMM <- log2(cpm(dge2) + 1) df_TMM <- melt(pseudo_TMM, id = rownames(raw_counts_wn)) names(df_TMM)[1:2] <- c ("id", "sample") #MODIFIED! df_TMM$method <- rep("DESeq (RLE)", nrow(df_TMM)) #TMM #--Comparison-- png("normalization.png", width=800, height=600) #df_allnorm <- rbind(df_raw, df_deseq, df_TC, df_UQ, df_TMM) #df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)", "TC", "TMM", "UQ")) df_allnorm <- rbind(df_raw, df_TMM) df_allnorm$method <- factor(df_allnorm$method, levels = c("Raw counts", "DESeq (RLE)")) p <- ggplot(data=df_allnorm, aes(x=sample, y=value, fill=method)) p <- p + geom_boxplot() p <- p + theme_bw() p <- p + ggtitle("Boxplots of normalized pseudo counts\n for all samples by normalization methods") p <- p + facet_grid(. ~ method) p <- p + ylab(expression(log[2] ~ (normalized ~ count + 1))) + xlab("") p <- p + theme(title = element_text(size=10), axis.text.x = element_blank(), axis.ticks.x = element_blank()) print(p) dev.off()
-
select the differentially expressed genes
#https://galaxyproject.eu/posts/2020/08/22/three-steps-to-galaxify-your-tool/ #https://www.biostars.org/p/282295/ #https://www.biostars.org/p/335751/ #> condition # [1] control control HSV.d2 HSV.d2 HSV.d4 HSV.d4 HSV.d6 HSV.d6 HSV.d8 HSV.d8 #Levels: control HSV.d2 HSV.d4 HSV.d6 HSV.d8 #CONSOLE: mkdir /home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes setwd("/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/results_chrHsv1_downstream/star_salmon/degenes") #---- relevel to control ---- dds$condition <- relevel(dds$condition, "control") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d2_vs_control","HSV.d4_vs_control", "HSV.d6_vs_control", "HSV.d8_vs_control") dds$condition <- relevel(dds$condition, "HSV.d2") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d4_vs_HSV.d2", "HSV.d6_vs_HSV.d2", "HSV.d8_vs_HSV.d2") dds$condition <- relevel(dds$condition, "HSV.d4") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d6_vs_HSV.d4", "HSV.d8_vs_HSV.d4") dds$condition <- relevel(dds$condition, "HSV.d6") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("HSV.d8_vs_HSV.d6") for (i in clist) { contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] res_df <- as.data.frame(res) write.csv(as.data.frame(res_df[order(res_df$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(res_df, padj<=0.05 & log2FoldChange>=1) down <- subset(res_df, padj<=0.05 & log2FoldChange<=-1) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) } echo "contrast = paste(\"condition\", \"${i}\", sep=\"_\")" echo "res = results(dds, name=contrast)" #echo "res <- res[!is.na(res$log2FoldChange),]" echo "res <- na.omit(res)" ##https://github.com/kevinblighe/EnhancedVolcano #BiocManager::install("EnhancedVolcano") library("EnhancedVolcano") for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do #for i in HSV.d8_vs_control; do echo "res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names = 1)" echo "res_df <- as.data.frame(res)" echo "png(\"${i}.png\",width=800, height=600)" #legendPosition = 'right',legendLabSize = 12, arrowheads = FALSE, #echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.2, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(~Delta*\"$(echo $i | cut -d'_' -f1) versus \" *~Delta*\"$(echo $i | cut -d'_' -f3)\"))" echo "EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression(\"$(echo $i | cut -d'_' -f1) versus $(echo $i | cut -d'_' -f3)\"))" echo "dev.off()" done #DEBUG: why some genes in HSV.d8 in control high regulated --> ERROR! We should keep the number of reads in the raw counts, leading all genes low regulated! Not using the default normalization method!!!! #res <- read.csv(file = paste("HSV.d8_vs_control", "all.txt", sep="-"), row.names = 1) #res_df <- as.data.frame(res) #png("HSV.d8_vs_control.png",width=800, height=600) #EnhancedVolcano(res, lab = rownames(res),x = 'log2FoldChange',y = 'padj', pCutoff=5e-2, FCcutoff=1.0, title='', subtitleLabSize = 18, pointSize = 3.0, labSize = 5.0, colAlpha=1, legendIconSize = 4.0, drawConnectors = TRUE, widthConnectors = 0.5, colConnectors = 'black', subtitle=expression("HSV.d8 versus control")) #dev.off() #under DIR degenes under KONSOLE for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;"; done
-
clustering the genes and draw heatmap
for i in HSV.d2_vs_control HSV.d4_vs_control HSV.d6_vs_control HSV.d8_vs_control HSV.d4_vs_HSV.d2 HSV.d6_vs_HSV.d2 HSV.d8_vs_HSV.d2 HSV.d6_vs_HSV.d4 HSV.d8_vs_HSV.d4 HSV.d8_vs_HSV.d6; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id"; echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id"; done cat *.id | sort -u > ids #add Gene_Id in the first line, delete the "" GOI <- read.csv("ids")$Gene_Id RNASeq.NoCellLine <- assay(rld) #install.packages("gplots") library("gplots") #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman #datamat = RNASeq.NoCellLine[GOI, ] datamat = RNASeq.NoCellLine write.csv(as.data.frame(datamat), file ="gene_expressions.txt") constant_rows <- apply(datamat, 1, function(row) var(row) == 0) if(any(constant_rows)) { cat("Removing", sum(constant_rows), "constant rows.\n") datamat <- datamat[!constant_rows, ] } hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete") hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete") mycl = cutree(hr, h=max(hr$height)/1.5) mycol = c("YELLOW", "BLUE", "ORANGE", "MAGENTA", "CYAN", "RED", "GREEN", "MAROON", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN", "LIGHTRED", "LIGHTGREEN"); mycol = mycol[as.vector(mycl)] #png("DEGs_heatmap.png", width=900, height=800) #cex.lab=10, labRow="", png("DEGs_heatmap.png", width=900, height=1000) heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row', scale='row',trace='none',col=bluered(75), RowSideColors = mycol, margins=c(10,20), cexRow=1.5, srtCol=45, lhei = c(2, 8)) #rownames(datamat) #heatmap.2(datamat, Rowv=as.dendrogram(hr), col=bluered(75), scale="row", RowSideColors=mycol, trace="none", margin=c(5,5), sepwidth=c(0,0), dendrogram = 'row', Colv = 'false', density.info='none', labRow="", srtCol=30, lhei=c(0.1,2)) dev.off() #### cluster members ##### write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt') write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt') #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.txt -d',' -o DEGs_heatmap_cluster_members.xls ~/Tools/csv2xls-0.4/csv_to_xls.py \ significant_gene_expressions.txt \ -d',' -o DEGs_heatmap_expression_data.xls;
RNA-seq skin organoids on GRCh38+chrHsv1
Leave a reply