#nextflow run rnaseq --reads '/home/jhuang/DATA/Data_Denise_tx_epi_MCPyV/Raw_Data_RNAseq_K331A/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
#under jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq/Raw_Data_RNAseq_K331A_d8_SPANDx$
#Replace "p600" with "control", "p602" with "LT", "p605" with "LTtr", and "p783" with "K331A". Please note that the RNAseq data from the LT_K331A_d8 replicates were sequenced alongside the ChIPseq batch.
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf928/1_NHDF_Donor_2_p600_S23_R1_001.fastq.gz control-d8-DII_re.fastq.gz
ln -s ~/DATA/Data_Denise_LT_RNASeq/231006_NB501882_0434_AHCT3NBGXV/nf929/2_NHDF_Donor_2_p783_S24_R1_001.fastq.gz LT-K331A-d8-DII_re.fastq.gz
(rnaseq_2021) jhuang@hamm:~/DATA/Data_Denise_LT_RNASeq$ nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --bed12 /home/jhuang/REFs/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
nextflow run rnaseq --reads 'Raw_Data_RNAseq_K331A_d8_SPANDx/*.fastq.gz' --fasta /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa --gtf /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf --star_index /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/ --bed12 /media/jhuang/Smarty/ref/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed --singleEnd -profile standard --aligner star --saveReference -resume --saveAlignedIntermediates --skip_rseqc --skip_dupradar --skip_genebody_coverage --skip_preseq --skip_edger --fcGroupFeaturesType gene_name
rsync -a -P jhuang@hamm:~/REFs/Homo_sapiens/UCSC/hg38 .
#cut -f2- merged_gene_counts.txt > merged_gene_counts_2.txt
library("AnnotationDbi")
library("clusterProfiler")
library("ReactomePA")
library("org.Hs.eg.db")
library(DESeq2)
library(gplots)
setwd("/media/jhuang/Elements1/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/featureCounts_together_d8_2")
d.raw<- read.delim2("merged_gene_counts_2.txt",sep="\t", header=TRUE, row.names=1)
[1] K331A_DonorIAligned.sortedByCoord.out.bam
[2] V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
[3] V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
[4] V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
[5] V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
[6] V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
[7] K331A_DonorIIAligned.sortedByCoord.out.bam
[8] V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
[9] V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
[10] V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
[11] V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
[12] V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
[13] V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
[14] V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
#replace p600 to control, p602 to LT, p605 to LTtr
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LT_d8","LT_d8", "LTtr_d8","LTtr_d8", "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII"))
#NOTE_3D_PCA (IMPORTANT): should always keep the name 'DI' and 'DII' as donor names; they are important for 3D PCA drawing!
#NOTE2_3D_PCA: correct the condition names: control_d8-->control d8, K331A_d8-->K331A d8, LT_d8-->LT d8, LTtr_d8-->LTtr d8.
donor = as.factor(c("DI","DII", "DI","DII", "DI","DII", "DI","DII"))
#Note that we need selected_reordered.raw
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
> sizeFactors(dds)
NULL
> dds <- estimateSizeFactors(dds)
> sizeFactors(dds)
ctrl_d3_DonorI ctrl_d3_DonorII *ctrl_d8_DonorI *ctrl_d8_DonorII LT_d3_DonorI
1.2221026 0.8126614 1.0631329 0.6271887 0.8553245
LT_d3_DonorII *LT_d8_DonorI *LT_d8_DonorII LTtr_d3_DonorI LTtr_d3_DonorII
1.0386420 1.0324488 1.3568818 1.3070621 0.8283129
*LTtr_d8_DonorI *LTtr_d8_DonorII *K331A_DonorI *K331A_DonorII
1.6206130 0.8352236 1.6908424 0.6012788
ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII LTtr_d8_DonorI LTtr_d8_DonorII K331A_DonorI K331A_DonorII
1.0429778 0.6131069 1.0162992 1.3406000 1.5984592 0.8268460 1.6634451 0.5943143
> sizeFactors(dds)
ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII
1.0429778 0.6131069 1.0162992 1.3406000
LTtr_d8_DonorI LTtr_d8_DonorII K331A_d8_DonorI K331A_d8_DonorII
1.5984592 0.8268460 1.6634451 0.5943143
./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam
./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam
./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIAligned.sortedByCoord.out.bam
./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam
bamCoverage --bam ./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.8182619037059573 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.230524791752137 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9406161731990421 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.5944164810367278 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorI_norm.bw --binSize 10 --scaleFactor 1.1691469144166922 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorII_norm.bw --binSize 10 --scaleFactor 0.9627956504743693 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9685710322875091 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorII_norm.bw --binSize 10 --scaleFactor 0.7369838699288324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.7650745897995206 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.2072732417906324 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.617050461769713 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.1972841763570858 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorI_norm.bw --binSize 10 --scaleFactor 0.5914211756222816 --effectiveGenomeSize 2913022398
bamCoverage --bam ./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorII_norm.bw --binSize 10 --scaleFactor 1.6631219993121327 --effectiveGenomeSize 2913022398
#END
rld <- rlogTransformation(dds)
library(ggplot2)
svg("pca6.svg")
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
geom_point(size=3) +
scale_color_manual(values = c("control" = "grey",
"K331A"="#1f78b4")) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
dev.off()
png("pca.png", width=800, height=800)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
percentVar <- round(100 * attr(data, "percentVar"))
ggplot(data, aes(PC1, PC2, color=condition, shape=donor)) +
geom_point(size=3) +
scale_color_manual(values = c("control" = "grey",
"K331A"="#1f78b4")) +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
coord_fixed()
dev.off()
png("pca2.png", 1200, 800)
plotPCA(rld, intgroup=c("condition"))
dev.off()
library(ggplot2)
data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE)
write.csv(data, file="plotPCA_data.csv")
#calculate all PCs including PC3 with the following codes
library(genefilter)
ntop <- 500
rv <- rowVars(assay(rld))
select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))]
mat <- t( assay(rld)[select, ] )
pc <- prcomp(mat)
pc$x[,1:3]
#df_pc <- data.frame(pc$x[,1:3])
df_pc <- data.frame(pc$x)
identical(rownames(data), rownames(df_pc)) #-->TRUE
## define the desired order of row names
#desired_order <- rownames(data)
## sort the data frame by the desired order of row names
#df <- df[match(desired_order, rownames(df_pc)), ]
data$PC1 <- NULL
data$PC2 <- NULL
merged_df <- merge(data, df_pc, by = "row.names")
#merged_df <- merged_df[, -1]
row.names(merged_df) <- merged_df$Row.names
merged_df$Row.names <- NULL # remove the "name" column
merged_df$name <- NULL
merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
#results in 26PCs: merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")]
write.csv(merged_df, file="merged_df_8PCs.csv")
#> summary(pc) #0.5657 0.2195 0.1347 --> 0.57 0.22 0.13
#Importance of components:
# PC1 PC2 PC3 PC4 PC5 PC6 PC7
#Standard deviation 19.4051 12.0878 9.4683 4.5569 4.01016 3.00610 2.71918
#Proportion of Variance 0.5657 0.2195 0.1347 0.0312 0.02416 0.01358 0.01111
#Cumulative Proportion 0.5657 0.7853 0.9200 0.9512 0.97531 0.98889 1.00000
#---- * to untreated ----
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8", "LT_d8_vs_control_d8", "LTtr_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#cd ${comp}_output; \
#~/Tools/csv2xls-0.4/csv_to_xls.py upregulated_filtered downregulated_filtered background -d$',' -o ../${comp}_degenes.xls; \
#cd ..; \
~/Tools/csv2xls-0.4/csv_to_xls.py \
K331A_d8_vs_control_d8-all.txt \
K331A_d8_vs_control_d8-up.txt \
K331A_d8_vs_control_d8-down.txt \
-d$',' -o K331A_d8_vs_control_d8.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
LT_d8_vs_control_d8-all.txt \
LT_d8_vs_control_d8-up.txt \
LT_d8_vs_control_d8-down.txt \
-d$',' -o LT_d8_vs_control_d8.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
LTtr_d8_vs_control_d8-all.txt \
LTtr_d8_vs_control_d8-up.txt \
LTtr_d8_vs_control_d8-down.txt \
-d$',' -o LTtr_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt
# 292 LTtr_d8_vs_control_d8-up.txt
# 1382 total
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt
# 80 LTtr_d8_vs_control_d8-down.txt
# 544 total
library(ggrepel)
geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
up1 <-read.csv(file="./K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="./LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="./LTtr_d8_vs_control_d8-up.txt")
down1 <-read.csv(file="./K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="./LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="./LTtr_d8_vs_control_d8-down.txt")
RNASeq.NoCellLine <- assay(rld)
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL, up3$SYMBOL,down3$SYMBOL, up5$SYMBOL,down5$SYMBOL) #1920
all_genes <- unique(all_genes) #1223
RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.4) #1.08: 3 clusters;
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
write.csv(names(subset(mycl, mycl == '4')),file='cluster4_DARKMAGENTA.txt')
write.csv(names(subset(mycl, mycl == '5')),file='cluster5_DARKCYAN.txt')
~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
cluster4_DARKMAGENTA.txt \
cluster5_DARKCYAN.txt \
-d$',' -o gene_clusters.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;
# ------------ LT (p602) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LT_d8","LT_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d8_DonorI","LT_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII LT_d8_DonorI LT_d8_DonorII
# 1.0879047 0.6411992 1.0519079 1.3937541
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LT_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LT_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LT_d8_vs_control_d8-all.txt \
#LT_d8_vs_control_d8-up.txt \
#LT_d8_vs_control_d8-down.txt \
#-d$',' -o LT_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt
library(ggrepel)
geness_res <- read.csv(file = paste("LT_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LT_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------ LTtr (p605) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "LTtr_d8","LTtr_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "LTtr_d8_DonorI","LTtr_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII LTtr_d8_DonorI LTtr_d8_DonorII
# 1.0990996 0.6476647 1.6536010 0.8632049
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_LTtr_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("LTtr_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#LTtr_d8_vs_control_d8-all.txt \
#LTtr_d8_vs_control_d8-up.txt \
#LTtr_d8_vs_control_d8-down.txt \
#-d$',' -o LTtr_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt --> 380
# 323 K331A_d8_vs_control_d8-down.txt
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt --> 136
library(ggrepel)
geness_res <- read.csv(file = paste("LTtr_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("LTtr_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------ LT K331A (p783) vs ctrl (p600) d8 ------------
colnames(d.raw)<- c("K331A_d8_DonorI","ctrl_d8_DonorI","ctrl_d8_DonorII","LT_d3_DonorII","ctrl_d3_DonorI","LT_d3_DonorI","K331A_d8_DonorII","LTtr_d3_DonorI","LT_d8_DonorI","ctrl_d3_DonorII","LT_d8_DonorII","LTtr_d8_DonorII","LTtr_d3_DonorII","LTtr_d8_DonorI") #26485
col_order <- c("ctrl_d3_DonorI","ctrl_d3_DonorII","ctrl_d8_DonorI","ctrl_d8_DonorII", "LT_d3_DonorI","LT_d3_DonorII","LT_d8_DonorI","LT_d8_DonorII", "LTtr_d3_DonorI","LTtr_d3_DonorII","LTtr_d8_DonorI","LTtr_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")
reordered.raw <- d.raw[,col_order]
#selected all columns with "d8"
selected_reordered.raw <- reordered.raw[, c("ctrl_d8_DonorI","ctrl_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII")]
condition = as.factor(c("control_d8","control_d8", "K331A_d8","K331A_d8"))
ids = as.factor(c("ctrl_d8_DonorI","ctrl_d8_DonorII", "K331A_d8_DonorI","K331A_d8_DonorII"))
donor = as.factor(c("DI","DII", "DI","DII"))
cData = data.frame(row.names=colnames(selected_reordered.raw), condition=condition, donor=donor, ids=ids)
dds<-DESeqDataSetFromMatrix(countData=selected_reordered.raw, colData=cData, design=~donor+condition) #batch+
#> sizeFactors(dds)
#NULL
dds <- estimateSizeFactors(dds)
sizeFactors(dds)
# ctrl_d8_DonorI ctrl_d8_DonorII K331A_d8_DonorI K331A_d8_DonorII
# 1.1747544 0.6871283 1.8656332 0.6817289
rld <- rlogTransformation(dds)
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_RNAseq_K331A_hg38/featureCounts_K331A_vs_ctrl_d8")
dds$condition <- relevel(dds$condition, "control_d8")
dds = DESeq(dds, betaPrior=FALSE)
resultsNames(dds)
clist <- c("K331A_d8_vs_control_d8")
for (i in clist) {
contrast = paste("condition", i, sep="_")
res = results(dds, name=contrast)
res <- res[!is.na(res$log2FoldChange),]
geness <- AnnotationDbi::select(org.Hs.eg.db, keys = rownames(res), keytype = "SYMBOL", columns = c("ENSEMBL", "ENTREZID", "SYMBOL", "GENENAME"))
geness <- geness[!duplicated(geness$SYMBOL), ]
res$SYMBOL = rownames(res)
rownames(geness) <- geness$SYMBOL
identical(rownames(res), rownames(geness))
res_df <- as.data.frame(res)
geness_res <- merge(geness, res_df)
dim(geness_res)
geness_res$padj <- ifelse(is.na(geness_res$padj), 1, geness_res$padj)
geness_res$pvalue <- ifelse(is.na(geness_res$pvalue), 1, geness_res$pvalue)
write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-"))
#write.csv(res_df, file = paste(i, "background.txt", sep="_"))
up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2)
down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2)
write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-"))
write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-"))
}
#~/Tools/csv2xls-0.4/csv_to_xls.py \
#K331A_d8_vs_control_d8-all.txt \
#K331A_d8_vs_control_d8-up.txt \
#K331A_d8_vs_control_d8-down.txt \
#-d$',' -o K331A_d8_vs_control_d8.xls;
# 719 K331A_d8_vs_control_d8-up.txt --> 315
# 371 LT_d8_vs_control_d8-up.txt --> 399
# 292 LTtr_d8_vs_control_d8-up.txt --> 380
# 323 K331A_d8_vs_control_d8-down.txt --> 277
# 141 LT_d8_vs_control_d8-down.txt --> 184
# 80 LTtr_d8_vs_control_d8-down.txt --> 136
library(ggrepel)
geness_res <- read.csv(file = paste("K331A_d8_vs_control_d8", "all.txt", sep="-"), row.names=1)
geness_res$Color <- "NS or log2FC < 2.0"
geness_res$Color[geness_res$pvalue < 0.05] <- "P < 0.05"
geness_res$Color[geness_res$padj < 0.05] <- "P-adj < 0.05"
geness_res$Color[geness_res$padj < 0.001] <- "P-adj < 0.001"
geness_res$Color[abs(geness_res$log2FoldChange) < 2.0] <- "NS or log2FC < 2.0"
geness_res$Color <- factor(geness_res$Color,
levels = c("NS or log2FC < 2.0", "P < 0.05",
"P-adj < 0.05", "P-adj < 0.001"))
geness_res$invert_P <- (-log10(geness_res$pvalue)) * sign(geness_res$log2FoldChange)
top_g <- c()
top_g <- c(top_g, geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:200]], geness_res[, 'SYMBOL'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:200]])
top_g <- unique(top_g)
geness_res <- geness_res[, -1*ncol(geness_res)] #remove invert_P from matrix
png("K331A_d8_vs_control_d8.png",width=1400, height=1000)
ggplot(geness_res,
aes(x = log2FoldChange, y = -log10(pvalue),
color = Color, label = SYMBOL)) +
geom_vline(xintercept = c(2.0, -2.0), lty = "dashed") +
geom_hline(yintercept = -log10(0.05), lty = "dashed") +
geom_point() +
labs(x = "log2(FC)",
y = "Significance, -log10(P)",
color = "Significance") +
scale_color_manual(values = c(`P-adj < 0.001` = "dodgerblue",
`P-adj < 0.05` = "lightblue",
`P < 0.05` = "orange2",
`NS or log2FC < 2.0` = "gray"),
guide = guide_legend(override.aes = list(size = 4))) +
scale_y_continuous(expand = expansion(mult = c(0,0.05))) +
geom_text_repel(data = subset(geness_res, SYMBOL %in% top_g & pvalue < 0.05 & (abs(geness_res$log2FoldChange) >= 2.0)),
size = 4, point.padding = 0.15, color = "black",
min.segment.length = .1, box.padding = .2, lwd = 2) +
theme_bw(base_size = 16) +
theme(legend.position = "bottom")
dev.off()
# ------------------- heatmap for separate analyses ----------------
setwd("~/DATA/Data_Denise_LT_DNA_Bindung/results_rnaseq_K331A_hg38/heatmap_separate_analyses")
up1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-up.txt")
up3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-up.txt")
up5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-up.txt")
down1 <-read.csv(file="../featureCounts_K331A_vs_ctrl_d8_csv/K331A_d8_vs_control_d8-down.txt")
down3 <-read.csv(file="../featureCounts_LT_vs_ctrl_d8_csv/LT_d8_vs_control_d8-down.txt")
down5 <-read.csv(file="../featureCounts_LTtr_vs_ctrl_d8_csv/LTtr_d8_vs_control_d8-down.txt")
RNASeq.NoCellLine <- assay(rld) # using the overview analysis, only choose the gene ids between conditions seperately. NOT DONE!
#Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions.
all_genes <- c(up1$SYMBOL,down1$SYMBOL, up3$SYMBOL,down3$SYMBOL, up5$SYMBOL,down5$SYMBOL) #1685
all_genes <- unique(all_genes) #1685-->1016
RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes,]
write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt")
##### STEP4: clustering the genes and draw heatmap #####
#clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman
datamat = RNASeq.NoCellLine_
hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete")
hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete")
mycl = cutree(hr, h=max(hr$height)/1.08)
mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN");
mycol = mycol[as.vector(mycl)]
#sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_))
#names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2")
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE'
#sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN'
png("DEGs_heatmap.png", width=900, height=1010)
heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row',
scale='row',trace='none',col=bluered(75),
RowSideColors = mycol, labRow="", srtCol=15, keysize=0.72)
dev.off()
#### cluster members #####
write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt')
write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt')
write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt')
~/Tools/csv2xls-0.4/csv_to_xls.py \
cluster1_YELLOW.txt \
cluster2_DARKBLUE.txt \
cluster3_DARKORANGE.txt \
-d$',' -o gene_culsters.xls;
~/Tools/csv2xls-0.4/csv_to_xls.py \
significant_gene_expressions.txt \
-d',' -o DEGs_heatmap_data.xls;
Author Archives: gene_x
Nextflow RNAseq
- Merge re-sequenced FastQ files (cat)
- Sub-sample FastQ files and auto-infer strandedness (fq, Salmon)
- Read QC (FastQC)
- UMI extraction (UMI-tools)
- Adapter and quality trimming (Trim Galore!)
- Removal of genome contaminants (BBSplit)
- Removal of ribosomal RNA (SortMeRNA)
- Choice of multiple alignment and quantification routes: STAR -> Salmon STAR -> RSEM HiSAT2 -> NO QUANTIFICATION
- Sort and index alignments (SAMtools)
- UMI-based deduplication (UMI-tools)
- Duplicate read marking (picard MarkDuplicates)
- Transcript assembly and quantification (StringTie)
- Create bigWig coverage files (BEDTools, bedGraphToBigWig)
- Extensive quality control: RSeQC Qualimap dupRadar Preseq DESeq2
- Pseudo-alignment and quantification (Salmon; optional)
- Present QC for raw read, alignment, gene biotype, sample similarity, and strand-specificity checks (MultiQC, R)
umi-tools extract:
Flexible removal of UMI sequences from fastq reads.
UMIs are removed and appended to the read name. Any other barcode, for example a library barcode, is left on the read. Can also filter reads by quality or against a whitelist (see above)
The remaining commands, group, dedup and count/count_tab, are used to identify PCR duplicates using the UMIs and perform different levels of analysis depending on the needs of the user. A number of different UMI deduplication schemes are enabled - The recommended method is directional.
umi-tools dedup:
Groups PCR duplicates and deduplicates reads to yield one read per group
Use this when you want to remove the PCR duplicates prior to any downstream analysis
Introducing BBSplit: Read Binning Tool for Metagenomes and Contaminated Libraries
Removal of genome contaminants (BBSplit)
Removal of ribosomal RNA
StringTie for Transcript assembly and quantification
Extensive quality control:
The preseq package is aimed at predicting and estimating the complexity of a genomic sequencing library,
Herpes-virus is double-stranded RNA.
Herpes-simplex-Viren (humane Herpesviren Typ 1 und 2) sind häufige Ursachen rezidivierender Infektionen mit Beteiligung von Haut, Mund, Lippen, Augen und …
Salmon used expectation–maximization (EM) algorithm to assign reads if two copy of genes occurs.
Reproduce the plots in the ‘Methods’ part
-
a dashed vertical line dividing two sets of rectangular bars.
library(ggplot2) # Install and load the ggplot2 package install.packages("ggplot2") library(ggplot2) # Create a mock dataset for the left visualization left_data <- data.frame( group = c(rep("A", 5), rep("B", 3)), value = c(1, 2, 3, 4, 5, 6, 7, 8) ) left_plot <- ggplot(left_data, aes(x=group, y=value)) + geom_col(position="dodge") + geom_vline(xintercept=1.5, linetype="dashed") + theme_minimal() # Create a mock dataset for the right visualization right_data <- data.frame( x = 1:20, y = rnorm(20, 10, 5), group = sample(c("red", "blue"), 20, replace=TRUE) ) right_plot <- ggplot(right_data, aes(x=x, y=y)) + geom_col(aes(fill="blue"), width=0.5) + geom_point(aes(color=group), size=4, position=position_jitter(width=0.3, height=0)) + scale_color_manual(values=c("red"="red", "blue"="blue")) + theme_minimal() # Combine and display the plots library(gridExtra) png("plot1.png", width=800, height=800) grid.arrange(left_plot, right_plot, ncol=2) dev.off()
-
a bar plot with overlaid scatter points in two colors.
Compressing Y-Axis Ranges in Volcano Plots (火山图)
Volcano plots are graphical representations of the significance versus fold-change for values obtained from a particular data analysis. They are especially useful in high-throughput studies, such as genomics, to visually represent the relationship between statistical significance and the magnitude of change. However, in some datasets, extreme values might stretch the scale of the plot, making it difficult to discern data points in regions of particular interest. By introducing compressed ranges, we can “squeeze” a specific range of values into a shorter y-axis scale, allowing for clearer visualization of data points within that range, without losing the overall context of the entire dataset. This method retains the informative nature of the volcano plot while emphasizing specific data regions of interest.
火山图是表示从特定数据分析获得的显著性与倍数变化的图形表示。它们在高通量研究中,如基因组学,特别有用,用于直观地表示统计显著性和变化幅度之间的关系。然而,在某些数据集中,极端值可能会拉长图的比例,使得在特定感兴趣的区域难以辨别数据点。通过引入压缩范围,我们可以将特定范围的值“挤压”到较短的y轴比例中,从而更清晰地显示该范围内的数据点,同时不失去整个数据集的整体背景。这种方法保留了火山图的信息性,同时强调了特定的感兴趣的数据区域。
library(ggplot2)
library(ggrepel)
setwd("~/DATA/Data_Anastasia_RNASeq_PUBLISHING/results/featureCounts/degenes_publishing_compressed_ranges")
#---------------- K3R_24h ----------------
geness_res <- read.csv(file = "K3R_24h.csv", sep="\t", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c("GNPDA1", "POR", "GEM", "SPOCD1", "DUSP10", "DUSP12", "SLC16A6", "HLA-B", "LSS", "SMPD1", "NDRG1", "USP2")
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(35, 60)
compressed_range <- c(35.0, 36.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 30, by=10)
y_breaks_compressed <- c(35.0, 36.0)
y_breaks_above <- c()
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 30, by=10)
y_labels_compressed <- c(35, 60)
y_labels_above <- c()
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("K3R_24h.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- wt_24h ----------------
geness_res <- read.csv(file = "wt_24h.csv", sep="\t", row.names=1)
#K3R_24h.csv
#wt_24h.csv
#wt_3+21h.csv
#K3R_3+21h.csv
#geness_res <- read.csv(file = "K3R_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
#WT_24hdox_vs_control-all.txt
#K3R_24hdox_vs_control-all.txt
#WT_3hdox21hchase_vs_control-all.txt
#K3R_3hdox21hchase_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c("TRIM63", "ATP6V0D2", "SULT1C2", "UPP1", "SLC16A6", "RRAGD", "HEXA-AS1", "IL6R", "GPNMB", "PRKAG2", "STS", "RASGRP3", "KCNAB2", "PLEKHM1", "SLC39A1", "USP2", "ADGRG1", "SPRING1", "FNIP2", "CTNS", "CTSD", "NDRG1", "BHLHE40", "VPS18", "PLIN2", "ASAH1", "PPP1R3B", "SNX8", "GEM", "AMDHD2", "PER3", "PFKFB2", "LSS", "WBP2", "BRI3", "GRN", "TPRA1", "ATP6V1D", "ATP6V1B2", "ATP6V1C1", "TMC6", "SLC26A11", "GNPDA1", "EPG5", "GNA13", "VAT1", "DUSP3", "ATP6V1H", "KIAA0930", "ATP6V0D1", "HMOX1", "RRAGC", "SLC25A13", "ATP6V0A1", "WDR81", "DUSP10", "CSTB", "FLCN", "PPARGC1A", "TOM1", "MTM1", "NEU1", "ZFYVE26", "ZCCHC8", "ATP6V1F", "CD63", "CTSB", "PEA15", "NRBF2", "CLCN7", "PGAP6", "PSAP", "GLA", "SGSH", "S100A6", "PIP4K2C", "SLC38A7", "GLMP", "GSTO1", "THAP8", "MCOLN1", "POR", "SNX16", "M6PR", "USP32", "VPS8", "LONP1", "GPX1", "MVP", "UGDH", "CC2D1B", "CTSA", "GBA", "RBM19", "LACTB2", "UTP18", "AP5Z1", "NAGLU", "EIF4B", "SOCS5", "HLA-B") #for wt_24h.png
#predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(100, 175)
compressed_range <- c(100.0, 104.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 100, by=25)
y_breaks_compressed <- c(104.0) # We already have 35 in y_breaks_below
#y_breaks_above <- if (max(geness_res$adjusted_pvalue, na.rm = TRUE)+5 < 63) {
# seq(60, max(geness_res$adjusted_pvalue, na.rm = TRUE), by=-5)
# } else {
# seq(63, max(geness_res$adjusted_pvalue, na.rm = TRUE)+5, by=5)
# }
y_breaks_above <- c(129.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 100, by=25)
y_labels_compressed <- c(175) # We already have 35 in y_labels_below
#y_labels_above <- if (max(geness_res$adjusted_pvalue, na.rm = TRUE)+5 < 63) {
# seq(60, max(geness_res$adjusted_pvalue, na.rm = TRUE), by=-5)
# } else {
# seq(63, max(geness_res$adjusted_pvalue, na.rm = TRUE)+5, by=5)
# }
y_labels_above <- c(200)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("wt_24h.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- wt_3+21h ... ----------------
geness_res <- read.csv(file = "wt_3+21h.csv", sep="\t", row.names=1)
#wt_3+21h.csv
#K3R_3+21h.csv
#geness_res <- read.csv(file = "K3R_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
#WT_24hdox_vs_control-all.txt
#K3R_24hdox_vs_control-all.txt
#WT_3hdox21hchase_vs_control-all.txt
#K3R_3hdox21hchase_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(50, 100)
compressed_range <- c(50.0, 52.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 50, by=25)
y_breaks_compressed <- c(52.0) # We already have 35 in y_breaks_below
y_breaks_above <- c(77.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 50, by=25)
y_labels_compressed <- c(100) # We already have 35 in y_labels_below
y_labels_above <- c(125)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("wt_3+21h.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- K3R_3+21h ----------------
geness_res <- read.csv(file = "K3R_3+21h.csv", sep="\t", row.names=1)
#geness_res <- read.csv(file = "K3R_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
#WT_24hdox_vs_control-all.txt
#K3R_24hdox_vs_control-all.txt
#WT_3hdox21hchase_vs_control-all.txt
#K3R_3hdox21hchase_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(15, 25)
compressed_range <- c(15.0, 16.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed <- c(16.0)
y_breaks_above <- c(21.0)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed <- c(25)
y_labels_above <- c(30)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("K3R_3+21h.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- wt_3+21h_vs_control ----------------
geness_res <- read.csv(file = "WT_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
#K3R_24hdox_vs_control-all.txt
#WT_3hdox21hchase_vs_control-all.txt
#K3R_3hdox21hchase_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(150, 250)
compressed_range <- c(150.0, 154.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 150, by=50)
y_breaks_compressed <- c(154.0)
y_breaks_above <- c()
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 150, by=50)
y_labels_compressed <- c(250)
y_labels_above <- c()
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("wt_3+21h_vs_control.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- K3R_3+21h_vs_control ----------------
geness_res <- read.csv(file = "K3R_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
#K3R_24hdox_vs_control-all.txt
#WT_24hdox_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(40, 65)
compressed_range <- c(40.0, 43.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 40, by=20)
y_breaks_compressed <- c(43.0)
y_breaks_above <- c()
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 40, by=20)
y_labels_compressed <- c(65)
y_labels_above <- c()
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("K3R_3+21h_vs_control.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- wt_24h_vs_control ----------------
geness_res <- read.csv(file = "WT_24hdox_vs_control-all.txt", sep=",", row.names=1)
#K3R_24hdox_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(130, 180)
compressed_range <- c(130.0, 134.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 100, by=50)
y_breaks_compressed <- c(130.0, 134.0)
y_breaks_above <- c(154)
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 100, by=50)
y_labels_compressed <- c(130, 180)
y_labels_above <- c(200)
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("wt_24h_vs_control.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- K3R_24h_vs_control ----------------
geness_res <- read.csv(file = "K3R_24hdox_vs_control-all.txt", sep=",", row.names=1)
#K3R_24hdox_vs_control-all.txt
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges
original_range <- c(60, 180)
compressed_range <- c(60.0, 64.0)
# Adjust the p-values based on the ranges
geness_res$adjusted_pvalue <- with(geness_res,
ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2],
((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1],
ifelse(-log10(padj) > original_range[2],
-log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]),
-log10(padj))))
# Calculate breaks for the y-axis
y_breaks_below <- seq(0, 60, by=25)
y_breaks_compressed <- c(60.0, 64.0)
y_breaks_above <- c()
y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above)
y_labels_below <- seq(0, 60, by=25)
y_labels_compressed <- c(60, 180)
y_labels_above <- c()
y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above)
# Create the plot
png("K3R_24h_vs_control.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
#---------------- K3R_3+21h_vs_control (multiple compressed, NOT_FINISHED) ----------------
geness_res <- read.csv(file = "K3R_3hdox21hchase_vs_control-all.txt", sep=",", row.names=1)
# Color setting
geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 0.585, "gray",
ifelse(geness_res$log2FoldChange > 0, "red", "blue"))
# Predefined genes colored in green
predefined_genes <- c()
geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green"
geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange)
top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:100],
geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:100]))
# Define the original and compressed ranges for multiple regions
original_ranges <- list(c(40, 49), c(51, 57))
compressed_ranges <- list(c(40.0, 41.0), c(51.0, 52.0))
adjust_value <- function(padj_value, original_range, compressed_range) {
if (padj_value > original_range[1] && padj_value <= original_range[2]) {
return(((padj_value - original_range[1]) / (original_range[2] - original_range[1])) *
(compressed_range[2] - compressed_range[1]) + compressed_range[1])
} else if (padj_value > original_range[2]) {
return(padj_value - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]))
} else {
return(padj_value)
}
}
geness_res$adjusted_pvalue <- -log10(geness_res$padj)
for (i in 1:length(original_ranges)) {
geness_res$adjusted_pvalue <- mapply(adjust_value, geness_res$adjusted_pvalue,
MoreArgs = list(original_range = original_ranges[[i]],
compressed_range = compressed_ranges[[i]]))
}
# Calculate breaks and labels for the y-axis
y_breaks_below <- seq(0, 15, by=5)
y_breaks_compressed1 <- c(15.0, 16.0)
y_breaks_compressed2 <- c(35.0, 36.0)
y_breaks_above <- c(50)
y_breaks <- c(y_breaks_below, y_breaks_compressed1, y_breaks_compressed2, y_breaks_above)
y_labels_below <- seq(0, 15, by=5)
y_labels_compressed1 <- c(15, 25)
y_labels_compressed2 <- c(35, 45)
y_labels_above <- c(50)
y_labels <- c(y_labels_below, y_labels_compressed1, y_labels_compressed2, y_labels_above)
# Create the plot
png("K3R_3+21h_vs_control.png", width=1200, height=1200)
ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) +
geom_vline(xintercept = c(0.585, -0.585), lty = "dashed", size = 1.5) +
geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) +
geom_point(size = 3) +
labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") +
scale_color_identity() +
geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 0.585)),
size = 8,
fontface = "bold",
point.padding = 0.15,
color = "black",
min.segment.length = .1,
box.padding = .2,
lwd = 2) +
theme_bw(base_size = 28) +
theme(legend.position = "bottom") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_ranges[[1]][1], ymax = compressed_ranges[[1]][1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_ranges[[1]][2], ymax = compressed_ranges[[1]][2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_ranges[[1]][1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_ranges[[1]][2], label = "/", hjust = 0, size = 10) +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_ranges[[2]][1], ymax = compressed_ranges[[2]][1], linetype = "dashed", color = "grey") +
annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_ranges[[2]][2], ymax = compressed_ranges[[2]][2], linetype = "dashed", color = "grey") +
annotate("text", x = -Inf, y = compressed_ranges[[2]][1], label = "/", hjust = 0, size = 10) +
annotate("text", x = -Inf, y = compressed_ranges[[2]][2], label = "/", hjust = 0, size = 10) +
scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels))
dev.off()
RNAseq Analysis LT-K331A-LTtr-control
-
nextflow
(rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_align_intermeds --save_unaligned --aligner 'star_salmon' --skip_multiqc ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml multiqc -f --config multiqc_config.yaml . 2>&1 rm multiqc_config.yaml
-
construct DESeqDataSet
# Import the required libraries library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library(gplots) library(tximport) library(DESeq2) setwd("~/DATA/Data_Denise_LT_RNASeq/results_GRCh38/star_salmon") # Define paths to your Salmon output quantification files, quant.sf refers to tx-counts, later will be summaried as gene-counts. files <- c("control_DI" = "./control_d8_DI/quant.sf", "control_DII" = "./control_d8_DII/quant.sf", #"control_DII" = "./control_d8_DII_re/quant.sf", "LT_DI" = "./LT_d8_DI/quant.sf", "LT_DII" = "./LT_d8_DII/quant.sf", "LTtr_DI" = "./LTtr_d8_DI/quant.sf", "LTtr_DII" = "./LTtr_d8_DII/quant.sf", "LT_K331A_DI" = "./LT_K331A_d8_DI/quant.sf", #"LT_K331A_DII" = "./LT_K331A_d8_DII/quant.sf", "LT_K331A_DII" = "./LT_K331A_d8_DII_re/quant.sf") # ---- tx-level count data ---- # Import the transcript abundance data with tximport txi <- tximport(files, type = "salmon", txIn = TRUE, txOut = TRUE) # Define the replicates (or donors) and condition of the samples #donor <- factor(c("DI", "DII", "DII", "DI", "DII", "DI", "DII", "DI", "DII", "DII")) #batch <- factor(c("batch1", "batch1", "batch2", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2")) #condition <- factor(c("control", "control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A", "LT_K331A")) donor <- factor(c("DI", "DII", "DI", "DII", "DI", "DII", "DI", "DII")) batch <- factor(c("batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch1", "batch2")) condition <- factor(c("control", "control", "LT", "LT", "LTtr", "LTtr", "LT_K331A", "LT_K331A")) # Output raw count data to a CSV file write.csv(counts(dds), file="transcript_counts.csv") # ---- gene-level count data ---- # Read in the tx2gene map from salmon_tx2gene.tsv #tx2gene <- read.csv("salmon_tx2gene.tsv", sep="\t", header=FALSE) tx2gene <- read.table("salmon_tx2gene.tsv", header=FALSE, stringsAsFactors=FALSE) # Set the column names colnames(tx2gene) <- c("transcript_id", "gene_id", "gene_name") # Remove the gene_name column if not needed tx2gene <- tx2gene[,1:2] # Import and summarize the Salmon data with tximport txi <- tximport(files, type = "salmon", tx2gene = tx2gene, txOut = FALSE) # Continue with the DESeq2 workflow as before... colData <- data.frame(donor=donor, batch=batch, condition=condition, row.names=names(files)) dds <- DESeqDataSetFromTximport(txi, colData=colData, design=~donor+condition) #dds <- dds[rowSums(counts(dds) > 3) > 2, ] #60605-->26543 dds <- DESeq(dds) rld <- rlogTransformation(dds) write.csv(counts(dds, normalized=FALSE), file="gene_counts.csv") dim(counts(dds)) head(counts(dds), 10)
-
draw 3D PCA plots.
library(gplots) library("RColorBrewer") library(ggplot2) data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE) write.csv(data, file="plotPCA_data.csv") #calculate all PCs including PC3 with the following codes library(genefilter) ntop <- 500 rv <- rowVars(assay(rld)) select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] mat <- t( assay(rld)[select, ] ) pc <- prcomp(mat) pc$x[,1:3] #df_pc <- data.frame(pc$x[,1:3]) df_pc <- data.frame(pc$x) identical(rownames(data), rownames(df_pc)) #-->TRUE data$PC1 <- NULL data$PC2 <- NULL merged_df <- merge(data, df_pc, by = "row.names") #merged_df <- merged_df[, -1] row.names(merged_df) <- merged_df$Row.names merged_df$Row.names <- NULL # remove the "name" column merged_df$name <- NULL merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")] write.csv(merged_df, file="merged_df_8PCs.csv") summary(pc) #0.3311 0.2376 0.1247 #0.3637 0.2564 0.1184 --> 0.36, 026, 0,12 draw_3D.py
-
draw_3D.py
import plotly.graph_objects as go import pandas as pd from sklearn.decomposition import PCA import numpy as np from scipy.linalg import eigh, sqrtm # Read in data as a pandas dataframe #df = pd.DataFrame({ # 'PC1': [-13.999925, -12.504291, -12.443057, -13.065235, -17.316215], # 'PC2': [-1.498823, -3.342411, -6.067055, -8.205809, 3.293993], # 'PC3': [-3.335085, 15.207755, -14.725450, 15.078469, -6.917358], # 'condition': ['GFP d3', 'GFP d3', 'GFP d8', 'GFP d8', 'GFP+mCh d9/12'], # 'donor': ['DI', 'DII', 'DI', 'DII', 'DI'] #}) df = pd.read_csv('merged_df_8PCs.csv', index_col=0, header=0) df['condition'] = df['condition'].replace("control", "ctrl d8") df['condition'] = df['condition'].replace("LTtr", "LTtr d8") df['condition'] = df['condition'].replace("LT", "LT d8") df['condition'] = df['condition'].replace("LT_K331A", "LT K331A d8") # Fit PCA model to reduce data dimensions to 3 pca = PCA(n_components=3) pca.fit(df.iloc[:, :-3]) X_reduced = pca.transform(df.iloc[:, :-3]) # Add reduced data back to dataframe df['PC1'] = X_reduced[:, 0] df['PC2'] = X_reduced[:, 1] df['PC3'] = X_reduced[:, 2] # Create PCA plot with 3D scatter fig = go.Figure() #['circle', 'circle-open', 'square', 'square-open', 'diamond', 'diamond-open', 'cross', 'x'] # if donor == 'DI' else marker=dict(size=2, opacity=0.8, color=condition_color, symbol=donor_symbol) #decrease diamond size to 6 while keep the circle as size 10 in the following code: #'rgb(128, 150, 128)' #I need three families of colors, always from light to deep, the first one should close to grey. #the first serie for 'ctrl LTtr+sT d9/12', 'LTtr+sT d9/12' #the second serie for 'ctrl LT/LTtr d3', 'ctrl LT/LTtr d8', 'LT d3', 'LT d8', 'LTtr d3', 'LTtr d8' #the third serie for 'ctrl sT d3', 'ctrl sT d8', 'sT d3', 'sT d8', 'sT+LT d3' condition_color_map_untreated = {'untreated':'black'} donor_symbol_map_untreated = {'DI': 'circle-open', 'DII': 'diamond-open'} #condition_color_map = {'ctrl LTtr+sT d9/12': 'green', 'GFP d3': 'blue', 'GFP d8': 'red', 'GFP+mCh d9/12': 'green', 'LT d3': 'orange'} condition_color_map = { 'ctrl d8': '#A9A9A9', 'LT d8': '#a6cee3', 'LT K331A d8': '#1f78b4', 'LTtr d8': '#e31a1c' } donor_symbol_map = {'DI': 'circle', 'DII': 'diamond'} for donor, donor_symbol in donor_symbol_map_untreated.items(): for condition, condition_color in condition_color_map_untreated.items(): mask = (df['condition'] == condition) & (df['donor'] == donor) fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'], mode='markers', name=f'{condition}' if donor == 'DI' else None, legendgroup=f'{condition}', showlegend=True if donor == 'DI' else False, marker=dict(size=4 if donor_symbol in ['diamond-open'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol))) for donor, donor_symbol in donor_symbol_map.items(): for condition, condition_color in condition_color_map.items(): mask = (df['condition'] == condition) & (df['donor'] == donor) fig.add_trace(go.Scatter3d(x=df.loc[mask, 'PC1'], y=df.loc[mask, 'PC2'], z=df.loc[mask, 'PC3'], mode='markers', name=f'{condition}' if donor == 'DI' else None, legendgroup=f'{condition}', showlegend=True if donor == 'DI' else False, marker=dict(size=4 if donor_symbol in ['diamond'] else 6, opacity=0.8, color=condition_color, symbol=donor_symbol))) for donor, donor_symbol in donor_symbol_map.items(): fig.add_trace(go.Scatter3d(x=[None], y=[None], z=[None], mode='markers', name=donor, legendgroup=f'{donor}', showlegend=True, marker=dict(size=6, opacity=1, color='black', symbol=donor_symbol), hoverinfo='none')) # Annotations for the legend blocks fig.update_layout( annotations=[ dict(x=1.1, y=1.0, xref='paper', yref='paper', showarrow=False, text='Condition', font=dict(size=15)), dict(x=1.1, y=0.6, xref='paper', yref='paper', showarrow=False, text='Donor', font=dict(size=15)), dict(x=1.08, y=0.2, xref='paper', yref='paper', showarrow=False, text='PC3: 12% variance', font=dict(size=15), textangle=-90) ], scene=dict( xaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC1: 36% variance'), yaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title='PC2: 26% v.'), zaxis=dict(gridcolor='lightgrey', linewidth=2, linecolor='black', backgroundcolor='white', zerolinecolor='black', zerolinewidth=2, title=''), bgcolor='white' ), margin=dict(l=0, r=0, b=0, t=0) # Adjust the margins to prevent clipping of axis titles ) #0.3311 0.2376 0.1247 #summary(pc) #--> Proportion of Variance 0.3647 0.1731 0.1515 #percentVar <- round(100 * attr(data, "percentVar")) #percentVar <- c(36,17,15) #fig.show() fig.write_image("PCA_3D.svg")
-
draw 2D PCA plots.
library(gplots) library("RColorBrewer") #mat <- assay(rld) #mm <- model.matrix(~condition, colData(rld)) #mat <- limma::removeBatchEffect(mat, batch=rld$batch, design=mm) #assay(rld) <- mat # -- pca -- png("pca.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) dev.off() # -- heatmap -- png("heatmap.png", 1200, 800) distsRL <- dist(t(assay(rld))) mat <- as.matrix(distsRL) hc <- hclust(distsRL) hmcol <- colorRampPalette(brewer.pal(9,"GnBu"))(100) heatmap.2(mat, Rowv=as.dendrogram(hc),symm=TRUE, trace="none",col = rev(hmcol), margin=c(13, 13)) dev.off()
-
differential expressions
dds$condition <- relevel(dds$condition, "control") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("LT_K331A_vs_control", "LT_vs_control", "LTtr_vs_control") dds$condition <- relevel(dds$condition, "LT") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("LT_K331A_vs_LT") library(dplyr) library(tidyverse) library(biomaRt) listEnsembl() listMarts() ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104") datasets <- listDatasets(ensembl) attributes = listAttributes(ensembl) attributes[1:25,] for (i in clist) { contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'), filters = 'ensembl_gene_id', values = rownames(res), mart = ensembl) geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE) res$ENSEMBL = rownames(res) identical(rownames(res), geness_uniq$ensembl_gene_id) res_df <- as.data.frame(res) geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL") dim(geness_res) rownames(geness_res) <- geness_res$ensembl_gene_id geness_res$ensembl_gene_id <- NULL write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2) down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) }
-
volcano plots with automatically finding top_g
#sorted_geness_res <- geness_res %>% arrange(desc(log2FoldChange)) #CSF3, ADORA2A, RPS26P6, EHF, TNFRSF6B #sorted_geness_res <- geness_res %>% arrange(log2FoldChange) library(ggrepel) geness_res <- read.csv(file = "LT_K331A_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:1000], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:1000])) # Define the original and compressed ranges original_range <- c(18, 30) compressed_range <- c(18.0, 22.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 15, by=5) y_breaks_compressed <- c(18.0, 22.0) y_breaks_above <- c(27.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 15, by=5) y_labels_compressed <- c(18, 30) y_labels_above <- c(35) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LT_K331A_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LT_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(18, 38) compressed_range <- c(18.0, 22.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 15, by=5) y_breaks_compressed <- c(18.0, 22.0) y_breaks_above <- c(27.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 15, by=5) y_labels_compressed <- c(18, 38) y_labels_above <- c(43) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LT_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LTtr_vs_control-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(15, 25) compressed_range <- c(15.0, 18.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Calculate breaks for the y-axis y_breaks_below <- seq(0, 10, by=5) y_breaks_compressed <- c(15.0, 18.0) y_breaks_above <- c(23.0) y_breaks <- c(y_breaks_below, y_breaks_compressed, y_breaks_above) y_labels_below <- seq(0, 10, by=5) y_labels_compressed <- c(15, 25) y_labels_above <- c(30) y_labels <- c(y_labels_below, y_labels_compressed, y_labels_above) # Create the plot png("LTtr_vs_control.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[1], ymax = compressed_range[1], linetype = "dashed", color = "grey") + annotate("rect", xmin = -Inf, xmax = Inf, ymin = compressed_range[2], ymax = compressed_range[2], linetype = "dashed", color = "grey") + annotate("text", x = -Inf, y = compressed_range[1], label = "/", hjust = 0, size = 10) + annotate("text", x = -Inf, y = compressed_range[2], label = "/", hjust = 0, size = 10) + scale_y_continuous(breaks = sort(y_breaks), labels = sort(y_labels)) dev.off() geness_res <- read.csv(file = "LT_K331A_vs_LT-all.txt", sep=",", row.names=1) # Color setting geness_res$Color <- ifelse(geness_res$padj > 0.05 | abs(geness_res$log2FoldChange) < 2, "gray", ifelse(geness_res$log2FoldChange > 0, "red", "blue")) # Predefined genes colored in green predefined_genes <- c() #for wt_3+21h.png, K3R_3+21h.png, and *_vs_control.png geness_res$Color[geness_res$external_gene_name %in% predefined_genes] <- "green" geness_res$invert_Padj <- (-log10(geness_res$padj)) * sign(geness_res$log2FoldChange) top_g <- unique(c(geness_res[order(geness_res$invert_Padj, decreasing = TRUE), 'external_gene_name'][1:400], geness_res[order(geness_res$invert_Padj, decreasing = FALSE), 'external_gene_name'][1:400])) # Define the original and compressed ranges original_range <- c(10, 11) compressed_range <- c(10.0, 11.0) # Adjust the p-values based on the ranges geness_res$adjusted_pvalue <- with(geness_res, ifelse(-log10(padj) > original_range[1] & -log10(padj) <= original_range[2], ((-log10(padj) - original_range[1]) / (original_range[2] - original_range[1])) * (compressed_range[2] - compressed_range[1]) + compressed_range[1], ifelse(-log10(padj) > original_range[2], -log10(padj) - (original_range[2] - original_range[1]) + (compressed_range[2] - compressed_range[1]), -log10(padj)))) # Create the plot png("LT_K331A_vs_LT.png", width=1000, height=1000) ggplot(geness_res, aes(x = log2FoldChange, y = adjusted_pvalue, color = Color, label = external_gene_name)) + geom_vline(xintercept = c(2, -2), lty = "dashed", size = 1.5) + geom_hline(yintercept = -log10(0.05), lty = "dashed", size = 1.5) + geom_point(size = 3) + labs(x = "log2(Fold change)", y = "-log10(P-adj)", color = "Significance") + scale_color_identity() + geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & padj < 0.05 & (abs(log2FoldChange) >= 2)), size = 6, point.padding = 0.15, color = "black", min.segment.length = .1, box.padding = .2, lwd = 2) + theme_bw(base_size = 22) + theme(legend.position = "bottom") dev.off() for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do echo "~/Tools/csv2xls-0.4/csv_to_xls.py ${i}-all.txt ${i}-up.txt ${i}-down.txt -d$',' -o ${i}.xls;" done
-
clustering the genes and draw heatmap
install.packages("gplots") library("gplots") for i in LT_K331A_vs_control LT_vs_control LTtr_vs_control LT_K331A_vs_LT; do echo "cut -d',' -f1-1 ${i}-up.txt > ${i}-up.id" echo "cut -d',' -f1-1 ${i}-down.txt > ${i}-down.id" done # 307 LT_K331A_vs_control-down.id # 667 LT_K331A_vs_control-up.id # 66 LT_K331A_vs_LT-down.id # 71 LT_K331A_vs_LT-up.id # 157 LTtr_vs_control-down.id # 484 LTtr_vs_control-up.id # 379 LT_vs_control-down.id # 749 LT_vs_control-up.id # 2880 total cat *.id | sort -u > ids #add Gene_Id in the first line, delete the "" GOI <- read.csv("ids")$Gene_Id #570 genes RNASeq.NoCellLine <- assay(rld) # Defining the custom order column_order <- c( "control_DI", "control_DII", "LTtr_DI", "LTtr_DII", "LT_DI", "LT_DII", "LT_K331A_DI", "LT_K331A_DII" ) RNASeq.NoCellLine_reordered <- RNASeq.NoCellLine[, column_order] head(RNASeq.NoCellLine_reordered) #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman datamat = RNASeq.NoCellLine_reordered[GOI, ] write.csv(as.data.frame(datamat), file ="DEGs_heatmap_data.csv") hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete") hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete") mycl = cutree(hr, h=max(hr$height)/1.05) mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN"); mycol = mycol[as.vector(mycl)] png("DEGs_heatmap.png", width=900, height=1010) heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row', scale='row',trace='none',col=bluered(75), RowSideColors = mycol, labRow="", srtCol=20, keysize=0.72, cexRow = 2, cexCol = 1.4) dev.off() # Extract rows from datamat where the row names match the identifiers in subset_1 #### cluster members ##### subset_1<-names(subset(mycl, mycl == '1')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_1, ]) #579 subset_2<-names(subset(mycl, mycl == '2')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_2, ]) #399 subset_3<-names(subset(mycl, mycl == '3')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_3, ]) #70 subset_4<-names(subset(mycl, mycl == '4')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_4, ]) #433 subset_5<-names(subset(mycl, mycl == '5')) data <- as.data.frame(datamat[rownames(datamat) %in% subset_5, ]) #140 # Initialize an empty data frame for the annotated data annotated_data <- data.frame() # Determine total number of genes total_genes <- length(rownames(data)) # Loop through each gene to annotate for (i in 1:total_genes) { gene <- rownames(data)[i] result <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'), filters = 'ensembl_gene_id', values = gene, mart = ensembl) # If multiple rows are returned, take the first one if (nrow(result) > 1) { result <- result[1, ] } # Check if the result is empty if (nrow(result) == 0) { result <- data.frame(ensembl_gene_id = gene, external_gene_name = NA, gene_biotype = NA, entrezgene_id = NA, chromosome_name = NA, start_position = NA, end_position = NA, strand = NA, description = NA) } # Transpose expression values expression_values <- t(data.frame(t(data[gene, ]))) colnames(expression_values) <- colnames(data) # Combine gene information and expression data combined_result <- cbind(result, expression_values) # Append to the final dataframe annotated_data <- rbind(annotated_data, combined_result) # Print progress every 100 genes if (i %% 100 == 0) { cat(sprintf("Processed gene %d out of %d\n", i, total_genes)) } } # Save the annotated data to a new CSV file write.csv(annotated_data, "cluster1_YELLOW.csv", row.names=FALSE) write.csv(annotated_data, "cluster2_DARKBLUE.csv", row.names=FALSE) write.csv(annotated_data, "cluster3_DARKORANGE.csv", row.names=FALSE) write.csv(annotated_data, "cluster4_DARKMAGENTA.csv", row.names=FALSE) write.csv(annotated_data, "cluster5_DARKCYAN.csv", row.names=FALSE) #~/Tools/csv2xls-0.4/csv_to_xls.py cluster*.csv -d',' -o DEGs_heatmap_clusters.xls
HSV Types and Toll-Like Receptors’ Role in Immunity
-
Herpes Simplex Virus 1 (HSV-1) and Herpes Simplex Virus 2 (HSV-2) are closely related viruses that belong to the Herpesviridae family. Both viruses have a similar genomic structure but differ in their DNA sequences, leading to different serological and biological properties. Here are some of the key genomic differences between them:
-
Genome Sequence Variation: While HSV-1 and HSV-2 share approximately 50% sequence homology at the nucleotide level, there are specific regions in their genomes that are highly divergent. This sequence variation underlies many of the biological and immunological differences between the two viruses.
-
Genome Size: Both HSV-1 and HSV-2 genomes are relatively large and complex, with each containing linear double-stranded DNA. Their genome sizes are very similar, with HSV-1 being approximately 152 kbp and HSV-2 being about 155 kbp.
-
Gene Number and Arrangement: Both viruses encode for more than 70 proteins. Although many of these genes are conserved between the two viruses and have similar functions, there are differences in the arrangement and expression of some genes that can contribute to the distinct properties of each virus.
-
Unique Short (US) and Unique Long (UL) Regions: Both viruses have regions in their genome termed the Unique Short (US) and Unique Long (UL) regions. These regions are flanked by inverted repeat sequences. The gene content of the US and UL regions differs between HSV-1 and HSV-2, which leads to differences in viral replication, virulence, and pathogenesis.
-
Latency-Associated Transcripts (LATs): Both HSV-1 and HSV-2 establish latent infections in sensory neurons. While the mechanisms of latency are not entirely understood, Latency-Associated Transcripts (LATs) play a crucial role. Although both viruses produce LATs, there are differences in their structures and possible functions between HSV-1 and HSV-2.
-
Immunological Cross-reactivity: Due to their genetic similarities, there is a degree of immunological cross-reactivity between HSV-1 and HSV-2. However, certain genomic regions and the proteins they encode are more specific to one type of virus, leading to distinct serological responses.
-
-
“Toll型受体”(Toll-like receptors,简称TLRs)是天然免疫系统中的关键组成部分,对于宿主防御外来病原体至关重要。以下是关于Toll型受体的更多详细信息:
-
定义:Toll型受体是一类在细胞表面和内质网上找到的受体蛋白,它们可以识别特定的病原体相关分子模式(PAMPs)和损伤相关分子模式(DAMPs)。
-
功能:当TLRs识别到PAMPs或DAMPs时,它们会激活信号通路,从而导致产生炎症细胞因子和其他免疫反应。这为机体提供了一种迅速反应的机制,帮助抵抗感染。
-
种类:人体内有多种Toll型受体,如TLR1至TLR10。每种TLR都对应于特定的PAMPs,因此可以识别和反应于不同的病原体。
-
分布:Toll型受体主要在免疫细胞(如巨噬细胞、树突状细胞)上表达,但也在其他细胞类型中找到。
-
疾病关联:TLRs在许多疾病中都有关联,包括自身免疫性疾病、感染性疾病、和一些癌症。TLR信号异常可能导致炎症反应的过度或减弱。
-
治疗潜力:由于TLRs在免疫应答中的关键作用,研究人员正在研究如何利用TLRs进行疾病治疗,例如开发TLR激动剂或抑制剂以调节免疫反应。
-
-
在人类单纯疱疹病毒(Herpes Simplex Virus,简称HSV)感染的背景下,Toll型受体(Toll-like receptors, TLRs)在早期的免疫应答中起到关键的作用。以下是关于HSV感染中TLRs的一些详细信息:
-
TLR2和TLR9的作用:在HSV感染中,特定的TLRs如TLR2和TLR9已被证实与病毒检测有关。例如,TLR2能够识别HSV的病毒膜蛋白,而TLR9则能够识别病毒的DNA。
-
信号转导:当TLRs被激活后,它们会触发信号通路,如MyD88依赖通路,进而促进细胞因子和化学趋化因子的产生。这有助于招募和激活免疫细胞来清除病毒。
-
免疫应答调节:TLRs激活后的信号转导也会调节自然杀伤细胞、T细胞和B细胞的免疫应答,这对于控制HSV感染和清除病毒很重要。
-
HSV的免疫逃逸策略:HSV已经进化出了多种策略来避免宿主的免疫应答,包括对TLR信号通路的干扰。例如,病毒蛋白ICP0可以阻止某些细胞因子的产生,这些细胞因子通常是通过TLR信号通路激活的。
-
治疗应用:考虑到TLRs在HSV感染的早期阶段的作用,它们可以被视为治疗的潜在目标。例如,利用TLR激动剂可以提高免疫系统对HSV的反应,这可能有助于疾病的治疗或预防。
-
-
人类单纯疱疹病毒(Herpes Simplex Virus,简称HSV)主要有两个亚型:
-
HSV-1 (Herpes Simplex Virus Type 1):通常称为口腔疱疹病毒。
- 主要感染部位:通常感染嘴巴、嘴唇、面部和眼睛,导致所谓的”冷疮”或”发烧疮”。
- 传播途径:主要通过接触受感染的皮肤、分泌物或物体(如唾液或通过接吻)来传播。
- 其他症状:在某些情况下,HSV-1还可以导致脑炎或角膜炎。
-
HSV-2 (Herpes Simplex Virus Type 2):通常被称为生殖疱疹病毒。
- 主要感染部位:通常感染生殖器和肛门,导致生殖器疱疹。
- 传播途径:主要通过性接触传播。
- 其他症状:新生儿在出生过程中可能会从感染HSV-2的母亲那里感染,这可能导致新生儿疱疹,这是一种严重的疾病,可能导致死亡。
-
RNAseq Refinement of Viral Sequences with StringTie
A common tool to utilize RNAseq data to correct or enhance a reference genome, especially for viruses, is StringTie. This tool is primarily designed for transcript assembly and quantification based on the alignments of RNAseq reads.
Here’s a basic pipeline using StringTie with a reference virus sequence:
-
Mapping/Aligning Reads to the Reference:
-
You’ll first need to align the RNAseq reads to the reference genome. You can use STAR or HISAT2 for this.
STAR --runThreadN 4 --genomeDir /path/to/genomeDir --readFilesIn /path/to/rnaseq.fastq --outFileNamePrefix /path/to/output_prefix
-
-
Transcript Assembly using StringTie:
-
With the alignment file produced (usually in BAM or SAM format), you can use StringTie to assemble the transcripts.
stringtie -p 4 -o output.gtf -l virus /path/to/aligned_data.bam
-
-
Compare and Correct the Reference:
- If you have a reference annotation, you can compare the assembled transcripts against the annotations to identify novel transcripts or refine existing ones.
-
GFFCompare can be used to compare and evaluate the accuracy of the assembled transcripts against the reference annotation.
gffcompare -r reference_annotation.gtf -G -o comparison_output output.gtf
-
Visual Inspection (Optional but Recommended):
- You can visualize the alignments and the assembled transcripts using a genome browser like IGV (Integrative Genomics Viewer). This can give you insights into regions with alternative splicing, novel exons, or discrepancies between your data and the reference annotation.
-
Further Analysis:
- Based on the assembled transcripts, you can proceed to differential expression analysis, identification of novel transcripts, or even SNPs/variants detection if needed.
For the entire process, you’d need:
- Reference genome (for STAR or HISAT2 alignment).
- RNAseq FASTQ files.
- (Optionally) Reference annotation for comparison.
Problembehebung bei langsamer Installation von OpenJDK mit Conda
-
Verwenden Sie Mamba: Mamba ist eine schnellere Alternative zu Conda und kann Pakete und deren Abhängigkeiten schneller auflösen und installieren. Sie können es mit ‘conda install mamba’ installieren und dann ‘mamba install -c conda-forge openjdk’ verwenden.
-
Aktualisieren Sie Conda: Ein veralteter Conda-Client kann Probleme verursachen. Sie können ihn mit ‘conda update conda’ aktualisieren.
-
Setzen Sie den Conda Cache zurück: Löschen Sie den Cache von Conda mit ‘conda clean –all’ und versuchen Sie es erneut.
-
Prüfen Sie Ihre Internetverbindung: Stellen Sie sicher, dass Sie eine stabile und relativ schnelle Internetverbindung haben.
-
Andere Spiegelserver verwenden: Manchmal kann die Auswahl eines anderen Mirrors oder Kanals helfen. Sie könnten versuchen, andere Kanäle hinzuzufügen oder die Reihenfolge der Kanäle in Ihrer .condarc-Datei zu ändern.
-
Manueller Download: Sie können das Paket manuell von der Conda-Forge-Website herunterladen und dann lokal mit Conda installieren.
Wenn alle oben genannten Vorschläge nicht funktionieren, könnten Sie überlegen, OpenJDK von einer anderen Quelle oder durch eine andere Methode zu installieren, z.B. mit einem Paketmanager Ihres Betriebssystems oder durch den Download von der offiziellen OpenJDK-Website.
Genomic Organization of Herpes Simplex Virus type 1 (HSV-1 s17)
HSV (Herpes Simplex Virus) contains two types of repeated sequences, which are:
-
IRS (Internal Repeat, Short): This refers to the short repeated sequences located between the unique long (UL) and unique short (US) regions of the HSV genome. There are two copies of IRS in the genome flanking the US region.
-
TRL (Terminal Repeat, Long) and IRL (Internal Repeat, Long): These are the long repeated sequences in the HSV genome. The TRL sequences are found at the very ends (terminals) of the linear HSV genome, while the IRL sequences are found internally, flanking the UL region.
The organization of the HSV genome can be summarized as: TRL – UL – IRL – US – IRS – US (in reverse orientation) – IRL
Here’s a brief breakdown:
-
UL (Unique Long): This is a unique sequence region found once in the genome.
-
US (Unique Short): This is another unique sequence region but it is shorter than UL and is found flanked by IRS sequences.
-
IRS (Internal Repeat Short): These are short repeated sequences that flank the US region.
-
TRL (Terminal Repeat Long) and IRL (Internal Repeat Long): The long repeated sequences found at the genome’s terminals and internally flanking the UL region.
These repeated sequences play crucial roles in the HSV life cycle, especially during the processes of recombination, genome replication, and the switch between latency and active replication.
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
def read_gtf(filename):
with open(filename, 'r') as file:
lines = file.readlines()
features = []
for line in lines:
if not line.startswith("#"):
split_line = line.strip().split("\t")
feature_type = split_line[2]
start = int(split_line[3])
end = int(split_line[4])
try:
gene_name = [x for x in split_line[8].split(";") if "gene_id" in x][0].split('"')[1]
except:
gene_name = "unknown"
strand = split_line[6]
features.append((feature_type, start, end, gene_name, strand))
return features
def plot_features(features, genome_id):
fig, ax = plt.subplots(figsize=(12, 16))
y_offset = 0
y_increment = 1
y_positions = {}
for feature_type, start, end, gene_name, strand in features:
if feature_type == "gene":
if gene_name not in y_positions:
y_positions[gene_name] = y_offset
y_offset += y_increment
y_pos = y_positions[gene_name]
color = "lightblue" if strand == "-" else (1, 0.6, 0.6) # Using RGB
rect = mpatches.Rectangle([start, y_pos], end-start, 0.6, ec="none", fc=color)
ax.add_patch(rect)
if strand == "+":
ax.text((start + end) / 2, y_pos, gene_name, ha='center', va='center', fontsize=9)
else:
ax.text((start + end) / 2, y_pos, gene_name, ha='center', va='center', fontsize=9)
ax.set_xlim(0, max([f[2] for f in features]))
ax.set_ylim(0, y_offset)
ax.set_yticks([])
ax.set_xlabel("Position (bp)")
ax.set_title(f"") #f"Genomic Organization of {genome_id}"
plt.tight_layout()
plt.savefig(f"{genome_id}_genomic_organization.png")
plt.show()
if __name__ == "__main__":
genome_id = "chrHsv1_s17"
features = read_gtf("chrHsv1_s17.gtf")
plot_features(features, genome_id)
RNA-seq on sage
-
TODO on sage
check the alignment of the reads to the annotation which sent from Munich is very bad, using the reference X14112 instead, find the CMV-GFP in the genome. Using alignment to detect the overall alignment rate to X14112 and chrHsv1_s17.
-
commands on sage
#under sage ln -s /home/jhuang/Tools/nf-core-rnaseq-3.12.0/ rnaseq [jhuang@sage Data_Caroline_RNAseq_wt_timecourse] nextflow run rnaseq/main.nf --input samplesheet_wt_timecourse.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --aligner 'star_salmon' --skip_multiqc [jhuang@sage Data_Caroline_RNAseq_wt_timecourse] nextflow run rnaseq/main.nf --input samplesheet_wt_timecourse.csv --outdir results_chrHsv1 --fasta chrHsv1_s17.fasta --gtf chrHsv1_s17.gtf -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_reference --aligner 'star_salmon' --gtf_extra_attributes 'gene_id' --gtf_group_features 'transcript_id' --featurecounts_group_type 'gene_id' --featurecounts_feature_type 'transcript' --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --skip_multiqc [jhuang@sage Data_Caroline_RNAseq_brain_organoids] nextflow run rnaseq/main.nf --input samplesheet_brain_organoids.12.csv --outdir results_GRCh38 --genome GRCh38 -profile test_full -resume --max_memory 256.GB --max_time 2400.h --aligner 'star_salmon' --skip_multiqc [jhuang@sage Data_Caroline_RNAseq_brain_organoids] nextflow run rnaseq/main.nf --input samplesheet_brain_organoids.12.csv --outdir results_chrHsv1 --fasta chrHsv1_s17.fasta --gtf chrHsv1_s17.gtf -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_reference --aligner 'star_salmon' --gtf_extra_attributes 'gene_id' --gtf_group_features 'transcript_id' --featurecounts_group_type 'gene_id' --featurecounts_feature_type 'transcript' --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --skip_multiqc #Processing *.umi_extract.fastq.gz (rnaseq) [jhuang@sage Data_Manja_RNAseq_Organoids_Virus]$ nextflow run rnaseq/main.nf --input samplesheet.umi_extract.csv --outdir results_chrHsv1 --fasta chrHsv1_s17.fasta --gtf chrHsv1_s17.gtf -profile test_full -resume --max_memory 256.GB --max_time 2400.h --save_reference --aligner 'star_salmon' --gtf_extra_attributes 'gene_id' --gtf_group_features 'transcript_id' --featurecounts_group_type 'gene_id' --featurecounts_feature_type 'transcript' --skip_rseqc --skip_dupradar --skip_preseq --skip_biotype_qc --skip_deseq2_qc --skip_multiqc #Processing raw data prepared with umi protocol (rnaseq) [jhuang@sage Data_Manja_RNAseq_Organoids_Virus]$ nextflow run rnaseq/main.nf --input samplesheet.csv --outdir results_chrHsv1 --fasta chrHsv1_s17.fasta --gtf chrHsv1_s17.gtf --with_umi --umitools_extract_method "regex" --umitools_bc_pattern "^(?P
.{12}).*” –umitools_dedup_stats -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_reference –aligner ‘star_salmon’ –gtf_extra_attributes ‘gene_id’ –gtf_group_features ‘transcript_id’ –featurecounts_group_type ‘gene_id’ –featurecounts_feature_type ‘transcript’ –skip_rseqc –skip_dupradar –skip_preseq –skip_biotype_qc –skip_deseq2_qc –skip_multiqc –min_mapped_reads 0 #Debug the following error: added “–minAssignedFrags 0 \\” to modules/nf-core/salmon/quant/main.nf option “salmon quant” and added “–min_mapped_reads 0” in the nextflow command above #hits: 0; hits per frag: 0[2023-10-20 11:35:22.944] [jointLog] [warning] salmon was only able to assign 0 fragments to transcripts in the index, but the minimum number of required assigned fragments (–minAssignedFrags) was 1. This could be indicative of a mismatch between the reference and sample, or a very bad sample. You can change the –minAssignedFrags parameter to force salmon to quantify with fewer assigned fragments (must have at least 1). (rnaseq) [jhuang@sage Data_Denise_LT_RNAseq]$ nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –aligner ‘star_salmon’ –skip_multiqc (rnaseq) [jhuang@sage Data_Samira_RNAseq]$ nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –aligner ‘star_salmon’ (rnaseq) [jhuang@sage Data_Manja_RNAseq_Organoids]$ nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_GRCh38 –genome GRCh38 –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –pseudo_aligner ‘salmon’ (rnaseq) [jhuang@sage Data_Manja_RNAseq_Organoids_Virus]$ nextflow run rnaseq/main.nf –input samplesheet.csv –outdir results_chrHsv1_s17 –fasta “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/chrHsv1_s17.fasta” –gtf “/home/jhuang/DATA/Data_Manja_RNAseq_Organoids_Virus/chrHsv1_s17.gtf” –with_umi –umitools_extract_method “regex” –umitools_bc_pattern “^(?P .{12}).*” –umitools_dedup_stats –skip_rseqc –skip_dupradar –skip_preseq -profile test_full -resume –max_memory 256.GB –max_time 2400.h –save_align_intermeds –save_unaligned –save_reference –aligner ‘star_salmon’ –gtf_extra_attributes ‘gene_id’ –gtf_group_features ‘transcript_id’ –featurecounts_group_type ‘gene_id’ –featurecounts_feature_type ‘transcript’ –skip_multiqc ln -s ~/Tools/rnaseq/assets/multiqc_config.yaml multiqc_config.yaml multiqc -f –config multiqc_config.yaml . 2>&1 rm multiqc_config.yaml -
reference on sage
/home/jhuang/REFs/Homo_sapiens/Ensembl/GRCh38 /home/jhuang/REFs/Homo_sapiens/hg38-blacklist.bed #C3i Science Day – Novel approaches to study the immune-tissue interface