-
Integration and analysis of multi-omics data: Explore the integration of different types of omics data (e.g., genomics, transcriptomics, proteomics) from public repositories to uncover novel biological insights and identify potential biomarkers or therapeutic targets.
-
Comparative genomics and evolution: Utilize publicly available genomic data to study the evolution of specific gene families or regulatory elements across different species. Investigate the functional implications of genomic variations and their impact on phenotype.
-
Network analysis of biological systems: Build and analyze biological networks using public repository data, such as protein-protein interaction networks or gene regulatory networks, to identify key nodes or modules associated with specific diseases or biological processes.
-
Machine learning and predictive modeling: Apply machine learning algorithms to public repository data to develop predictive models for disease diagnosis, drug response prediction, or patient outcome prognosis. Explore the potential of deep learning techniques for analyzing large-scale biological datasets.
-
Functional annotation and pathway analysis: Use publicly available functional annotation databases and pathway information to annotate and interpret genomic or transcriptomic data. Identify enriched biological pathways or functional modules associated with specific conditions or treatments.
-
Metagenomics and microbiome analysis: Analyze publicly available metagenomic and microbiome data to investigate microbial diversity, community dynamics, and functional potential in different environments or disease states. Explore the role of the microbiome in human health and disease.
-
Drug repurposing and target identification: Utilize public repository data, including drug databases and gene expression profiles, to identify potential drug candidates for repurposing and discover new therapeutic targets for specific diseases.
Author Archives: gene_x
RNAseq processing (1457)
-
construct DESeqDataSet from Matrix
library("AnnotationDbi") library("clusterProfiler") library("ReactomePA") library("org.Hs.eg.db") library(DESeq2) library(gplots) library(ggplot2) library(ggrepel) setwd("/home/jhuang/DATA/Data_Samira_RNAseq/results/featureCounts/") #cut -f2- merged_gene_counts.txt > merged_gene_counts_2.txt d.raw<- read.delim2("merged_gene_counts.txt",sep="\t", header=TRUE, row.names=1) colnames(d.raw)<- c("gene_name","TCR9_r3","X1457_r1","mock_r3","M2_r2","delta9H_r1","delta9H_r3","M2_r3","X1457_r2","M10_r3","TCR9_r1","M1_r1","TCR9_r2","M1_r2","X1457_r3","M10_r2","M2_r1","mock_r2","delta9H_r2","M1_r3","mock_r1","M10_r1") col_order <- c("gene_name","mock_r1","mock_r2","mock_r3","M1_r1","M1_r2","M1_r3","M2_r1","M2_r2","M2_r3","M10_r1","M10_r2","M10_r3","X1457_r1","X1457_r2","X1457_r3","TCR9_r1","TCR9_r2","TCR9_r3","delta9H_r1","delta9H_r2","delta9H_r3") reordered.raw <- d.raw[,col_order] reordered.raw$gene_name <- NULL #NOTE that we should d instead of d.raw!!!!!! d <- reordered.raw[rowSums(reordered.raw>3)>2,] condition = as.factor(c("mock","mock","mock","M1","M1","M1","M2","M2","M2","M10","M10","M10","X1457","X1457","X1457","TCR9","TCR9","TCR9","delta9H","delta9H","delta9H")) ids = as.factor(c("mock_r1","mock_r2","mock_r3","M1_r1","M1_r2","M1_r3","M2_r1","M2_r2","M2_r3","M10_r1","M10_r2","M10_r3","X1457_r1","X1457_r2","X1457_r3","TCR9_r1","TCR9_r2","TCR9_r3","delta9H_r1","delta9H_r2","delta9H_r3")) replicate = as.factor(c("r1","r2","r3","r1","r2","r3","r1","r2","r3","r1","r2","r3","r1","r2","r3","r1","r2","r3","r1","r2","r3")) #Note that we need d cData = data.frame(row.names=colnames(d), condition=condition, replicate=replicate, ids=ids) dds<-DESeqDataSetFromMatrix(countData=d, colData=cData, design=~condition) #batch+ > sizeFactors(dds) NULL > dds <- estimateSizeFactors(dds) > sizeFactors(dds) mock_r1 mock_r2 mock_r3 M1_r1 M1_r2 M1_r3 M2_r1 1.1510294 1.0108629 1.2046637 0.9219507 1.2912217 1.0233951 1.1781932 M2_r2 M2_r3 M10_r1 M10_r2 M10_r3 X1457_r1 X1457_r2 1.0286656 1.1274057 0.8521032 0.9723604 0.7937256 0.8869522 1.0276279 X1457_r3 TCR9_r1 TCR9_r2 TCR9_r3 delta9H_r1 delta9H_r2 delta9H_r3 0.8798504 1.4702299 0.9617160 1.2175588 0.7935592 0.8016998 1.0166897 bamCoverage --bam ./STAR/V_8_2_4_p600_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.8182619037059573 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_3_p600_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.230524791752137 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_4_p600_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9406161731990421 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_3_p600_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/ctrl_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.5944164810367278 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorI_norm.bw --binSize 10 --scaleFactor 1.1691469144166922 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_4_2_p602_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d3_DonorII_norm.bw --binSize 10 --scaleFactor 0.9627956504743693 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_4_2_p602_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.9685710322875091 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_4_1_p602_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LT_d8_DonorII_norm.bw --binSize 10 --scaleFactor 0.7369838699288324 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_4_p605_d3_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorI_norm.bw --binSize 10 --scaleFactor 0.7650745897995206 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_3_p605_d3_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d3_DonorII_norm.bw --binSize 10 --scaleFactor 1.2072732417906324 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_4_p605_d8_DonorIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorI_norm.bw --binSize 10 --scaleFactor 0.617050461769713 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/V_8_2_3_p605_d8_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/LTtr_d8_DonorII_norm.bw --binSize 10 --scaleFactor 1.1972841763570858 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/K331A_DonorIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorI_norm.bw --binSize 10 --scaleFactor 0.5914211756222816 --effectiveGenomeSize 2913022398 bamCoverage --bam ./STAR/K331A_DonorIIAligned.sortedByCoord.out.bam -o bigWigs/K331A_DonorII_norm.bw --binSize 10 --scaleFactor 1.6631219993121327 --effectiveGenomeSize 2913022398 rld <- rlogTransformation(dds)
-
library(ggplot2) svg("pca6.svg") data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE) percentVar <- round(100 * attr(data, "percentVar")) ggplot(data, aes(PC1, PC2, color=condition, shape=replicate)) + geom_point(size=3) + scale_color_manual(values = c("mock" = "grey", "M1"="#1f78b4")) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() dev.off() png("pca.png", width=800, height=800) data <- plotPCA(rld, intgroup=c("condition", "replicate"), returnData=TRUE) percentVar <- round(100 * attr(data, "percentVar")) ggplot(data, aes(PC1, PC2, color=condition, shape=replicate)) + geom_point(size=3) + scale_color_manual(values = c("mock" = "darkgrey", "M1"="#a14a1a", "M2"="#33a02c", "M10"="#1f78b4", "X1457"="#e31a1c", "TCR9"="orange", "delta9H"="purple")) + xlab(paste0("PC1: ",percentVar[1],"% variance")) + ylab(paste0("PC2: ",percentVar[2],"% variance")) + coord_fixed() dev.off() png("pca2.png", 1200, 800) plotPCA(rld, intgroup=c("condition")) dev.off() data <- plotPCA(rld, intgroup=c("condition", "donor"), returnData=TRUE) write.csv(data, file="plotPCA_data.csv") #calculate all PCs including PC3 with the following codes library(genefilter) ntop <- 500 rv <- rowVars(assay(rld)) select <- order(rv, decreasing = TRUE)[seq_len(min(ntop, length(rv)))] mat <- t( assay(rld)[select, ] ) pc <- prcomp(mat) pc$x[,1:3] #df_pc <- data.frame(pc$x[,1:3]) df_pc <- data.frame(pc$x) identical(rownames(data), rownames(df_pc)) #-->TRUE ## define the desired order of row names #desired_order <- rownames(data) ## sort the data frame by the desired order of row names #df <- df[match(desired_order, rownames(df_pc)), ] data$PC1 <- NULL data$PC2 <- NULL merged_df <- merge(data, df_pc, by = "row.names") #merged_df <- merged_df[, -1] row.names(merged_df) <- merged_df$Row.names merged_df$Row.names <- NULL # remove the "name" column merged_df$name <- NULL merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")] #results in 26PCs: merged_df <- merged_df[, c("PC1","PC2","PC3","PC4","PC5","PC6","PC7","PC8","group","condition","donor")] write.csv(merged_df, file="merged_df_8PCs.csv") #> summary(pc) #0.5657 0.2195 0.1347 --> 0.57 0.22 0.13 #Importance of components: # PC1 PC2 PC3 PC4 PC5 PC6 PC7 #Standard deviation 19.4051 12.0878 9.4683 4.5569 4.01016 3.00610 2.71918 #Proportion of Variance 0.5657 0.2195 0.1347 0.0312 0.02416 0.01358 0.01111 #Cumulative Proportion 0.5657 0.7853 0.9200 0.9512 0.97531 0.98889 1.00000
-
dds$condition <- relevel(dds$condition, "mock") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("M1_vs_mock","M2_vs_mock","M10_vs_mock", "delta9H_vs_mock","TCR9_vs_mock","X1457_vs_mock") dds$condition <- relevel(dds$condition, "X1457") dds = DESeq(dds, betaPrior=FALSE) resultsNames(dds) clist <- c("M10_vs_X1457", "delta9H_vs_X1457") library(biomaRt) listEnsembl() listMarts() ensembl <- useEnsembl(biomart = "ensembl", dataset = "hsapiens_gene_ensembl", version="104") datasets <- listDatasets(ensembl) listEnsemblArchives() attributes = listAttributes(ensembl) attributes[1:25,] library(dplyr) library(tidyverse) for (i in clist) { #i<-clist[1] #i<-"M1_vs_mock" contrast = paste("condition", i, sep="_") res = results(dds, name=contrast) res <- res[!is.na(res$log2FoldChange),] #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("ENTREZID","EXONID","GENEBIOTYPE","GENEID","GENENAME","PROTEINDOMAINSOURCE","PROTEINID","SEQNAME","SEQSTRAND","SYMBOL","TXBIOTYPE","TXID","TXNAME","UNIPROTID")) #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "ENTREZID", "SYMBOL", "GENENAME","GENEBIOTYPE","TXBIOTYPE","SEQSTRAND","UNIPROTID")) # In the ENSEMBL-database, GENEID is ENSEMBL-ID. #geness <- AnnotationDbi::select(edb86, keys = rownames(res), keytype = "GENEID", columns = c("GENEID", "SYMBOL", "GENEBIOTYPE")) # "ENTREZID", "TXID","TXBIOTYPE","TXSEQSTART","TXSEQEND" #geness <- geness[!duplicated(geness$GENEID), ] #using getBM replacing AnnotationDbi::select #filters = 'ensembl_gene_id' means the records should always have a valid ensembl_gene_ids. geness <- getBM(attributes = c('ensembl_gene_id', 'external_gene_name', 'gene_biotype', 'entrezgene_id', 'chromosome_name', 'start_position', 'end_position', 'strand', 'description'), filters = 'ensembl_gene_id', values = rownames(res), mart = ensembl) geness_uniq <- distinct(geness, ensembl_gene_id, .keep_all= TRUE) #merge by column by common colunmn name, in the case "GENEID" res$ENSEMBL = rownames(res) identical(rownames(res), rownames(geness_uniq)) res_df <- as.data.frame(res) geness_res <- merge(geness_uniq, res_df, by.x="ensembl_gene_id", by.y="ENSEMBL") dim(geness_res) rownames(geness_res) <- geness_res$ensembl_gene_id geness_res$ensembl_gene_id <- NULL write.csv(as.data.frame(geness_res[order(geness_res$pvalue),]), file = paste(i, "all.txt", sep="-")) up <- subset(geness_res, padj<=0.05 & log2FoldChange>=2) down <- subset(geness_res, padj<=0.05 & log2FoldChange<=-2) write.csv(as.data.frame(up[order(up$log2FoldChange,decreasing=TRUE),]), file = paste(i, "up.txt", sep="-")) write.csv(as.data.frame(down[order(abs(down$log2FoldChange),decreasing=TRUE),]), file = paste(i, "down.txt", sep="-")) } #for i in "M1_vs_mock" "M2_vs_mock" "M10_vs_mock" "delta9H_vs_mock" "TCR9_vs_mock" "X1457_vs_mock"; do for i in "M10_vs_X1457" "delta9H_vs_X1457"; do # read files to geness_res echo "geness_res <- read.csv(file = paste(\"${i}\", \"all.txt\", sep=\"-\"), row.names=1)" echo "subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0))" echo "geness_res\$Color <- \"NS or log2FC < 2.0\"" echo "geness_res\$Color[geness_res\$pvalue < 0.05] <- \"P < 0.05\"" echo "geness_res\$Color[geness_res\$padj < 0.05] <- \"P-adj < 0.05\"" echo "geness_res\$Color[abs(geness_res\$log2FoldChange) < 2.0] <- \"NS or log2FC < 2.0\"" echo "write.csv(geness_res, \"${i}_with_Category.csv\")" # pick top genes for either side of volcano to label # order genes for convenience: echo "geness_res\$invert_P <- (-log10(geness_res\$pvalue)) * sign(geness_res\$log2FoldChange)" echo "top_g <- c()" echo "top_g <- c(top_g, \ geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = TRUE)[1:100]], \ geness_res[, 'external_gene_name'][order(geness_res[, 'invert_P'], decreasing = FALSE)[1:100]])" echo "top_g <- unique(top_g)" echo "geness_res <- geness_res[, -1*ncol(geness_res)]" # remove invert_P from matrix # Graph results echo "png(\"${i}.png\",width=1200, height=2000)" echo "ggplot(geness_res, \ aes(x = log2FoldChange, y = -log10(pvalue), \ color = Color, label = external_gene_name)) + \ geom_vline(xintercept = c(2.0, -2.0), lty = \"dashed\") + \ geom_hline(yintercept = -log10(0.05), lty = \"dashed\") + \ geom_point() + \ labs(x = \"log2(FC)\", y = \"Significance, -log10(P)\", color = \"Significance\") + \ scale_color_manual(values = c(\"P < 0.05\"=\"darkgray\",\"P-adj < 0.05\"=\"red\",\"lysosomal\"=\"lightblue\",\"TFEB\"=\"green\",\"lysosomal & TFEB\"=\"cyan\",\"NS or log2FC < 2.0\"=\"gray\"),guide = guide_legend(override.aes = list(size = 4))) + scale_y_continuous(expand = expansion(mult = c(0,0.05))) + \ geom_text_repel(data = subset(geness_res, external_gene_name %in% top_g & pvalue < 0.05 & (abs(geness_res\$log2FoldChange) >= 2.0)), size = 4, point.padding = 0.15, color = \"black\", min.segment.length = .1, box.padding = .2, lwd = 2) + \ theme_bw(base_size = 16) + \ theme(legend.position = \"bottom\")" echo "dev.off()" done ~/Tools/csv2xls-0.4/csv_to_xls.py \ M1_vs_mock-all.txt \ M1_vs_mock-up.txt \ M1_vs_mock-down.txt \ -d$',' -o M1_vs_mock.xls;
-
clustering the genes and draw heatmap
up1 <-read.csv(file="./M1_vs_mock-up.txt", header = TRUE, row.names = 1) up2 <-read.csv(file="./M2_vs_mock-up.txt", header = TRUE, row.names = 1) up3 <-read.csv(file="./M10_vs_mock-up.txt", header = TRUE, row.names = 1) up4 <-read.csv(file="./delta9H_vs_mock-up.txt", header = TRUE, row.names = 1) up5 <-read.csv(file="./TCR9_vs_mock-up.txt", header = TRUE, row.names = 1) up6 <-read.csv(file="./X1457_vs_mock-up.txt", header = TRUE, row.names = 1) down1 <-read.csv(file="./M1_vs_mock-down.txt", header = TRUE, row.names = 1) down2 <-read.csv(file="./M2_vs_mock-down.txt", header = TRUE, row.names = 1) down3 <-read.csv(file="./M10_vs_mock-down.txt", header = TRUE, row.names = 1) down4 <-read.csv(file="./delta9H_vs_mock-down.txt", header = TRUE, row.names = 1) down5 <-read.csv(file="./TCR9_vs_mock-down.txt", header = TRUE, row.names = 1) down6 <-read.csv(file="./X1457_vs_mock-down.txt", header = TRUE, row.names = 1) RNASeq.NoCellLine <- assay(rld) #Option3: as paper described, A heatmap showing expression values of all DEGs which are significant between any pair conditions. all_genes <- c(rownames(up1),rownames(down1), rownames(up2),rownames(down2), rownames(up3),rownames(down3), rownames(up4),rownames(down4), rownames(up5),rownames(down5), rownames(up6),rownames(down6)) #5473 all_genes <- unique(all_genes) #2971 RNASeq.NoCellLine_ <- RNASeq.NoCellLine[all_genes,] write.csv(as.data.frame(RNASeq.NoCellLine_), file ="significant_gene_expressions.txt") #clustering methods: "ward.D", "ward.D2", "single", "complete", "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC). pearson or spearman datamat = RNASeq.NoCellLine_ hr <- hclust(as.dist(1-cor(t(datamat), method="pearson")), method="complete") hc <- hclust(as.dist(1-cor(datamat, method="spearman")), method="complete") mycl = cutree(hr, h=max(hr$height)/1.08) mycol = c("YELLOW", "DARKBLUE", "DARKORANGE", "DARKMAGENTA", "DARKCYAN", "DARKRED", "MAROON", "DARKGREEN", "LIGHTBLUE", "PINK", "MAGENTA", "LIGHTCYAN","LIGHTGREEN", "BLUE", "ORANGE", "CYAN", "RED", "GREEN"); mycol = mycol[as.vector(mycl)] #sampleCols <- rep('GREY',ncol(RNASeq.NoCellLine_)) #names(sampleCols) <- c("mock_r1", "mock_r2", "mock_r3", "mock_r4", "WAP_r1", "WAP_r2", "WAP_r3", "WAP_r4", "WAC_r1","WAC_r2") #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,4)=='mock'] <- 'GREY' #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAP'] <- 'RED' #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dM_'] <- 'CYAN' #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='dP_'] <- 'BLUE' #sampleCols[substr(colnames(RNASeq.NoCellLine_),1,3)=='WAC'] <- 'GREEN' png("DEGs_heatmap.png", width=900, height=1010) heatmap.2(as.matrix(datamat),Rowv=as.dendrogram(hr),Colv = NA, dendrogram = 'row', scale='row',trace='none',col=bluered(75), RowSideColors = mycol, labRow="", srtCol=30, keysize=0.72, cexRow = 2, cexCol = 1.4) dev.off() #### cluster members ##### write.csv(names(subset(mycl, mycl == '1')),file='cluster1_YELLOW.txt') write.csv(names(subset(mycl, mycl == '2')),file='cluster2_DARKBLUE.txt') write.csv(names(subset(mycl, mycl == '3')),file='cluster3_DARKORANGE.txt') ~/Tools/csv2xls-0.4/csv_to_xls.py \ cluster1_YELLOW.txt \ cluster2_DARKBLUE.txt \ cluster3_DARKORANGE.txt \ -d$',' -o gene_culsters.xls; ~/Tools/csv2xls-0.4/csv_to_xls.py \ significant_gene_expressions.txt \ -d',' -o DEGs_heatmap_data.xls;
Tools for SNP Visualization
For SNP visualization in Python, we can consider using the following packages:
-
Matplotlib: Matplotlib is a popular plotting library in Python that can be used to create various types of visualizations, including chromosome plots. You can plot SNPs as points or markers on the chromosomes using Matplotlib’s scatter or plot functions.
-
Bokeh: Bokeh is a powerful interactive visualization library in Python. It provides tools for creating interactive plots and offers features like zooming, panning, and tooltips. You can use Bokeh to create chromosome plots and represent SNPs as interactive data points.
-
PyGenomeTracks: PyGenomeTracks is a Python package specifically designed for visualizing genomic data. It provides a convenient interface for plotting genomic features, including SNPs, on chromosomes. It offers customization options for styling and annotating the plots.
In R, we can consider using the following packages for SNP visualization:
-
Gviz: Gviz is an R package that allows you to create interactive and customizable genome visualizations. It provides functions for plotting SNPs on chromosomes and offers features like zooming, panning, and linking to external data sources.
-
karyoploteR: karyoploteR is an R package designed for high-quality karyotype and genomic region plots. It provides functions for visualizing SNPs on chromosomes, allowing you to customize colors, sizes, and other plot attributes.
-
ggplot2: ggplot2 is a widely used plotting package in R that offers a grammar of graphics approach for creating various types of plots. You can use ggplot2 to create SNP plots on chromosomes, leveraging its flexibility and customization options.
Both Python and R provide a wide range of plotting and visualization packages, and the choice depends on your familiarity with the programming language and specific requirements for SNP visualization.
Calculate AT content of the flanking regions of peaks
#!/usr/bin/env python3
#python3 plot_peaks.py peaks.bed /home/jhuang/REFs/gencode.v43.annotation.gtf.db /ref/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa
#~/Tools/csv2xls-0.4/csv_to_xls.py _peaks_and_motifs_in_promoters_with_LT_peak.txt peak_tss_distances_.txt -d$'\t' -o peaks_and_motifs_in_promoters_with_LT_peak.xls;
#rename the name of sheet1 to "peaks_and_motifs_in_promoters_with_LT_peak" and "all_LT_peaks_NHDF"
#mv peak_tss_distances_.xlsx all_LT_peaks_NHDF.xlsx
import pprint
import argparse
import matplotlib.pyplot as plt
import pandas as pd
import gffutils
import pyfaidx
import numpy as np
def calculate_at_content(seq):
# Function to calculate the AT-content of a given sequence
at_count = seq.count('A') + seq.count('T')
total_count = len(seq)
at_content = round((at_count / total_count) * 100, 1)
return at_content
def plot_peak_distribution(peaks_file, gencode_db, fasta_file, upstream_length=500, downstream_length=500):
db = gffutils.FeatureDB(gencode_db, keep_order=True)
peaks = pd.read_table(peaks_file, header=None)
peaks.columns = ['chrom', 'start', 'end', 'name', 'score']
tss_positions = []
tss_genes = []
for gene in db.features_of_type('gene'):
if gene.strand == '+':
tss_positions.append(gene.start)
else:
tss_positions.append(gene.end)
tss_genes.append(gene)
closest_tss_indices = []
peak_tss_distances = []
peak_names = []
closest_genes = []
closest_gene_names = []
closest_strands = []
closest_gene_types = [] # New list for gene_type
start_positions = [] # New list for Start Positions
end_positions = [] # New list for End Positions
start_distance_to_tss = [] # New list for Distance to TSS of Start Positions
end_distance_to_tss = [] # New list for Distance to TSS of End Positions
peak_starts = []
peak_ends = []
# Load the FASTA file using pyfaidx
fasta = pyfaidx.Fasta(fasta_file)
# New lists for AT-content
start_flank_seqs = []
end_flank_seqs = []
start_at_content = []
end_at_content = []
for _, peak in peaks.iterrows():
peak_center = (peak['start'] + peak['end']) // 2
peak_start = peak['start']
peak_end = peak['end']
closest_tss_index = np.argmin([abs(tss - peak_center) for tss in tss_positions])
closest_tss_indices.append(closest_tss_index)
distance_to_tss = peak_center - tss_positions[closest_tss_index]
peak_tss_distances.append(distance_to_tss)
peak_names.append(peak['name'])
closest_genes.append(tss_genes[closest_tss_index].id)
if 'gene_name' in tss_genes[closest_tss_index].attributes:
closest_gene_name = tss_genes[closest_tss_index].attributes['gene_name'][0]
else:
closest_gene_name = 'N/A' # Set a default value if 'gene_name' attribute is not present
closest_gene_names.append(closest_gene_name)
closest_strands.append(tss_genes[closest_tss_index].strand)
if 'gene_type' in tss_genes[closest_tss_index].attributes:
closest_gene_type = tss_genes[closest_tss_index].attributes['gene_type'][0]
else:
closest_gene_type = 'N/A' # Set a default value if 'gene_type' attribute is not present
closest_gene_types.append(closest_gene_type)
#start_distance_to_tss.append(peak_start - tss_positions[closest_tss_index]) # Add Distance to TSS of Start Position to the list
#end_distance_to_tss.append(peak_end - tss_positions[closest_tss_index]) # Add Distance to TSS of End Position to the list
peak_starts.append(peak_start)
peak_ends.append(peak_end)
# Get the flanking region coordinates
start_flank_start = peak_start - upstream_length
start_flank_end = peak_start - 1
end_flank_start = peak_end + 1
end_flank_end = peak_end + downstream_length
# Retrieve the flanking region sequences from the FASTA file
start_flank_seq = fasta[peak['chrom']][start_flank_start:start_flank_end].seq.upper()
end_flank_seq = fasta[peak['chrom']][end_flank_start:end_flank_end].seq.upper()
start_flank_seqs.append(start_flank_seq)
end_flank_seqs.append(end_flank_seq)
# # Calculate the AT-content of the flanking regions based on the gene strand
# if tss_genes[closest_tss_index].strand== '+':
# start_at = calculate_at_content(start_flank_seq)
# end_at = calculate_at_content(end_flank_seq)
# else:
# start_at = calculate_at_content(end_flank_seq)
# end_at = calculate_at_content(start_flank_seq)
start_at = calculate_at_content(start_flank_seq)
end_at = calculate_at_content(end_flank_seq)
start_at_content.append(start_at)
end_at_content.append(end_at)
df = pd.DataFrame({
'Peak Name': peak_names,
'Peak Start': peak_starts,
'Peak End': peak_ends,
#'Closest TSS Index': closest_tss_indices,
'Distance to TSS': peak_tss_distances,
'Closest Gene (Ensemble ID)': closest_genes,
'Closest Gene (Gene name)': closest_gene_names,
'Gene Strand': closest_strands,
'Gene Type': closest_gene_types, # Add new column for gene_type
#'start_distance_to_tss': start_distance_to_tss,
#'end_distance_to_tss': end_distance_to_tss,
'Start-Flanking AT-content (500 nt)': start_at_content,
'End-Flanking AT-content (500 nt)': end_at_content,
'Start-Flanking Sequence (500 nt)': start_flank_seqs,
'End-Flanking Sequence (500 nt)': end_flank_seqs
})
df.to_excel('peak_tss_distances_.xlsx', index=False)
df.to_csv('peak_tss_distances_.txt', sep='\t', index=False) # Save as tab-separated text file
counts, bin_edges = np.histogram(peak_tss_distances, bins=1000)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2
hist_df = pd.DataFrame({'Bin Center': bin_centers, 'Count': counts})
hist_df.to_csv('peak_distribution_.csv', index=False)
total_peaks = hist_df['Count'].sum()
with open('peak_distribution_.csv', 'a') as f:
f.write(f'\nTotal number of peaks: {total_peaks}')
plt.hist(peak_tss_distances, bins=1000)
plt.xlabel('Distance to TSS')
plt.ylabel('Number of peaks')
plt.title('Distribution of peaks around TSS')
plt.savefig('peak_distribution_.png')
plt.show()
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Plot peak distribution around TSS.')
parser.add_argument('peaks_file', type=str, help='Path to the peaks.bed file')
parser.add_argument('gencode_db', type=str, help='Path to the gencode.v43.annotation.db file')
parser.add_argument('fasta_file', type=str, help='Path to the FASTA file')
args = parser.parse_args()
plot_peak_distribution(args.peaks_file, args.gencode_db, args.fasta_file)
梅尔克细胞病毒小抗原通过类型I干扰素信号传导而促进免疫逃逸
详细解释:梅尔克细胞多瘤病毒(Merkel cell polyomavirus,MCPyV)的小肿瘤抗原(small tumor antigen,sT)是该病毒的一个重要组分,通过干扰类型I干扰素(type I interferon)信号传导来促进免疫逃逸。
类型I干扰素是免疫系统中的一类重要信号分子,对抗病毒感染和肿瘤发展起着关键作用。然而,MCPyV的小肿瘤抗原通过干扰类型I干扰素信号的正常传导,从而导致免疫系统无法有效识别和清除感染细胞。
具体而言,小肿瘤抗原通过抑制关键分子的表达,如干扰素调节因子9(interferon regulatory factor 9,IRF9)和信号转导和转录激活因子1(signal transducer and activator of transcription 1,STAT1),来干扰类型I干扰素所引发的免疫应答。这种干扰导致了干扰素诱导基因(interferon-stimulated genes,ISGs)的抑制,从而削弱了宿主细胞对病毒感染的免疫反应。
这些发现对于理解梅尔克细胞多瘤病毒的感染机制和免疫逃逸至关重要。进一步研究小肿瘤抗原如何干扰类型I干扰素信号传导以及相关的免疫调节途径,有助于揭示肿瘤发展的分子机制,并为开发针对梅尔克细胞多瘤病毒感染的治疗策略提供新的目标和思路。
副标题:sT介导的TBK1-IRF3信号传导调控对MCPyV感染和发病机制的重要性
关键词:sT,病毒致癌蛋白,免疫逃逸,TBK1-IRF3
摘要
梅尔克细胞多瘤病毒(MCPyV)是导致大多数梅尔克细胞癌(MCC)的病因。MCPyV的编码能力有限,早期病毒蛋白大T(LT)和小T(sT)具有多种功能,对感染和转化起重要作用。感染和肿瘤发生之间早期病毒基因表达存在一个根本差异,肿瘤中表达截断的LT(LTtr),而sT在两种情况下都有表达,并促进肿瘤发生。本研究旨在识别早期病毒蛋白引发的感染和发病关键机制。我们利用原代人类细胞进行全基因组转录组和染色质研究。由于目前在感染和肿瘤发生模型方面的限制,我们通过过表达早期病毒蛋白的实验来模拟这些情况,可以单独进行实验或组合进行实验。
除了调节细胞周期和炎症过程外,我们还揭示了sT的一个重要新功能:下调参与识别和调控先天免疫系统的基因。我们发现,sT在TANK结合激酶1(TBK1)水平上降低了类型I干扰素(IFN)反应,从而有效减少了TBK1依赖的IFN-beta启动子的激活,但对IFN-beta诱导的ISRE元件的激活没有影响。同时,我们还发现LT和LTtr可以激活类型I干扰素反应,而sT则起到了抵消这一功能的作用。我们首次提供了由早期病毒蛋白引发和调控的类型I干扰素激活的机制理解。我们假设TBK1-IRF3调控是一种由sT修饰的重要机制,对感染、持续感染和肿瘤微环境调控具有影响。
引言
人类多聚病毒(hPyVs)具有狭窄的宿主范围和高度的细胞类型限制。因此,目前的体外细胞培养系统受到限制,而用于研究病毒生命周期和/或持续感染的体内动物模型不存在[1-5]。hPyVs可终身持续感染,然而,对于包括BKPyV、JCPyV和MCPyV在内的大多数hPyVs,它们在自然宿主中的持续感染部位/细胞类型尚未完全揭示,导致持续感染建立的机制也仅了解甚少[1, 3, 5-9]。尽管目前的持续感染模型认为,BKPyV虽然能感染多种细胞类型,但仅在具有受控病毒DNA复制、病毒后代产生和细胞再感染的内源性免疫应答细胞中建立持续感染[7],MCPyV则被认为能独立于病毒后代产生建立持续感染形式[3, 5]。
DNA病毒作为入侵病原体的第一道防线被先天免疫系统所识别。因此,hPyVs在入侵过程中被内源性体内的Toll样受体(TLRs)如TLR9在内体袋中识别,TLR9能够识别CpG未甲基化的双链DNA[10, 11]。此外,细胞质核酸传感器也参与对DNA病毒感染的早期识别。细胞质DNA识别的主要调节因子是干扰素基因刺激因子(STING),其受到cGAS、AIM2或IFI16等多种DNA传感器的调控[12]。
对于MCPyV感染和免疫调节的了解非常有限,只有少数几篇研究报告,主要是基于过表达研究。例如,LT的表达降低了上皮细胞和MCC细胞系中的TLR-9表达[11]。BJ-5ta细胞中LT和sT的表达增加了调节细胞生长和细胞运动的基因表达[13]。与此同时,还诱导了多种炎症反应基因的表达,包括干扰素刺激基因(ISGs)如OAS1和ISG20,细胞因子如IL-1β和IL-6,以及趋化因子如CXCL1和CXCL6 [13]。在缺乏LT的情况下,sT的表达通过与PP4C和NEMO的相互作用抑制NF-κB炎症信号传导[14, 15]。
我们应用了一种体外感染模型,将原代皮肤成纤维细胞感染MCPyV [16],观察到在感染的后期阶段cGAS-STING的激活以及NF-κB信号通路的诱导,随后表达了抗病毒的ISGs和内源性炎症细胞因子。有趣的是,在感染的早期阶段并未检测到cGAS/STING的激活[17, 18]。然而,在MCC细胞系中也观察到STING的激活和其逃避机制,MCPyV阳性的MCC细胞系中STING的表达明显降低[17, 18]。此外,MCC肿瘤的HLA-I表达较低,通过给予组蛋白去乙酰化酶抑制剂[19-21]或减少sT的表达[22]可以逆转MCC细胞系中的HLA-I表达。
本研究系统分析了早期病毒基因产物诱导的转录和表观遗传学变化。我们通过在原代细胞中进行过表达分析,并在感染和发病过程中不同时间点进行全基因组RNASeq和ChIPSeq实验,模拟早期病毒区域的病毒基因表达以及在感染和发病过程中对宿主基因表达的调控。因此,我们发现了sT在MCPyV感染和发病过程中的新功能,即sT在很大程度上参与免疫逃逸。有趣的是,sT是一种拮抗剂,可以通过在胞浆中抑制TBK1信号级联来降低由LT或肿瘤表达的LTtr触发的免疫活化。我们的研究结果对于理解MCPyV的感染和持续感染以及MCC的发病机制具有重要意义,肿瘤细胞在肿瘤浸润性T细胞中的识别可能会受到肿瘤微环境中TBK-1/IRF3免疫活化的抑制。
结果
-
MCPyV T抗原的过表达导致nHDFs转录谱的扰动
为了研究MCPyV T抗原在相关的人类细胞类型中引发的宿主转录和组蛋白修饰的变化,我们使用了之前发表的方法[16]对新生儿皮肤真皮成纤维细胞(nHDFs)进行了MCPyV颗粒的感染。我们成功感染了约5-8%的细胞,通过7天后免疫染色中LT和VP1阳性细胞的比例来衡量(suppl.图S1)。然而,感染细胞的相对数量过低,无法解析宿主基因表达的变化,并且没有出现释放新颗粒和再感染等允许性感染的证据,因为感染细胞的频率随时间不变。类似地,半允许性的体外复制系统也无法获得更高比例的感染细胞或病毒传播[4, 5, 23]。
为了研究早期病毒基因产物LT、LTtr和sT引发的潜在基因表达变化和染色质改变,我们建立了一个过表达协议,利用慢病毒转导,可以在相关细胞类型中单独或组合转导单个T抗原(图1A)。因此,我们可以控制T抗原表达的量以及转导后的时间点,从而模拟具有早期病毒T-Ag表达的可能的初始感染、长期感染或肿瘤细胞表达模式(图1A-B)。
我们使用两个独立供体的nHDFs研究了由sT、LT或LTtr诱导的转录和组蛋白修饰的变化,通过Western Blotting和RT-qPCR进行了控制(图1B和suppl.图S2A-E)。随后,我们评估了转导后3天和8天(dpt)的即时转录和组蛋白修饰变化,并对转导后9天(供体I)或12天(供体II)进行下游分析(图1A)模拟癌前状态或癌变状态下的基因表达模式。
使用RNASeq对nHDFs进行的转录组分析显示,MCPyV T抗原在所有时间点均引发了显著的扰动,表现为大量的差异表达基因(DEGs)(图1C、1D、suppl.表S1)。主成分分析(PCA)显示样本在供体间呈现显著的聚类,表明对慢病毒转导和T抗原表达的敏感性存在差异(suppl.图S2F)。然而,在不同的T抗原中,宿主基因表达的明显差异也在供体特异性聚类中表现出来。
DEG分析包括具有log2FC >1或<-1且padj. <0.05的基因,显示与向量对照相比,几百个基因在所有情况下都有差异调控的趋势,其中表达增加的基因多于表达下降的基因,尤其是在表达LT或LTtr的细胞中(图1C)。LT表达导致985和837个基因在3或8dpt时显著上调。类似地,LTtr也呈现出相似的情况,在3或9dpt时有659和944个基因显著上调,而在转导了sT、sT+LT或sT+LTtr的细胞中,上调的基因数量较少。有趣的是,在表达LTtr的nHDFs(3d)中,仅有相对较少的基因(n=74)呈现显著下调(图1C)。通过比较热图,我们可以看到在所有情况下存在四个显著的DEG簇(图1D)。根据基因本体论(GO)分析,具有相似生物过程的基因被归类到簇I(蓝色),其中包括先天免疫和类型I干扰素(IFN)反应以及NF-kB信号通路(suppl.图S3和suppl.表S2)。其中大多数基因在sT表达下被下调,而在LT或LTtr表达下被上调(图1D)。另一方面,簇II(品红色)包含参与细胞外基质(ECM)组织、细胞凋亡过程和蛋白稳定化的基因。在簇III(橙色)中,包含参与信号转导、炎症和趋化因子信号、化学运动、细胞分裂和细胞黏附的基因被发现富集(suppl.图S3和suppl.表S2)。其中大多数基因在表达sT 8天或与LTtr共同表达9-12天的细胞中高度上调(图1D),尽管它们在表达模式上有变异。同时,簇IV(黄色)显示与簇III相反的表达模式。这一簇中的基因表达调控细胞黏附炎症反应和增殖(suppl.图S3和suppl.表S2)。
-
T抗原在宿主基因表达模式中展示了共同的但也有独特的改变。
我们分别对每个包含重复样本的数据集进行了分析,以研究各个T抗原共享的基因集和特定靶向的基因(图2)。不同条件下的DEG,以火山图的形式呈现,如suppl.图S4和表S1所示。其中,我们着重强调了在表达sT的细胞与对照组相比的DEG结果(图3A)。我们对每个条件进行了GO分析(生物过程),使用显著(padj. < 0.05)上调(log2FC > 1)或下调(log2FC < -1)的基因集合,并总结在suppl.表S3中。结果经过筛选,选取了最显著的10个GO术语(FDR < 0.05且基因数量至少为10)(图2)。比较每个情况下获得的GO术语,揭示了LT、LTtr和sT调控的生物过程有相当大的重叠(图2),例如细胞增殖和细胞分裂。值得注意的是,尽管我们和其他研究组先前发现完整LT而非LTtr在包括nHDFs在内的多种细胞系中具有一般的生长抑制作用,但在我们的GO分析中,LT和LTtr在细胞周期调控或细胞分裂方面没有明显差异[5, 24]。
除了细胞周期调控,我们发现DNA复制和DNA修复等GO术语在所有条件下均高度富集,其中LT和LTtr对DNA复制的影响显著高于sT。有趣的是,LT和LTtr,而不是sT,引起了对DNA损伤响应基因的适度上调。此外,LT、sT和LTtr的存在对转录相关机制产生影响。值得注意的是,sT仅对受其抑制的基因富集了与转录调控相关的GO术语。
我们的研究结果与其他研究团队进行的先前研究相一致,表明T抗原导致了一组与先天免疫和炎症相关的基因的上调[13, 25]。我们发现,同时表达sT和LT或LTtr的细胞中,与趋化性、对肿瘤坏死因子(TNF)的响应和NfκB信号相关的基因明显增加。此外,我们的研究结果与先前的研究一致,表明sT影响与血管生成和细胞黏附有关的基因[26-28]。我们的研究进一步揭示了LT和LTtr在这些生物过程中的类似破坏作用。
我们在类型I干扰素(IFN)信号通路基因和先天免疫信号通路中检测到最显著的转录变化。虽然LT/LTtr和sT+LT在早期时间点(3天后)增强并调节了这些过程,但它们在后续时间点与对照组细胞没有显著差异。然而,与LT/LTtr或sT+LT表达相比,单独表达sT反而产生了相反的效果,抑制了与类型I干扰素反应相关的基因,不论时间点如何。
-
MCPyV的sT稳定地抑制了类型I干扰素反应基因的转录。
图3A中的火山图总结了与短T抗原(sT)在3天和8天后与空载体对照组相比在nHDFs中检测到的所有差异表达基因(DEGs)。值得注意的是,最显著上调的基因包括CCL7、CXCL6和IL1B,这些基因与炎症反应相关。对sT上调基因的基因本体(GO)分析(图S5)证实了先前的研究结果,即sT促进细胞分裂和增殖,干扰转录调控和蛋白质生物合成,以及趋化性[13, 26, 28]。有趣的是,与其他T抗原蛋白相比,sT的表达导致更多的基因表达下调(3天后449个,8天后795个),如图3A所示。对所有被sT显著下调的基因进行的GO分析显示出15个GO术语的富集,其中至少有10个基因,最小的FDR值为0.05,包括与负性转录调控和病毒基因组复制抑制相关的基因(图3B)。此外,细胞黏附、血管生成和发育等GO术语在较晚的时间点(8天后)富集。有趣的是,sT还导致与MHC-I类抗原呈递相关的基因的下调(图3B-C)。我们从参与先天免疫和类型I干扰素信号通路的GO术语中选择了一部分基因,并根据其功能进行分类(图3C)。我们发现,sT抑制了STING1、TRIM56、DDX60和TLR3等与细胞质DNA和RNA感知有关的基因。然而,根据它们对干扰素的响应,大多数基因被归类为干扰素刺激基因(ISGs)[29]。我们还发现了一组与MHC-I信号传导相关的基因簇,如HLA-A、HLA-B、HLA-E、TAP1和B2M(图3C)。
当我们将经过逆转录聚合酶链反应(RT-qPCR)验证的sT表达的nHDFs中的PRR感知、ISGs和抗原呈递相关基因的特定基因位点与未处理的nHDFs进行比较时,发现sT有效地对抗了通过逆转录聚合酶链反应介导的暂时性抗病毒反应,表明lentiviral transduction (Fig. 3C, 左图)。sT有效地对抗了这种抗病毒反应信号,尤其是在8天后的时间点,通过抑制先天免疫基因的表达(Fig. 3C, 右图)。通过RT-qPCR验证了在sT表达的nHDFs中检测到的基因表达下调,包括PRR感知、ISGs和抗原呈递相关基因(suppl. Fig. S6A),并在MCC细胞系中得到了证实,这些细胞系中sT的表达可以通过诱导sT特异性shRNA的多西环素进行下调(suppl. Fig. S6B)。
-
MCPyV的sT特异性地抑制ISGF3依赖基因的转录。
我们发现,在对lentiviral转导的反应中,sT稳定地抑制了与先天免疫有关的基因的转录,特别是ISG,它们在响应I/II型干扰素信号时被激活。确实,将这些基因与干扰素数据库[29]进行交叉引用后发现,25-35%的下调基因被分类为I/II型干扰素依赖基因(Fig. 4A)。
为了确定sT调控的基因是否受干扰素刺激基因因子3(ISGF3)的调控,这是激活ISG所必需的关键转录因子复合物,我们将sT调控的ISGs与先前报道的ISG数据集进行了比较。我们选择了来自人类WT和IRF9-KO巨噬细胞的RNASeq数据集[30, 31]。重组IFNbeta的处理结果显示出四个主要的基因簇:IRF9独立上调和下调基因以及IRF9依赖的上调和下调基因。通过计算奇比(OR)和p值,我们将这些基因集与sT在3和8天后上调和下调的基因进行了比较(Fig. 4B)。我们观察到IRF9依赖和IFNbeta上调基因与sT下调基因的重叠,3天后的OR为15,8天后的OR为6.9(Fig. 4C)。总共,381个IRF9依赖和被IFNbeta处理上调的基因中,有65个与sT在3和8天后下调的基因重叠。相反,只有4个基因被sT上调(Fig. 4D和suppl. Tab. S5)。有趣的是,在由IFNbeta处理上调的IRF9独立基因中,有18个基因被sT下调,包括STAT1和IRF9,以及PARP9和DTX3L,这些基因据报道也参与了ISG的调控[32]。
在案例对照研究或横断面研究中,比值比(Odds Ratio,OR)是衡量暴露因素(自变量)与结果(因变量)之间关联程度的一种指标。比值比的计算公式如下: OR = (a/c) / (b/d) 其中: a:暴露组中患病案例的数量 b:未暴露组中患病案例的数量 c:暴露组中未患病案例的数量 d:未暴露组中未患病案例的数量 比值比可以使用统计软件或在线计算器进行计算。或者,您可以使用上述公式和研究中的相关数据手动计算比值比。以下是一个示例: 假设在一项研究中,我们想要研究吸烟与肺癌之间的关系。我们随机选择了500名肺癌患者和500名非肺癌患者,并观察他们的吸烟史。结果如下: 暴露组(吸烟者)中有200名肺癌患者(a) 未暴露组(非吸烟者)中有50名肺癌患者(b) 暴露组中有100名非肺癌患者(c) 未暴露组中有250名非肺癌患者(d) 根据上述数据,我们可以计算比值比如下: OR = (200/100) / (50/250) = 4 这表示吸烟与肺癌之间的关联程度为4,即吸烟者患肺癌的几率是非吸烟者的4倍。
-
MCPyV sT诱导的转录改变与激活性组蛋白修饰相关。
基于先前发表的有关sT干扰转录过程的数据,例如sT通过与MYC-MAX-EP400转录因子复合物在特定基因启动子上的结合来触发转录的激活 [24],或者通过转激活赖氨酸特异性去甲基化酶LSD-1抑制复合物而导致转录抑制 [1, 33],我们进一步研究了组蛋白修饰在sT依赖的差异表达基因的启动子调控中的作用。我们在表达sT的nHDF细胞中使用针对H3K4me3和H3K27ac的抗体进行ChIP-Seq实验,这两种抗体在活性基因启动子中均存在。此外,我们使用针对H3K27me3和H3K9me3的抗体来识别顺式异染色质和构成性异染色质的变化。大部分H3K4me3的变化发生在启动子区域,而差异化的H3K27ac、H3K27me3和H3K9me3信号则在基因体内或基因间区域内检测到。我们只观察到少数几个具有抑制性组蛋白修饰标记的变化,log2FC <-1; >1相反地,我们检测到大量具有激活性组蛋白修饰(H3K4me3和H3K27ac)变化的基因(图5A,附表S6)。有趣的是,仅有19个注释基因检测到H3K4me3的增加(log2FC >1),而678个基因检测到H3K4me3的减少。同样,H3K27ac增加了1707个基因的修饰,而减少了6853个基因的修饰(图5A)。
对比转录状态和每种分析的组蛋白修饰的富集度,发现H3K4me3、H3K27ac和H3K27me3之间存在较强的相关性,但H3K9me3没有(图5B)。为了测试GO术语中的基因是否与组蛋白修饰信号的变化相关,我们量化了所有贡献于特定GO术语的基因的转录起始位点(TSSs)(<= 10 kb)周围的组蛋白修饰信号,这些基因列在附表S6中。我们发现,sT下调基因与天然免疫过程、转录调控和细胞粘附等GO术语的减少水平的H3K4me3和H3K27ac水平相关。然而,在这些特定基因集的启动子附近或位置上,我们没有检测到任何抑制性组蛋白修饰标记。相反,sT上调基因和富集在细胞周期与分裂以及蛋白质生物合成等GO术语中的基因在激活性组蛋白修饰中没有显示出显著增加。这些观察结果也适用于一个随机选择的在sT表达下没有显著变化的基因子集(图5C)。对于nHDF的供体II获得的数据进行分析时,得到了类似的结果(附图S7)。
-
MCPyV sT通过特异性靶向ISGF3来抑制ISG转录,而不干扰ISGF3的功能。
转录组分析显示,MCPyV sT抑制多种ISG的转录,这些ISG的激活是由包括pSTAT1、pSTAT2和IRF9的ISGF3转录因子复合物介导的。在定位到细胞核后,ISGF3结合ISG启动子中的ISRE并激活其转录。另外,pSTAT1可以二聚化并导致一些ISG启动子中GAS元件的激活。基于sT下调的ISGF3依赖基因的强烈信号,我们假设sT可能通过特异性靶向ISGF3来抑制ISG转录。我们通过RT-qPCR验证了ISGF3成员IRF9和STAT1的mRNA水平以及OAS2和IRF7作为ISG下游表达的代表性基因的显著降低(图6A)。此外,免疫荧光研究显示,在表达sT的细胞中,整体IRF9信号减弱,并且与向量对照相比,表达sT的细胞中3天后核内的IRF9信号显著减弱(图6B)。然而,当我们对转染sT并在IFNbeta处理16小时的H1299细胞进行细胞分离时,我们并未发现核内或细胞质中IRF9水平的改变(图S8,lane 10-12和4-6),这表明sT并不干扰IRF9的核转位。当我们检测STAT1的磷酸化时,在2天后与向量对照相比,sT表达的nHDFs中仅有轻微的降低。此外,IRF9的蛋白浓度在2天后也稍有降低。然而,在8天后观察到IRF9蛋白水平的显著降低(图6C)。这些结果强烈表明,sT通过在转录水平上影响ISGF3成员的表达,进而在后续时间点降低总蛋白水平(图6C)。
-
MCPyV sT通过干扰TBK1水平及上游来对抗I型干扰素(IFN)信号。
MCPyV sT在细胞核中具有许多功能,如转录调控,例如通过招募转录因子和染色质重构复合物EP400/MYC/MAX。然而,它也具有胞质功能,包括与适配蛋白NF-κB必需调节因子(NEMO)蛋白的结合,导致炎症基因的表达抑制。为了澄清sT在启动子水平上的转录调控是否直接导致干扰素刺激基因(ISG)的转录抑制,还是由于IFNARI/II的上游,我们在H1299细胞中进行了基于荧光素酶的启动子实验,使用人类IFNbeta启动子和三个代表性的ISG启动子:人类IFIT1和IFIT2启动子以及小鼠MX1启动子。在这里,我们调查了当IFNARI/II通路被刺激时,sT是否可以对抗ISG启动子的活性。与已知的IFNAR信号拮抗剂小鼠巨细胞病毒CMV-M27蛋白不同[12],过表达未标记或带V5标记的MCPyV sT并未显著降低H1299细胞中ISG启动子的活性(图7A-C)。
接下来,我们评估了人类IFNbeta启动子,该启动子包含多个转录因子的结合位点,包括NF-κB、IRF3或IRF7。当过表达一个构成活化的IRF3形式(IRF3-5D)[34]时,在表达sT的H1299细胞中,并未观察到IFNbeta启动子活性的显著降低,这表明sT并不干扰IRF3驱动的IFNbeta启动子活化(图7D)。有趣的是,当我们过表达TBK1,这是IRF3上游的关键信号因子时,我们发现IFNbeta启动子的激活显著降低,未标记的sT与带有V5标记的sT相比,降低更为明显(图7E),这表明sT干扰IFNbeta信号在TBK1上游或TBK1水平。同样地,当表达已知在TBK1水平调节I型IFN响应的人类巨细胞病毒UL35时,在共表达TBK1时,IFNbeta启动子活性显著降低(图7E),但在表达IRF3-5D时没有影响(图7D)。如预期,在两个实验中,作为已知IRF3和RIG-I拮抗剂的流感A病毒(IAV)NS1蛋白都对IFNbeta启动子产生显著影响(图7D-E)。
TBK1是活化的胞质模式识别受体(PRR)刺激下游的关键激酶,例如通过cGAS-STING介导的DNA感知和RIG-I-MAVS介导的RNA感知。为了研究MCPyV sT表达对TBK1下游信号事件的影响,我们检查了cGAS-STING和RIG-I信号传导,这两个信号都汇合于激活TBK1。我们在HEK-293细胞中过表达cGAS-STING或RIG-I,并使用IFNbeta启动子进行荧光素酶活性测定。共表达sT导致DNA(图7F)和RNA(图7G)激活信号的荧光素酶活性显著降低。类似的效应也观察到了IAV NS1蛋白,验证了它已知的拮抗IRF3和RIG-I的能力[36, 37]。此外,在HEK293细胞中过表达TBK1也证实了MCPyV sT显著降低IFNbeta启动子的活性(图7H),从而表明sT对不同细胞系中的IFNbeta启动子具有一致的调控。
为了研究BKPyV sT对IFNbeta和ISG启动子活性的影响,已报道其干扰RIG-I信号[38],我们检查了过表达带V5标记的BKPyV sT在H1299细胞中的影响。细胞分离显示,BKPyV与MCPyV sT一样,在转染后同时在胞质和细胞核分离物中表达(图S9)。同样,在表达V5标记的BKPyV sT时,我们没有观察到IFIT1、IFIT2或MX1启动子活性的显著降低。然而,在共表达TBK1时,我们观察到IFNbeta启动子活性显著降低,而在表达IRF3-5D时没有影响(图S10)。这些发现表明,BKPyV和MCPyV sT可能在干扰胞质DNA和/或RNA感知引发的I型IFN信号中具有保守的作用。
总的来说,我们的发现表明,MCPyV sT通过干扰TBK1的上游或TBK1水平的信号事件来影响I型干扰素(IFN)信号传导,特别是在胞质DNA或RNA感知的情况下。这种干扰最终导致IFNbeta的产生减少,并进而抑制ISG的产生。
-
MCPyV sT抵消了LT和LTtr对ISG转录的刺激作用。
观察到MCPyV sT干扰宿主抗病毒免疫反应,促使我们研究其在感染和发病过程中的功能后果。在感染过程中,sT和LT均表达并且对病毒基因组复制是必需的。此外,在肿瘤细胞中,sT和LTtr的表达对于细胞增殖至关重要。考虑到我们实验设置中的这两种情况,我们专注于一组基因,这些基因在与先天免疫应答、对病毒的应答和对病毒的防御等基因本体学术语中富集(图2)。我们发现这些基因展示了相反的调控模式。通过比较它们的log2FC,我们证实LT、LTtr以及sT+LT的共表达在3天后的转录过程中导致了大多数基因的显著上调。相反,仅表达sT导致这些基因的下调。此外,在8-12天后,我们观察到仅sT的表达导致了这些基因的抑制(图8)。在所示的基因本体学术语中(suppl. Tab. S4)的大部分基因被分类为干扰素刺激基因(ISGs)[39]。值得注意的是,当比较IFNalpha和IFNbeta基因的调控时,我们发现LT和LTtr在3天后特异性上调了IFNbeta的表达(图8)。为了验证所选ISG(IFI6、IRF9、ISG15、OAS2和STAT1)的相反调控,我们对RNASeq中分析的两个不同nHDF供体进行了RT-qPCR(suppl. Fig. S11A)。有趣的是,我们观察到供体I中T抗原表达效应的稍微延迟与供体II相比。然而,在PCA中,包括图8中显示的ISG子集,我们没有检测到单独的供体簇(suppl. Fig. S11B)。
总而言之,我们发现LT和LTtr在原代成纤维细胞中增强了抗病毒反应,而sT通过抑制IRF3-TBK1信号传导特异性地抑制ISG的表达。我们认为通过这种机制,sT可以在感染情况下抵消LT引起的短暂I型干扰素反应,并在MCC的情况下抵消LTtr引起的干扰素反应。
讨论
Merkel细胞多瘤病毒具有有限的编码能力,包括五种病毒蛋白和一种病毒miRNA。其中,早期蛋白sT和LT在感染和发病过程中起着关键作用,具有多功能性。在感染过程中,LT在病毒转录和病毒DNA复制中起着重要作用[2, 4, 40, 41],sT在其中发挥支持作用,尽管其确切作用尚未完全理解[2, 41]。在发病方面,目前的数据表明,sT主要推动细胞转化和疾病进展。然而,肿瘤细胞的生存能力取决于sT和肿瘤特异性的LTtr[42, 43]。
sT和LT在感染过程中以紧密调节的平衡状态共同工作,促进病毒复制。然而,在发病过程中,这种相互作用发生了显著变化,部分原因是由于LT的C-末端选择压力和LT蛋白的截短版本的表达[1, 24, 44]。为了解决先前感染模型的局限性,研究主要通过在BJ-5ta细胞、hTERT永生化成纤维细胞中进行过表达实验来研究早期病毒蛋白对宿主反应的影响[13, 45]。这些研究仅考虑了早期病毒蛋白表达的特定组合,可能不能完全代表实际的感染或转化条件。类似地,对于sT的转化特性的研究主要集中在理解sT单独调控转录激活和抑制的机制,而没有考虑LT或LTtr表达的背景[45, 46]。 在这项研究中,我们对MCPyV-T抗原存在时发生的转录和表观遗传变化进行了彻底的分析。我们的研究重点放在原代皮肤成纤维细胞上,这是一种与感染和转化相关的细胞类型[16]。通过研究仅早期抗原的效应以及它们的组合效应,我们对病毒与宿主免疫反应之间复杂的相互作用有了新的认识。sT在多个层面上操纵宿主特异性免疫反应,例如作为强效的类型I干扰素(IFN)信号的拮抗剂。在我们的实验设置中,我们观察到sT对PRR感应分子(包括TLR3、STING、TRIM56和DDX60)的基因表达进行了负调节。此外,我们还发现sT降低了ISG的表达水平,并减少了抗原呈递和抗原加工因子的水平,包括多种HLA分子和TAP1。这些发现进一步阐明了病毒与免疫系统之间的复杂相互作用,这对于建立感染以及病毒的发病过程都是重要的。
我们的研究结果确认并扩展了Krump等人最近发表的数据的范围和机制,他们在原代HDF细胞中应用了复杂而具有挑战性的MCPyV感染模型,导致细胞亚群的感染,但未普遍感染(参见补充图S1)[25, 41]。通过使用RT-qPCR,Krump等人发现,在MCPyV感染后六天,关键的抗病毒ISG和炎症细胞因子表达水平增加,STING-TBK1-IRF3成为MCPyV感染触发的重要轴。值得注意的是,在感染的早期阶段,这些基因的表达水平没有明显上调。在我们的研究中,我们提供了证据表明,sT功能是阻断诱导ISG的信号级联。具体而言,我们发现这种调控发生在细胞质中,在TBK1的水平上。我们的结果为类型I干扰素反应的触发时间延迟和激活机制提供了可能的解释。在感染的早期阶段,当sT和LT的表达水平保持平衡时,sT有效地抑制了LT引发的PRR信号传导。sT与病毒因子之间的这种动态平衡使得sT能够有效地削弱干扰素反应。然而,随着感染的进行和LT诱导病毒DNA复制,sT与其他病毒因子(如LT和新产生的病毒基因组)之间的平衡发生变化。在这个阶段,平衡转向可能触发PRR信号的因子,使得sT单独无法有效地抑制干扰素反应。事实上,已经证明BKPyV和JCPyV感染在不同细胞类型中可能引发不同的ISG反应[7, 47, 48]。例如,在某些细胞类型(如小鼠成纤维细胞或内皮血管细胞)中,BKPyV和JCPyV或BKPyV感染分别诱导ISG的表达[48]。然而,在原代RPTE细胞中,BKPyV无法诱导ISG的表达,而JCPyV可以[7, 47]。这表明这些多瘤病毒引发ISG反应的能力可能与细胞类型相关。
我们对sT调控的ISG签名进行了更详细的分析,并与IRF9敲除实验的数据集进行了比较[49]令人惊讶的是,除了ISGF3诱导的ISG之间存在高度一致性外,我们还确定了一个较小的IRF9独立调控的ISG亚组(Fig. 4,suppl. Tab. S5)。值得注意的是,在这些IRF9独立的ISG中,我们确定了PARPL和DDX3L。这两个因子已知形成一个复合物,在病毒感染引发的STAT1介导的类型I干扰素反应中作为伴侣能增强干扰素的反应。有待进一步研究来调查该复合物如何影响MCPyV引发的干扰素反应以及所涉及的分子相互作用。
在我们对sT调控的基因的转录起始位点的组蛋白修饰研究中,我们观察到sT通过失去激活标记来负调控基因表达。具体来说,我们发现H3K27乙酰化(H3K27ac)减少以及H3K4三甲基化(H3K4me3)水平降低。这些观察结果表明,sT可能参与调控开放和可及性较高的启动子,而不是在这些时间点诱导转录抑制性组蛋白修饰的招募。
与先前关于Merkel细胞癌(MCC)细胞中HLA分子下调的发现一致,我们的研究证实sT在调节各种细胞表面分子的呈现方面起到一定作用,影响转移和免疫识别[19-22]。特别是,我们能够重新确认sT在MCC肿瘤细胞上调控各种HLA分子的表达,从而影响其免疫识别,正如我们最近所展示的[22]。其他研究也显示了MCC肿瘤中HLA分子的下调[19-21]。虽然我们的工作证明了这种调控是发生在转录水平,并与激活性组蛋白修饰的丧失相关,但李和他的同事显示在HLA启动子上有一个依赖PRC1.1的H3K27me3修饰的Polycomb复合物的沉积,该修饰可以通过抑制Usp7而逆转[21]。我们观察结果与在MCC细胞系中获得的结果之间的差异可能归因于不同的细胞类型,更重要的是,MCC细胞系是长期变化的反映,其中一些变化是由其他机制调控的。
与MCC细胞系中HLA表达的失调类似,STING也被报道在MCC细胞系和MCC组织中受到抑制,类似于其他癌症类型[6, 18, 25]。我们在这里提供的证据表明,MCC和MCPyV感染中观察到的STING调控失调是sT诱导的IFN依赖基因包括STING的抑制的结果。相应地,当我们在MCC细胞系中下调sT的表达时,我们观察到STING的表达增加。sT调控STING表达的能力表明MCPyV逃避宿主免系统的能力,从而促进MCC的进展。因此,通过靶向sT或其相关的通路,以抵制病毒阳性MCC的进展,可能具有潜在的治疗意义。
虽然已经证明SV40、BKPyV和JCPyV的LT表达能在不同细胞系中诱导ISG的表达[51, 52],但我们是首次展示MCPyV LT表达具有类似的效应。值得注意的是,在先前的研究中,这种ISG反应被特别归因于LT的表达,而不是sT,这是通过cDNA过表达实验得出的结果[48]。虽然增加的ISG表达被归因于SV40 LT诱导的DNA损伤[48, 51],但我们还不知道MCPyV LT诱导的ISG反应的确切机制。因此,进一步探索这一点将是至关重要的,特别是因为MCPyV LTtr也能够诱导ISG反应,尽管与LT不同,LTtr不会增加基因毒性应激[1, 5, 24, 44]。
我们还首次展示了LT或LTtr诱导的一型干扰素反应基因表达受到sT的平衡作用,使sT成为一个关键的免疫逃逸因子。如果有一个全程感染的模型,未来的研究将有必要阐明病毒PAMPs以及sT在感染周期中的全面功能。有趣的是,最近的研究表明JCPyV和BKPyV的sT,但不包括SV40和MCPyV的sT,抑制TRIM25诱导的RIG-I激活以扰乱先天免疫反应,这表明病毒RNA中间体可能是潜在的PAMPs[38]。在我们的实验设置中,慢病毒转导诱导了一型干扰素反应,而LT/LTtr进一步增加了该反应,而sT则扰乱了该反应。我们的IFNbeta和ISG启动子的荧光酶分析以及sT的过表达结果证实了这一点,表明MCPyV和BKPyV的sT能够拮抗TBK1介导的信号传导,暗示着细胞质DNA和RNA的PAMPs识别。
我们的研究的局限性包括由于缺乏感染模型(suppl. Fig. S1),我们进行了过表达实验。然而,我们进行了慢病毒转导,从而避免了非生理性高水平的病毒蛋白表达和随后的细胞反应改变。还可能存在由于供体间的差异而产生的其他限制。这些变异,特别是在免疫
反应方面的差异可能会影响实验结果。因此,我们选取了两个独立供体作为nHDF细胞的来源,在免疫反应方面,我们只在供体I和II之间检测到微小差异。我们在转录组分析中也没有包括其他细胞类型,这些细胞类型可能对病毒蛋白呈现不同的反应,因此这些结果可能不适用于不同的细胞类型或组织。然而,我们已经确认了荧光酶分析的结果,以阐明sT在H1299和293细胞中参与一型干扰素反应的功能,从而证明了sT在多个细胞系中的功能。
这项研究首次显示sT是MCPyV中拮抗一型干扰素信号传导的介质。此外,sT通过IRF3-TBK1通路有效调节核酸诱导的识别途径,并抑制了IFNbeta和ISG的产生。我们通过BKPyV sT的确认结果表明,PyV的sT抗原在微调宿主对病毒复制产物的先天免疫反应方面具有保守的作用。
我们的研究结果表明,MCPyV sT抵消了LT或在肿瘤中由LTtr引起的宿主反应,使sT成为病毒通过免疫逃逸在体内持续存在的关键因素,同时也在肿瘤细胞的免疫逃逸中起到重要作用。这些发现改进了我们对PyV,尤其是MCPyV如何绕过先天免疫反应的理解,特别是在MCC中,并为MCC的治疗提供了潜在的靶点。
材料和方法
-
质粒和试剂
第三代慢病毒质粒LeGO-MCPyV sT-iC2 (sT-IRES-mCherry)的详细描述见我们之前的研究 [22]。 MCPyV LT或MCC350衍生的LTtr(MCC350)的cDNA通过限制性酶切与LeGO-iG2(eGFP)进行克隆。 MCPyV sT-co(优化密码子)[41]或BKPyV sT的cDNA通过Gibson装配克隆(ThermoFisher)插入到pcDNA3.1中。
Luc质粒:…
-
细胞培养
从两个不同人类供体购买的新生儿人皮肤成纤维细胞(nHDF)来自Lonza(#CC-2509)。HEK293细胞(ATCC),LentiX-293T细胞(ATCC),H1299细胞(ATCC)和新生儿人皮肤成纤维细胞在标准培养条件下以Dulbecco’s Modified Eagle Medium(DMEM;Gibco)补充10%胎牛血清(FBS;Gibco)和1%青霉素链霉素(Gibco)培养。
患者来源的MCC细胞系WaGa在Roswell Park Memorial Institute(RPMI;Gibco)培养基中培养。H1299细胞通过聚乙烯亚胺(PEI)转染:将10 µg DNA与不含补充物的DMEM以及PEI按1:10的比例混合,加入到10 cm培养皿中的单层细胞中。对于荧光素酶基于启动子的检测,使用Fugene HD(Promega)按照制造商的说明书,在DNA和Fugene HD的3:1比例下转染H1299或HEK293细胞。慢病毒颗粒的制备是在Lenti-X 293T细胞中进行的,方法如先前描述的那样[23]。 nHDFs的感染方法与先前描述的方法相同[16]。在WaGa细胞中使用shRNA介导的sT敲低方法与最近发表的方法相同[22]。
-
nHDFs的慢病毒转导
将nHDFs种植在6孔板中,并以MOI为1的慢病毒颗粒转导。为了增加病毒摄取,添加聚苯乙烯磺胺(Polybrene,Sigma)(最终浓度为8 µg/ml),并在37°C下以1000 xg离心30分钟。转导后的两天,对nHDFs进行荧光激活细胞分选(FACS),并继续培养。
-
基于荧光素酶的启动子测定
将H1299或HEK293细胞种植在96孔板中,并共转染10 ng pEF1α-RL,100 ng荧光素酶基于pIFNβ-FL或pISG-FL报告质粒,以及100 ng MCPyV sT,BKPyV sT,流感A病毒(IAV)NS1,人巨细胞病毒(CMV)UL35或小鼠CMV M27。对于每个实验,不同的刺激物要么共转染,要么在稍后的时间点添加,如下所述:
- ISG启动子测定:共转染pGL3basic-hIFIT1-Luc,pGL3basic-hIFIT2-Luc或pGL3basic-mMX1-Luc,并在转染后24小时加入人重组IFNβ(1 ng/ml;PeproTech,#300-02BC)进行16小时,然后在1×被动裂解缓冲(PLB;Promega)中裂解细胞。
- IFNβ启动子测定:将pGL3basic-IFNβ-Luc与100 ng TBK1或IRF3-5D共转染。另外,转染混合物中还包括100 ng RIG-I-N或50 ng cGAS和50 ng STING。转染后20小时,细胞在1× PLB中裂解。
使用双荧光素酶报告试剂盒系统(Promega)按照制造商的说明测量荧光素酶活性。在Tecan Infinite® 200 Pro微孔读数器(Tecan,Männedorf,瑞士)中测量荧光素酶信号。萤火虫荧光素酶信号标准化到Renilla荧光素酶信号。
-
西方印迹分析
使用RIPA缓冲液(50 mM Tris-HCl,pH 7.5,150 mM NaCl,1 mM EGTA,1%NP-40,0.5%Na-脱氧胆酸,0.1%SDS,1 mM NaF,2 mM β-甘油磷酸,1 mM Na3VO4,0.4 mM PMSF,cOmplete蛋白酶抑制剂混合物(Roche))进行细胞裂解,如先前描述的方法[23]。或者,将细胞在1× PLB中裂解20分钟,然后在4°C下以11,000 xg离心10分钟。使用核和细胞质提取试剂盒(ThermoFisher)进行细胞亚细胞分离。每个SDS凝胶中加载30-50 µg蛋白质,并使用以下一抗体进行Western印迹:鼠抗体,α-LT Cm2B4(1:1000;Santa Cruz),α-sT 2T2 [53]和α-actin C4(1:10,000;Chemicon)。此外,使用兔抗体,α-pSTAT1(1:1000;Cell Signaling),α-STAT1(1:000;Cell Signaling),α-IRF9(1:1000;Cell Signaling),α-V5(1:1000;ThermoFisher),α-Lamin B1(1:2000;abcam,由Stefan Linder提供),以及α-HSP90(1:1000;abcam)。次级抗体使用如下:辣根过氧化物酶(HRP)结合的鼠IgG(1:10,000;GE Healthcare)或兔IgG(1:3000;Cell Signaling)抗体。
-
免疫荧光染色和共聚焦显微镜
对于免疫荧光分析,nHDFs生长在明胶涂层的玻璃片上。使用4%PFA固定细胞,并使用以下一抗体进行免疫荧光染色:鼠抗体α-LT Cm2B4(1:1000;Santa Cruz),兔抗体α-VP1(1:500;Christopher B. Buck,NCI)和α-IRF9(1:1000;Cell Signaling)。作为二级抗体,使用Alexa-Fluor标记的α兔IgG-647,α兔IgG-560或α鼠IgG-488,浓度为1:500。使用共聚焦激光扫描显微镜(DMI 6000,Leica TCS SP5双层扫描器,Leica,Wetzlar,德国)获取图像,并使用油浸40HCX或63HCX PL CS plan-apo目镜。图像分析使用Volocity Demo Version 6.1.1(Perkin Elmer,Waltham,MA)进行。使用QuPath软件[54]对IRF9核信号进行定量分析。从每个条件中分析50-150个细胞,至少分析三个玻璃片。
-
定量实时聚合酶链反应(qPCR)
用RNeasy提取试剂盒(Qiagen)和DNA- free™ DNA removal kit(Invitrogen)从细胞中提取总RNA。对于逆转录(RT)-qPCR,使用Superscript IV(Invitrogen)按照制造商的说明将1.7 µg RNA转录为cDNA。使用SYBR Green(Thermo Fisher)进行定量实时聚合酶链反应。
- HPRT:5′-TGACCTTGATTTATTTTGCATACC-3’(前),5′-CTCGAGCAAGACGTTCAGTC-3’(后),
- TBP:…
此外,使用以下探针进行TaqMan实验(Thermo Fisher):MCPyV VP1:VIC-GTGGCGCTGAGTACGTCGTGGAGTC,人类GAPDH:6FAM-GATCTGGAGATGATCCCTTTGGCTG。
反应在Rotor-Gene Q-plex(Qiagen)中进行。使用Rotor-Gene Q Series软件(版本2.3.1)对数据进行分析,并以至少两个参考基因(包括TBP、HPRT和GAPDH)进行归一化。通过qPCR分析感染nHDFs培养液中的MCPyV基因组拷贝数,以VP1与GAPDH归一化。本研究使用的引物和探针列在补充表格S7中。
-
RNA测序
使用Bioanalyzer和RNA Nano试剂盒(Agilent)检测分离的RNA。对通过RNA完整性评分(RIN)为0.7的所有样本,进行1µg RNA的RNA-Seq文库制备。使用NEBNext Poly(A)磁性分离模块(NEB)进行mRNA富集。根据制造商的方案,使用NEXTFlex快速定向qRNA-Seq试剂盒(Bioo Scientific)进行RNA文库制备。采用Illumina HiSeq 2500平台(SR50)进行测序。使用STAR 2.6.0c工具将测序读数映射到人类基因组(hg19)。采用DESeq2进行差异基因表达(DEG)分析,如前所述。使用DAVID工具进行基因本体(GO)分析,使用padj <0.05和log2FC >1/<-1的DEGs。选择与生物过程相关的GO术语,其FDR或p值<0.05,并且在GO术语中至少包含五个基因。GO术语分析的所有未经筛选的表格均显示在补充表S3中。使用R Studio生成火山图、热图和气泡图。使用Python创建PCA图。
IRF9 ko比较
-
染色质免疫沉淀测序
进行原代染色质免疫沉淀(nChIP),根据[58]的描述,每个IP使用20万个nHDFs作为起始材料。以下抗体用于nChIP反应:兔mAb α-H3K4me3(1:200;Merck Millipore)、α-H3K27me3(1:300;Cell Signaling)、α-H3K9me3(1:200;Active Motif)、α-H3K27ac(1:200;abcam)和IgG(1:200;Merck Millipore)。根据制造商的说明,使用NEXTFlexTM ChIP-Seq文库制备试剂盒(Bioo Scientific)制备提取的DNA的文库。样品在Illumina HiSeq2500上进行测序,使用BWA工具[59]将读数映射到hg19。使用MACS2 [60]进行H3K4me3和H3K27ac峰值调用,使用SICER [61]调用更广泛的峰值,例如H3K9me3和H3K27me3。根据[62]的描述进行diffReps分析。分析和可视化使用R Studio和EaSeq [63]进行。
-
统计分析
使用GraphPad Prism(版本9.0,GraphPad Software,San Diego,CA)进行统计分析。
Draw Venn Diagrams using matplotlib
#peaks_PFSK-1.txt peaks_HEK293.txt peaks_NHDF.txt Total Name
# X 950 peaks_NHDF.txt
# X 964 peaks_HEK293.txt
# X X 68 peaks_HEK293.txt|peaks_NHDF.txt
#X 6155 peaks_PFSK-1.txt
#X X 653 peaks_PFSK-1.txt|peaks_NHDF.txt
#X X 920 peaks_PFSK-1.txt|peaks_HEK293.txt
#X X X 267 peaks_PFSK-1.txt|peaks_HEK293.txt|peaks_NHDF.txt
#-------
import matplotlib.pyplot as plt
from matplotlib_venn import venn3
# Define the sizes of the sets
set1_only = 950 # Size of NHDF
set2_only = 964 # Size of HEK293
set3_only = 6155 # Size of PFSK-1
# Define the sizes of the overlapping regions
shared_elements_12 = 68 # Size of the overlap between set 1 and set 2
shared_elements_13= 653 # Size of the overlap between set 1 and set 3
shared_elements_23 = 920 # Size of the overlap between set 2 and set 3
shared_elements_123 = 267 # Size of the overlap between all three sets
## Define the sizes of the datasets
#set1_size = 100
#set2_size = 80
#set3_size = 60
#shared_elements_12 = 30
#shared_elements_13 = 20
#shared_elements_23 = 15
#shared_elements_123 = 10
## Calculate the sizes of the individual and overlapping sets
#set1_only = set1_size - shared_elements_12 - shared_elements_13 - shared_elements_123
#set2_only = set2_size - shared_elements_12 - shared_elements_23 - shared_elements_123
#set3_only = set3_size - shared_elements_13 - shared_elements_23 - shared_elements_123
# Create the Venn diagram
venn3(subsets=(set1_only, set2_only, shared_elements_12, set3_only, shared_elements_13, shared_elements_23, shared_elements_123), set_labels=('NHDF LT', 'HEK293 LT+sT', 'PFSK-1 LT+sT'))
#venn3(subsets=(set1_size, set2_size, set1_set2_size, set3_size, set1_set3_size, set2_set3_size, set1_set2_set3_size), set_labels=('NHDF', 'HEK293', 'PFSK-1'))
# Set the title and labels
plt.title('', fontsize=16) #Venn Diagram of Three Datasets
plt.xlabel('Samples')
plt.ylabel('Count')
# Show the Venn diagram
#plt.show()
plt.savefig('Venn_Diagram_NHDF_vs_HEK293_vs_PFSK-1.png', dpi=300, bbox_inches='tight')
#peaks_K331A.txt peaks_NHDF.txt Total Name
# X 1818 peaks_NHDF.txt
#X 261 peaks_K331A.txt
#X X 120 peaks_K331A.txt|peaks_NHDF.txt
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
# Define the sizes of the datasets
set1_size = 1938
set2_size = 381
shared_elements = 120
# Calculate the sizes of the individual and overlapping sets
set1_only = set1_size - shared_elements
set2_only = set2_size - shared_elements
# Create the Venn diagram
venn2(subsets=(set1_only, set2_only, shared_elements), set_labels=('NHDF LT', 'NHDF LT K331A'))
# Set the title and labels
plt.title('') #NHDF LT vs NHDF K331A
plt.xlabel('Number of Elements')
plt.ylabel('')
# Display the Venn diagram
#plt.show()
plt.savefig('Venn_Diagram_NHDF-LT_vs_NHDF-LT-K331A.png', dpi=300, bbox_inches='tight')
## under directory "results_ChIPseq_NHDF_LT_K331A_PFSK-1_HEK293_hg38/homer"
#grep "NHDF_K331A_DonorI/peaks.txt" peaks_K331A_LT.txt | wc -l # 224-186=38
#grep "NHDF_K331A_DonorII/peaks.txt" peaks_K331A_LT.txt | wc -l # 343-186=157
#grep "NHDF_K331A_DonorI/peaks.txt|NHDF_K331A_DonorII/peaks.txt" peaks_K331A_LT.txt | wc -l #186
##38+157+186=381
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(38, 157, 186), set_labels=('Donor I', 'Donor II'))
plt.title('Peak Consistency Between NHDF LT K331A Donors (Total 381 peaks)')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Venn_Diagram_Peak_Consistency_Between_NHDF-LT-K331A_Donors.png', dpi=300, bbox_inches='tight')
#grep "NHDF_LT_DonorI/peaks.txt" peaks_NHDF_LT.txt | wc -l # 1440-969=471
#grep "NHDF_LT_DonorII/peaks.txt" peaks_NHDF_LT.txt | wc -l # 1467-969=498
#grep "NHDF_LT_DonorI/peaks.txt|NHDF_LT_DonorII/peaks.txt" peaks_NHDF_LT.txt | wc -l #969
##471+498+969=1938
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(471, 498, 969), set_labels=('Donor I', 'Donor II'))
plt.title('Peak Consistency Between NHDF LT Donors (Total 1938 peaks)')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Venn_Diagram_Peak_Consistency_Between_NHDF-LT_Donors.png', dpi=300, bbox_inches='tight')
#grep "PFSK-1B_LT+sT_r1/peaks.txt" peaks_PFSK-1_LT+sT.txt | wc -l # 4814-2927=1887
#grep "PFSK-1B_LT+sT_r2/peaks.txt" peaks_PFSK-1_LT+sT.txt | wc -l # 6108-2927=3181
#grep "PFSK-1B_LT+sT_r1/peaks.txt|PFSK-1B_LT+sT_r2/peaks.txt" peaks_PFSK-1_LT+sT.txt | wc -l #2927
##1887+3181+2927=7995
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(1887, 3181, 2927), set_labels=('Replicate 1', 'Replicate 2'))
plt.title('Peak Consistency Between PFSK-1 LT+sT Replicates (Total 7995 peaks)')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Venn_Diagram_Peak_Consistency_Between_PFSK-1-LT+sT_Replicates.png', dpi=300, bbox_inches='tight')
#grep "HEK293_LT+sT_r2/peaks.txt" peaks_HEK293_LT+sT.txt | wc -l # 1896-580=1316
#grep "HEK293_LT+sT_r3/peaks.txt" peaks_HEK293_LT+sT.txt | wc -l # 903-580=323
#grep "HEK293_LT+sT_r2/peaks.txt|HEK293_LT+sT_r3/peaks.txt" peaks_HEK293_LT+sT.txt | wc -l #580
##1316+323+580=2219
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
venn2(subsets=(1316, 323, 580), set_labels=('Replicate 1', 'Replicate 2'))
plt.title('Peak Consistency Between HEK293 LT+sT Replicates (Total 2219 peaks)')
plt.xlabel('Number of Elements')
plt.ylabel('')
plt.savefig('Venn_Diagram_Peak_Consistency_Between_HEK293-LT+sT_Replicates.png', dpi=300, bbox_inches='tight')
# Draw Venn Diagram using Set rather than Set Size
##38+186+157=381
#import matplotlib.pyplot as plt
#from matplotlib_venn import venn2
#
#set1 = set(range(1, 225))
#set2 = set(range(39, 382))
#
#venn2([set1, set2], set_labels=('Replicate 1', 'Replicate 2'), set_colors=('skyblue', 'lightgreen'))
#plt.title('Peak Consistency Between NHDF K331A Replicates (total 381 peaks)', fontsize=16)
#plt.xticks(fontsize=12)
#plt.yticks(fontsize=12)
#
##plt.show() # Display the Venn diagram
#plt.savefig('Venn_Diagram_Peak_Consistency_Between_NHDF_K331A_Replicates.png') # Save the Venn diagram as an image file
Install and configure Docker for ‘nextflow run nf-core/rnaseq’
Install and configure that the packages for Docker (docker-ce, docker-ce-cli, and containerd.io):
-
Update your system’s package information:
sudo apt-get update
-
Uninstall any old versions of Docker:
sudo apt-get remove docker docker-engine docker.io containerd runc
-
Set up the Docker repository to get the latest version of Docker. First, install packages to allow apt to use a repository over HTTPS:
sudo apt-get install apt-transport-https ca-certificates curl gnupg lsb-release
-
Add Docker’s official GPG key:
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
-
Use the following command to set up the stable repository (Note the use of the signed-by option to point to the keyring file that you have just downloaded):
echo \ "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \ $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null
-
Update the apt package index again, and then install Docker:
sudo apt-get update sudo apt-get install docker-ce docker-ce-cli containerd.io
The Docker installation includes the Docker service (daemon), which allows you to start containers, and the Docker CLI client. The Docker CLI uses the Docker API to interact with the Docker service.
-
Finally, add your user to the Docker group with the command. The following command cannot solve permission errors, rather than running step 8 can solve the problem.
sudo usermod -aG docker ${USER}
-
Run nf-core/rnaseq under dock
sudo chmod 666 /var/run/docker.sock xxxxx@xxx:~/DATA/Data_Manja_RNAseq_Organoids$ nextflow run nf-core/rnaseq --input samplesheet.csv --outdir results_GRCh38 --with_umi --umitools_bc_pattern 'NNNNNNNNNNNN' --fasta "/home/jhuang/REFs/Homo_sapiens/Ensembl/GRCh38/Sequence/WholeGenomeFasta/genome.fa" --gtf "/home/jhuang/REFs/Homo_sapiens/Ensembl/GRCh38/Annotation/Genes/genes.gtf" --skip_rseqc --skip_dupradar --skip_preseq -profile docker -resume
-
During the installation of singularity, we got a error as follows.
sudo apt-get update && sudo apt-get instal -y \ build-essential \ libssl-dev \ uuid-dev \ libgpgme11-dev \ squashfs-tools \ libseccomp-dev \ wget \ pkg-config \ git \ cryptsetup ... Processing triggers for initramfs-tools (0.130ubuntu3.6) ... update-initramfs: Generating /boot/initrd.img-5.4.0-150-generic I: The initramfs will attempt to resume from /dev/dm-9 I: (UUID=698e2e03-7762-4764-b890-86d234beb938) I: Set the RESUME variable to override this.
The changes here are related to enabling your system to resume from a specific device after suspend. The line “The initramfs will attempt to resume from /dev/dm-9” indicates that if your system goes into a suspend or hibernate state, it’ll try to resume the system state from the device /dev/dm-9. This setting is typically safe and should not cause any problems.
PiCRUST2 Pipeline for Functional Prediction and Pathway Analysis in Metagenomics
How to run the software package PiCRUST2 (Phylogenetic Investigation of Communities by Reconstruction of Unobserved States version 2), and to visualize its output data using STAMP (Statistical Analysis of Metagenomic Profiles) and ALDEx2 (Analysis of Differential Abundance taking sample Variation into Account version 2) for Functional Prediction and Pathway Analysis in Metagenomics.
-
Environment Setup: It sets up a Conda environment named picrust2, using the conda create command and then activates this environment using conda activate picrust2.
#https://github.com/picrust/picrust2/wiki/PICRUSt2-Tutorial-(v2.2.0-beta)#minimum-requirements-to-run-full-tutorial conda create -n picrust2 -c bioconda -c conda-forge picrust2 #=2.2.0_b conda activate picrust2
-
Data Preparation: The script creates a new directory called picrust2_out, then enters it using mkdir and cd commands. It then identifies input files that are needed for the analysis: metadata.tsv, seqs.fna, table.biom. The biom commands are used to inspect and convert the BIOM format files.
mkdir picrust2_out cd picrust2_out # Identifying input data # Note: Replace the paths and filenames with your actual data if different # metadata.tsv == ../map_corrected.txt # seqs.fna == ../clustering/seqs.fna # table.biom == ../core_diversity_e42369/table_even42369.biom # Inspect and convert the BIOM format files biom head -i ../core_diversity_e42369/table_even42369.biom biom summarize-table -i ../core_diversity_e42369/table_even42369.biom biom convert -i ../core_diversity_e42369/table_even42369.biom -o table_even42369.tsv --to-tsv
-
Running PiCRUST2: The place_seqs.py command aligns the input sequences to a reference tree. The hsp.py commands generate hidden state prediction for multiple functional categories.
#insert reads into reference tree using EPA-NG cp ../clustering/rep_set.fna ./ grep ">" rep_set.fna | wc -l #44238 vim table_even42369.tsv #40596-2 samtools faidx rep_set.fna cut -f1-1 table_even42369.tsv > table_even42369.id #manually modify table_even42369.id by replacing "\n" with " >> seqs.fna\nsamtools faidx rep_set.fna " run table_even42369.id rm -rf intermediate/ place_seqs.py -s seqs.fna -o out.tre -p 4 --intermediate intermediate/place_seqs #castor: Efficient Phylogenetics on Large Trees #https://github.com/picrust/picrust2/wiki/Hidden-state-prediction hsp.py -i 16S -t out.tre -o 16S_predicted_and_nsti.tsv.gz -p 15 -n hsp.py -i COG -t out.tre -o COG_predicted.tsv.gz -p 15 hsp.py -i PFAM -t out.tre -o PFAM_predicted.tsv.gz -p 15 hsp.py -i KO -t out.tre -o KO_predicted.tsv.gz -p 15 hsp.py -i EC -t out.tre -o EC_predicted.tsv.gz -p 15 hsp.py -i TIGRFAM -t out.tre -o TIGRFAM_predicted.tsv.gz -p 15 hsp.py -i PHENO -t out.tre -o PHENO_predicted.tsv.gz -p 15
In this table the predicted copy number of all Enzyme Classification (EC) numbers is shown for each ASV. The NSTI values per ASV are not in this table since we did not specify the -n option. EC numbers are a type of gene family defined based on the chemical reactions they catalyze. For instance, EC:1.1.1.1 corresponds to alcohol dehydrogenase. In this tutorial we are focusing on EC numbers since they can be used to infer MetaCyc pathway levels (see below).
zless -S EC_predicted.tsv.gz sequence EC:1.1.1.1 EC:1.1.1.10 EC:1.1.1.100 ... 20e568023c10eaac834f1c110aacea18 2 0 3 ... 23fe12a325dfefcdb23447f43b6b896e 0 0 1 ... 288c8176059111c4c7fdfb0cd5afce64 1 0 1 ... ... ##Why import the tsv file to MyData? #MyData <- read.csv(file="./COG_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 4598 e.g. COG5665 #MyData <- read.csv(file="./PFAM_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 11089 e.g. PF17225 #MyData <- read.csv(file="./KO_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 10543 e.g. K19791 #MyData <- read.csv(file="./EC_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 2913 e.g. EC.6.6.1.2 #MyData <- read.csv(file="./16S_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 1 e.g. X16S_rRNA_Count #MyData <- read.csv(file="./TIGRFAM_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 4287 e.g. TIGR04571 #MyData <- read.csv(file="./PHENO_predicted.tsv", header=TRUE, sep="\t", row.names=1) #6806 41 e.g. Use_of_nitrate_as_electron_acceptor, Xylose_utilizing
-
The metagenome_pipeline.py commands perform metagenomic prediction for several functional categories. Predicted gene families weighted by the relative abundance of ASVs in their community. In other words, we are interested in inferring the metagenomes of the communities.
#Generate metagenome predictions using EC numbers https://en.wikipedia.org/wiki/List_of_enzymes#Category:EC_1.1_(act_on_the_CH-OH_group_of_donors) metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f COG_predicted.tsv.gz -o COG_metagenome_out --strat_out metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f EC_predicted.tsv.gz -o EC_metagenome_out --strat_out metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f KO_predicted.tsv.gz -o KO_metagenome_out --strat_out metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f PFAM_predicted.tsv.gz -o PFAM_metagenome_out --strat_out metagenome_pipeline.py -i ../core_diversity_e42369/table_even42369.biom -m 16S_predicted_and_nsti.tsv.gz -f TIGRFAM_predicted.tsv.gz -o TIGRFAM_metagenome_out --strat_out
-
Pathway-level inference: By default this script infers MetaCyc pathway abundances based on EC number abundances, although different gene families and pathways can also be optionally specified. This script performs a number of steps by default, which are based on the approach implemented in HUMAnN2:
- Regroups EC numbers to MetaCyc reactions.
- Infers which MetaCyc pathways are present based on these reactions with MinPath.
-
Calculates and returns the abundance of pathways identified as present.
#pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_contrib.tsv.gz -o pathways_out -p 15 #Note that the path of map files is under /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles (picrust2) pathway_pipeline.py -i COG_metagenome_out/pred_metagenome_contrib.tsv.gz -o KEGG_pathways_out -p 15 --no_regroup --map /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles/KEGG_pathways_to_KO.tsv #Mapping predicted KO abundances to legacy KEGG pathways (with stratified output that represents contributions to community-wide abundances): (picrust2) pathway_pipeline.py -i KO_metagenome_out/pred_metagenome_strat.tsv.gz -o KEGG_pathways_out --no_regroup --map /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/pathway_mapfiles/KEGG_pathways_to_KO.tsv #Map EC numbers to MetaCyc pathways and get stratified output corresponding to contribution of predicted gene family abundances within each predicted genome: #BUG: CANNOT FINISH in 1 day! (picrust2) pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -o pathways_out_per_seq --per_sequence_contrib --per_sequence_abun EC_metagenome_out/seqtab_norm.tsv.gz --per_sequence_function EC_predicted.tsv.gz (picrust2) pathway_pipeline.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -o pathways_out -p 6
-
Add functional descriptions: Finally, it can be useful to have a description of each functional id in the output abundance tables. The below commands will add these descriptions as new column in gene family and pathway abundance tables
#--6.1. Add descriptions in gene family tables add_descriptions.py -i COG_metagenome_out/pred_metagenome_unstrat.tsv.gz -m COG -o COG_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz add_descriptions.py -i EC_metagenome_out/pred_metagenome_unstrat.tsv.gz -m EC -o EC_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz add_descriptions.py -i KO_metagenome_out/pred_metagenome_unstrat.tsv.gz -m KO -o KO_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz # EC and METACYC is a pair, EC for gene_annotation and METACYC for pathway_annotation add_descriptions.py -i PFAM_metagenome_out/pred_metagenome_unstrat.tsv.gz -m PFAM -o PFAM_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz add_descriptions.py -i TIGRFAM_metagenome_out/pred_metagenome_unstrat.tsv.gz -m TIGRFAM -o TIGRFAM_metagenome_out/pred_metagenome_unstrat_descrip.tsv.gz #--6.2. Add descriptions in pathway abundance tables add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -m METACYC -o pathways_out/path_abun_unstrat_descrip.tsv.gz #Error - no rows remain after regrouping input table. The default pathway and regroup mapfiles are meant for EC numbers. Note that KEGG pathways are not supported since KEGG is a closed-source database, but you can input custom pathway mapfiles if you have access. If you are using a custom function database did you mean to set the --no-regroup flag and/or change the default pathways mapfile used? #If ERROR --> USE the METACYC for downstream analyses!!! add_descriptions.py -i pathways_out/path_abun_unstrat.tsv.gz -o KEGG_pathways_out/path_abun_unstrat_descrip.tsv.gz --custom_map_table /home/jhuang/anaconda3/envs/picrust2/lib/python3.6/site-packages/picrust2/default_files/description_mapfiles/KEGG_pathways_info.tsv.gz
-
Visualization
#7.1 install and open STAMP #https://github.com/picrust/picrust2/wiki/STAMP-example #install and open STAMP conda deactivate conda install -c bioconda stamp #sudo pip install pyqi #sudo apt-get install libblas-dev liblapack-dev gfortran #sudo apt-get install freetype* python-pip python-dev python-numpy python-scipy python-matplotlib #sudo pip install STAMP #conda install -c bioconda stamp conda create -n stamp -c bioconda/label/cf201901 stamp brew install pyqt #DEBUG the environment conda install pyqt=4 #conda install icu=56 (stamp) jhuang@hamburg:~$ STAMP #7.2 unzip path_abun_unstrat_descrip.tsv.gz gunzip path_abun_unstrat_descrip.tsv.gz #7.3 prepare metadata.tsv from map_corrected.txt vim metadata.tsv #SampleID Genotype Description #S1 Before_non-reducers IDFrancesco1 #S2 After_non-reducers IDFrancesco2 #S3 Before_Reducers IDFrancesco4 #S4 After_Reducers IDFrancesco5 cut -d$'\t' -f1 map_corrected.txt > 1 cut -d$'\t' -f5 map_corrected.txt > 5 cut -d$'\t' -f6 map_corrected.txt > 6 paste -d$'\t' 1 5 > 1_5 paste -d$'\t' 1_5 6 > metadata.tsv #SampleID --> SampleID SampleID Facility Genotype 100CHE6KO PaloAlto KO 101CHE6WT PaloAlto WT #7.4(optional) use ALDEx2 rather than STAMP: https://bioconductor.org/packages/release/bioc/html/ALDEx2.html
-
Explanation of the generated plot from Step 7: Extended error bar.
The difference in mean proportions is a statistical measurement that is often used in comparing the proportions of a certain outcome between two groups.
Here's a simple example to explain the concept:
Imagine you conducted a survey on two groups of people, Group A and Group B, asking whether they like a specific brand of chocolate. In Group A, 70 out of 100 people said yes (proportion = 0.7). In Group B, 80 out of 100 people said yes (proportion = 0.8). The difference in proportions is 0.8 - 0.7 = 0.1. This means that in your sample, the proportion of people who like the specific brand of chocolate is 10% higher in Group B compared to Group A.
Statistically speaking, we often want to know whether this difference is significant (i.e., is it likely to be due to chance, or is there a real difference between the groups?). We can use a statistical test, such as a two-proportion z-test, to answer this question.
It's important to note that the difference in proportions is sensitive to the size of your sample. If you have very large groups, even a small difference in proportions can be statistically significant. If you have small groups, only a large difference will be statistically significant.
Evaluating the Proximity of Genomic Features (=integration sites) to Peaks Using a Permutation Test genomic features
The study revolves around the evaluation of the proximity of integration sites to peaks in the human genome, using a permutation-based approach. It involves three primary steps:
-
Observed Data: We begin by considering our observed data, which are the distances from each integration site to the nearest peak. We compute the mean of these observed distances.
-
Null Distribution: To generate a null distribution, we perform a permutation test by randomizing the integration sites. For each iteration (in this case, 1,000 iterations to create a robust distribution), we randomly select a number of integration sites equivalent to the count of unique observed distances from the human genome and calculate the distance to the nearest peak. The random integration sites are represented in a BED format (chromosome, start, end). These integration sites, termed as ‘random_integration_sites’, are chosen from defined chromosomal regions that provide the lengths of human chromosomes.
The calculation of distances proceeds as follows:
- For each feature (randomly generated integration site), we find the closest peak using the ‘closest’ function from the ‘pybedtools’ library, resulting in a ‘random_closest’ BEDTool object.
- We extract the distances from each random integration site to its closest peak. These distances are stored in ‘random_distances’.
- We then calculate the mean of these random distances and store it in ‘random_mean_distances’.
-
Comparison: We then compare our observed mean distance to the null distribution of mean distances. The output p-value serves as an indicator of statistical significance. If the p-value is small (conventionally, less than 0.05 is considered significant), it suggests that the observed mean distance is significantly different from what would be expected by random chance, indicating that the features are not randomly distributed but are likely to be located closer to peaks in the genome.
Observed mean distance: 1218081 The statistical results from the 1000 iterations of the permutation test, in which we randomly generated potential integration sites: Mean: 1445755 Standard deviation: 400673 Minimum: 509078 Maximum: 2978797 P-value: 0.313
(Optional) If your observed mean distance is smaller than the smallest mean distance from your null distribution (i.e., the 1000 permutations), this indeed suggests that your observed integration sites are significantly closer to the peaks than would be expected under the null hypothesis.
(Optional) In other words, your observed integration sites are more closely located to the peaks than random integration sites, which supports the conclusion that there is a non-random association between your integration sites and the peaks.
(Optional) A small p-value suggests that the observed mean distance is significantly different from what is expected by random chance. This could mean that the integration sites under study are preferentially located near peaks in the genome. The null distribution is the collection of mean distances we calculated for each permutation. The proportion of mean distances in the null distribution that is greater than or equal to our observed mean distance serves as our p-value.
So the p-value calculation should look something like this:
#pip install statsmodels
import numpy as np
from pybedtools import BedTool
import pprint
from statsmodels.stats.multitest import multipletests
pp = pprint.PrettyPrinter(indent=4)
#sort -k1,1 -k2,2n peaks_on_integrationsites.csv > peaks_on_integrationsites_sorted.bed
#=898046
#1406936,133333333
# Observed distances
#observed_distances = [-4045231,563541,1118767,-1779287,0,-5347653,3935720,1146367,1507718,0,-1826,-7456,81323,68056,1386933,0,-545651,-84468,-652642,351958,218160,5644455,320101,2050624,-418508,-1061416,-351892,-33175,-296551,-138858,2221723,-658351,3419047,-2701162,1295321,4712290,0,1434626,-5479512,1918341,465313,-986431,190096,-566869,-736100,3579169,1087322,-2696342,-1866390,-14123,1250899,-1424025,-929436,232285,232338,3962087,1042645,728148,-163988,-188515,-1445728,-198270,-116532,267672,924015,735666,-1705528,147724,-122133,261167]
observed_distances = [4045231,563541,1118767,1779287,0,5347653,3935720,1146367,1507718,0,1826,7456,81323,68056,1386933,0,545651,84468,652642,351958,218160,5644455,320101,2050624,418508,1061416,351892,33175,296551,138858,2221723,658351,3419047,2701162,1295321,4712290,0,1434626,5479512,1918341,465313,986431,190096,566869,736100,3579169,1087322,2696342,1866390,14123,1250899,1424025,929436,232285,232338,3962087,1042645,728148,163988,188515,1445728,198270,116532,267672,924015,735666,1705528,147724,122133,261167]
#unique_observed_distances = list(set(observed_distances))
observed_mean = np.mean(observed_distances)
print('observed_mean:', observed_mean)
# Load peak ranges from the BED file
peaks = BedTool('peaks_NHDF_.bed').sort()
# Define chrom regions
# 'chrM': 16569,
#175187.58208955225
chrom_regions = {
'chr1': 248956422, 'chr2': 242193529, 'chr3': 198295559, 'chr4': 190214555, 'chr5': 181538259,
'chr6': 170805979, 'chr7': 159345973, 'chr8': 145138636, 'chr9': 138394717, 'chr10': 133797422,
'chr11': 135086622, 'chr12': 133275309, 'chr13': 114364328, 'chr15': 101991189,
'chr16': 90338345, 'chr17': 83257441, 'chr18': 80373285, 'chr20': 64444167,
'chr21': 46709983, 'chr22': 50818468, 'chr14': 107043718, 'chr19': 58617616, 'chrX': 156040895, 'chrY': 57227415
}
# Permutation test parameters 16.4
#620708 --> 42208198/47=898046 --> 1406936
num_permutations = 1000
num_features = len(observed_distances)
random_mean_distances = []
for _ in range(num_permutations):
# Generate random integration sites
random_integration_sites = []
for _ in range(num_features):
chrom = np.random.choice(list(chrom_regions.keys()))
position = np.random.randint(0, chrom_regions[chrom])
# where the range from end to start is always 898046, meaning these represent an average length of the integration sites in the genome
random_integration_sites.append((chrom, position, position+898046))
random_integration_sites = BedTool(random_integration_sites)
random_integration_sites = random_integration_sites.sort()
#print(random_integration_sites)
# Find closest peaks for each feature
random_closest = random_integration_sites.closest(peaks, d=True)
# Extract distances
random_distances = [int(i[-1]) for i in random_closest]
# Calculate distances and their mean
random_mean_distances.append(np.mean(random_distances))
# Calculate p-value
p_values = [np.mean([mean_dist <= observed_mean for mean_dist in random_mean_distances])]
p_values_corrected = multipletests(p_values, method='fdr_bh')[1] # Apply BH correction
#pp.pprint(random_mean_distances)
print("Mean: ", np.mean(random_mean_distances))
print("Standard deviation: ", np.std(random_mean_distances))
print("Minimum: ", np.min(random_mean_distances))
print("Maximum: ", np.max(random_mean_distances))
print("Uncorrected p-value: ", p_values[0])
print("Corrected p-value: ", p_values_corrected[0]) # After BH correction
#[mean_dist >= observed_mean for mean_dist in random_mean_distances]
#This is a list comprehension that returns a boolean list. For each mean distance in the list random_mean_distances (i.e., mean distances calculated from randomly selected genomic positions), it checks if the mean distance is greater than or equal to observed_mean (i.e., the mean of observed distances from the nearest peak for each integration site). If the condition is met, it returns True (which is equivalent to 1), else it returns False (equivalent to 0).
#np.mean([...])
#This is calculating the mean of the boolean list. In this context, the mean of the boolean list is equivalent to the proportion of random mean distances that are greater than or equal to the observed mean distance. This is because True is considered as 1 and False as 0 when calculating the mean. This proportion is used as the p-value.
#The p-value is a measure of the probability that an observed difference could have occurred just by random chance. The lower the p-value, the greater the statistical significance because it tells the investigator that the hypothesis under consideration may not adequately explain the observation. In most cases, a threshold (alpha) is set at 0.05, meaning that there is a 5% chance that the difference is due to chance alone. If the p-value is less than 0.05, the null hypothesis is rejected.
I performed a statistical test aimed at evaluating whether the observed distances from integration sites to the nearest peaks are significantly different from what might be expected by random chance.
Here are the results:
- Observed mean distance (calculated from a total of 70 integration sites to their nearest peaks): 1,218,081 nt.
The statistical results from the 1000 iterations of the permutation test are as follows:
- Mean: 1,445,755
- Standard deviation: 400,673
- Minimum: 509,078
- Maximum: 2,978,797
The P-value is 0.313. It suggests that our observed integration sites are not significantly closer to these peaks than would be expected if the sites were distributed randomly.
The statistical test involves three steps:
- Observed Data: We start by considering our observed data, which are the distances from each integration site to the nearest peak. We compute the mean of these observed distances.
- Null Distribution: To generate a null distribution, we perform a permutation test with 1000 iterations. For each iteration, we randomly generate 70 positions (as potential integration sites) and then calculate the distance from these positions to the nearest peak.
- Comparison: We then compare our observed mean distance to the 1000 mean distances generated in the last step.
Epidome processing
-
Raw_Data
#Here are some more information on the two sample collections for epidome sequencing: 9+10+33=52 #Samples 7N-15N are nose swabs of 9 individual patients, and 16-20F/N are feet (F) and nose (N) swabs of 5 more patients. #All of these patients were hospitalized for endoprosthesis surgery. #nose swaps of 9 individual patients ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R1_001.fastq.gz P7-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1436/7N_S1_R2_001.fastq.gz P7-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R1_001.fastq.gz P8-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1437/8N_S2_R2_001.fastq.gz P8-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R1_001.fastq.gz P9-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1438/9N_S3_R2_001.fastq.gz P9-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R1_001.fastq.gz P10-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1439/10N_S4_R2_001.fastq.gz P10-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R1_001.fastq.gz P11-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1440/11N_S5_R2_001.fastq.gz P11-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R1_001.fastq.gz P12-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1441/12N_S6_R2_001.fastq.gz P12-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R1_001.fastq.gz P13-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1442/13N_S7_R2_001.fastq.gz P13-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R1_001.fastq.gz P14-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1443/14N_S8_R2_001.fastq.gz P14-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R1_001.fastq.gz P15-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1444/15N_S9_R2_001.fastq.gz P15-Nose_R2.fastq.gz #16-20F/N are feet (F) and nose (N) swabs of 5 more patients 10 ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R1_001.fastq.gz P16-Foot_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1445/16F_S10_R2_001.fastq.gz P16-Foot_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R1_001.fastq.gz P16-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1446/16N_S11_R2_001.fastq.gz P16-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R1_001.fastq.gz P17-Foot_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1447/17F_S12_R2_001.fastq.gz P17-Foot_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R1_001.fastq.gz P17-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1448/17N_S13_R2_001.fastq.gz P17-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R1_001.fastq.gz P18-Foot_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1449/18F_S14_R2_001.fastq.gz P18-Foot_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R1_001.fastq.gz P18-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1450/18N_S15_R2_001.fastq.gz P18-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R1_001.fastq.gz P19-Foot_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1451/19F_S16_R2_001.fastq.gz P19-Foot_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R1_001.fastq.gz P19-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1452/19N_S17_R2_001.fastq.gz P19-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R1_001.fastq.gz P20-Foot_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1453/20F_S18_R2_001.fastq.gz P20-Foot_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R1_001.fastq.gz P20-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1454/20N_S19_R2_001.fastq.gz P20-Nose_R2.fastq.gz #Samples 1-108 are swabs of noses, lesioned skin (LH) and non-lesioned skin (NLH) of 11 patients suffering from atopic dermatitis. 33 #There are more details on these samples in the attached excel file. ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R1_001.fastq.gz RP-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1455/1_S20_R2_001.fastq.gz RP-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R1_001.fastq.gz RP-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1456/2_S21_R2_001.fastq.gz RP-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R1_001.fastq.gz RP-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1457/3_S22_R2_001.fastq.gz RP-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R1_001.fastq.gz AL-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1458/4_S23_R2_001.fastq.gz AL-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R1_001.fastq.gz AL-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1459/5_S24_R2_001.fastq.gz AL-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R1_001.fastq.gz AL-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1460/6_S25_R2_001.fastq.gz AL-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R1_001.fastq.gz MC-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1461/22_S26_R2_001.fastq.gz MC-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R1_001.fastq.gz MC-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1462/23_S27_R2_001.fastq.gz MC-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R1_001.fastq.gz MC-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1463/24_S28_R2_001.fastq.gz MC-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R1_001.fastq.gz SA-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1464/25_S29_R2_001.fastq.gz SA-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R1_001.fastq.gz SA-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1465/26_S30_R2_001.fastq.gz SA-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R1_001.fastq.gz SA-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1466/27_S31_R2_001.fastq.gz SA-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R1_001.fastq.gz HR-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1467/28_S32_R2_001.fastq.gz HR-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R1_001.fastq.gz HR-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1468/29_S33_R2_001.fastq.gz HR-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R1_001.fastq.gz HR-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1469/30_S34_R2_001.fastq.gz HR-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R1_001.fastq.gz XN-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1470/34_S35_R2_001.fastq.gz XN-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R1_001.fastq.gz XN-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1471/35_S36_R2_001.fastq.gz XN-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R1_001.fastq.gz XN-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1472/36_S37_R2_001.fastq.gz XN-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R1_001.fastq.gz MR-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1487/AY_S52_R2_001.fastq.gz MR-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R1_001.fastq.gz MR-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1473/50_S38_R2_001.fastq.gz MR-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R1_001.fastq.gz MR-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1474/51_S39_R2_001.fastq.gz MR-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R1_001.fastq.gz CB-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1475/58_S40_R2_001.fastq.gz CB-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R1_001.fastq.gz CB-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1476/59_S41_R2_001.fastq.gz CB-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R1_001.fastq.gz CB-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1477/60_S42_R2_001.fastq.gz CB-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R1_001.fastq.gz KK-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1478/94_S43_R2_001.fastq.gz KK-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R1_001.fastq.gz KK-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1479/95_S44_R2_001.fastq.gz KK-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R1_001.fastq.gz KK-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1480/96_S45_R2_001.fastq.gz KK-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R1_001.fastq.gz AH-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1481/103_S46_R2_001.fastq.gz AH-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R1_001.fastq.gz AH-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1482/104_S47_R2_001.fastq.gz AH-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R1_001.fastq.gz AH-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1483/105_S48_R2_001.fastq.gz AH-NLH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R1_001.fastq.gz PT2-Nose_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1484/106_S49_R2_001.fastq.gz PT2-Nose_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R1_001.fastq.gz PT2-LH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1485/107_S50_R2_001.fastq.gz PT2-LH_R2.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R1_001.fastq.gz PT2-NLH_R1.fastq.gz ln -s ./230425_M03701_0301_000000000-KNW3N/hr1486/108_S51_R2_001.fastq.gz PT2-NLH_R2.fastq.gz
-
Processing for Epidome yycH and g216
#-->92-301 #yycH: 476 + 44 -->minLen=200 (76) #g216: 448 + 44 -->minLen=185 (78) #16S: 410-427 + 44 -->minLen=170 (70) #360/2=180 * #200 and 200 * #>seq1; #TGGGTATGGCAATCACTTTACA #AGAATTCTATATTAAAGATGTTCTAATTGTGGAAAAGGGATCCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTGTCTGGAAATAATTACGCTCAATACATTTCATTAGTAGCTGAGCATTACAATCTAAATGTCAAAACAAATACCAATGTTTCACGTGTAACATACATAGATGGTATATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTTTCATATGGACGTCATTACAGTGAGATTCGAGCGTTCACTCAATTAAACGGTGACGCCTTTACAATTATTGGA GGTAATGAGAGTGCTTTTGATGC #>M03701:292:000000000-K9M88:1:1101:10277:1358:AAGAGGCA+TATCCTCT #TGGGTATGRCAATCACTTTACA #AGAATTCAATATTAAAGATGTTCTAATTGTTGAAAAGGGAACCATCGGTCATTCATTTAAACATTGGCCTCTATCAACAAAGACCATCACACCATCATTTACAACTAATGGTTTTGGCATGCCAGATATGAATGCAATAGCTAAAGATACATCACCTGCCTTCACTTTCAATGAAGAACATTTATCTGGAAAACGTTATGCTGAATACCTCTCACTAGTAGCTACGCATTACAATCTAAATGGCAAAACAAACACCAATGTTTCACGTGTAACATACATAGATGGTGTATATCATGTATCAACGGACTATGGTGTTTATACCGCAGATTATATATTTATAGCAACTGGAGACTATTCATTCCCATATCATCCTTTATCATATGGACGTCATTACAGTGAAATTCAAACATTCACTCAATTAAAAGGTGATGCTTTTACAATCATTGGT GGTAATGAGAGTGCTTTTGATGC #DIR: ~/DATA/Data_Holger_Epidome/testrun2 #Input: epidome->/home/jhuang/Tools/epidome and rawdata #Read in 37158 paired-sequences, output 31225 (84%) filtered paired-sequences. #Read in 82145 paired-sequences, output 78594 (95.7%) filtered paired-sequences. #--> #Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-1_R1.fastq.gz #Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-1_R2.fastq.gz #Read in 37158 paired-sequences, output 35498 (95.5%) filtered paired-sequences. #Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R1/A10-2_R1.fastq.gz #Overwriting file:/home/jhuang/DATA/Data_Holger_Epidome_myData2/cutadapted_yycH/filtered_R2/A10-2_R2.fastq.gz #Read in 82145 paired-sequences, output 80918 (98.5%) filtered paired-sequences. # #Read in 46149 paired-sequences, output 22206 (48.1%) filtered paired-sequences. #Read in 197875 paired-sequences, output 168942 (85.4%) filtered paired-sequences. #Read in 230646 paired-sequences, output 201376 (87.3%) filtered paired-sequences. #Read in 175759 paired-sequences, output 149823 (85.2%) filtered paired-sequences. #Read in 147546 paired-sequences, output 128864 (87.3%) filtered paired-sequences.
2.1. quality controls (optional)
#under testrun2 should have
#BiocManager::install("dada2")
library(dada2); packageVersion("dada2")
path <- "/home/jhuang/DATA/Data_Luise_Epidome_batch3/raw_data" # CHANGE ME to the directory containing the fastq files after unzipping.
list.files(path)
# Forward and reverse fastq filenames have format: SAMPLENAME_R1_001.fastq and SAMPLENAME_R2_001.fastq
fnFs <- sort(list.files(path, pattern="_R1.fastq.gz", full.names = TRUE))
fnRs <- sort(list.files(path, pattern="_R2.fastq.gz", full.names = TRUE))
# Extract sample names, assuming filenames have format: SAMPLENAME_XXX.fastq
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
png("quality_fnFs.png", width=800, height=800)
plotQualityProfile(fnFs[1:2])
dev.off()
png("quality_fnRs.png", width=800, height=800)
plotQualityProfile(fnRs[1:2])
dev.off()
2.2. cutadapt instead of Trimmomatic (namely demultiplexing, see epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh)
#Output: cutadapted_yycH cutadapted_g216 cutadapted_16S
#Script: epidome/scripts/EPIDOME_yycH_cutadapt_loop.sh
#5′-CGATGCKAAAGTGCCGAATA-3′/5′-CTTCATTTAAGAAGCCACCWTGACT-3′ for yycH
#5′-TGGGTATGRCAATCACTTTACA-3′/5′-GCATCAAAAGCACTCTCATTACC-3′ for g216
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC -l 300 for 16S
mkdir cutadapted_yycH cutadapted_g216 cutadapted_16S
cd raw_data
#The default is --action=trim. With --action=retain, the read is trimmed, but the adapter sequence itself is not removed.
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CGATGCKAAAGTGCCGAATA -G CTTCATTTAAGAAGCCACCWTGACT --pair-filter=any -o ../cutadapted_yycH/${file} --paired-output ../cutadapted_yycH/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g TGGGTATGRCAATCACTTTACA -G GCATCAAAAGCACTCTCATTACC --pair-filter=any -o ../cutadapted_g216/${file} --paired-output ../cutadapted_g216/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
for file in *_R1.fastq.gz; do
cutadapt -e 0.06 -g CCTACGGGNGGCWGCAG -G GACTACHVGGGTATCTAATCC --pair-filter=any -o ../cutadapted_16S/${file} --paired-output ../cutadapted_16S/${file/R1.fastq.gz/R2.fastq.gz} --discard-untrimmed $file ${file/R1.fastq.gz/R2.fastq.gz};
done
2.3. (IGNORED) regenerate filtered_R1 and filtered_R2 (under conda env qiime1 using pear) –> IGNORED, since we should use filterAndTrim from data2 in the next step!)
#DEPRECATED: mkdir pandaseq_16S pandaseq_yycH pandaseq_g216
#DEPRECATED: -p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
#DEPRECATED: for file in cutadapted_16S/*_R1.fastq.gz; do pandaseq -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -l 300 -w pandaseq_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq_16S; done
#https://learnmetabarcoding.github.io/LearnMetabarcoding/processing/pair_merging.html#
#conda install -c conda-forge -c bioconda -c defaults seqkit
mkdir pear_yycH pear_g216 pear_16S
for file in cutadapted_yycH/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 4 -q 26 -v 10 -o pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_yycH; done
for file in cutadapted_g216/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_g216; done
for file in cutadapted_16S/*_R1.fastq.gz; do pear -f ${file} -r ${file/_R1.fastq.gz/_R2.fastq.gz} -j 2 -q 26 -v 10 -o pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1) >> LOG_pear_16S; done
for file in cutadapted_yycH/*_R1.fastq.gz; do
grep "@M0370" pear_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_yycH/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_yycH/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_yycH/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
#>>LOG_pear_yycH
for file in cutadapted_g216/*_R1.fastq.gz; do
grep "@M0370" pear_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_g216/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_g216/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_g216/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
for file in cutadapted_16S/*_R1.fastq.gz; do
grep "@M0370" pear_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1).assembled.fastq > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
sed -i -e 's/@//g' cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt
cut -d' ' -f1 cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs.txt > cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt
seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz -o cutadapted_16S/filtered_R1/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R1.fastq.gz
seqkit grep -f cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_IDs_.txt cutadapted_16S/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz -o cutadapted_16S/filtered_R2/$(echo $file | cut -d'/' -f2 | cut -d'_' -f1)_R2.fastq.gz
done
2.4. filtering+trimming+merging+chimera-removing (VERY_IMPORTANT: under conda env r4-base)
#Input: cutadapted_yycH, cutadapted_g216
#Outputs: yycH[g216|16S]_seqtab_from_dada2.rds
# yycH[g216|16S]_seqtab_from_dada2.csv
# yycH[g216|16S]_seqtab_nochim.rds
# yycH[g216|16S]_seqtab_nochim.csv
# yycH[g216|16S]_seqtab_image.RData
# track_yycH[g216|16S].csv -->
#The following scripts are modified from epidome/scripts/dada2_for_EPIDOME_yycH_runwise_pipeline.R
#./my_EPIDOME_g216_runwise_pipeline.R #minLen=185, with FILTERING, it does not work!
#./my_EPIDOME_yycH_runwise_pipeline.R #minLen=200, with FILTERING, it does not work!
./my_EPIDOME_g216_runwise_pipeline_.R > g216_runwise_pipeline_.LOG #minLen=185. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
./my_EPIDOME_yycH_runwise_pipeline_.R > yycH_runwise_pipeline_.LOG #minLen=200. NO FILTERING ANY MORE, since the input are filtered_R1 and filtered_R2.
./my_EPIDOME_16S_runwise_pipeline.R > 16S_runwise_pipeline.LOG #minLen=170. IGNORED since the 16S reads will be processed separately as below.
#END
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-LH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-LH_R2.fastq.gz
#Read in 90261 paired-sequences, output 89026 (98.6%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-NLH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-NLH_R2.fastq.gz
#Read in 74638 paired-sequences, output 73633 (98.7%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AH-Nose_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AH-Nose_R2.fastq.gz
#Read in 71311 paired-sequences, output 70542 (98.9%) filtered paired-sequences.
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R1/AL-LH_R1.fastq.gz
#Overwriting file:/media/jhuang/Elements/Data_Luise_Epidome_batch3/cutadapted_yycH/filtered_R2/AL-LH_R2.fastq.gz
#Read in 94807 paired-sequences, output 93082 (98.2%) filtered paired-sequences.
~/Tools/csv2xls-0.4/csv_to_xls.py g216_track.csv yycH_track.csv 16S_track.csv -d$';' -o overview.xls;
# Read the CSV file into a DataFrame
df <- read.csv("g216_seqtab_from_dada2_nohead.csv", sep=";", row.name=1, header=FALSE)
#df <- read.csv("g216_seqtab_nochim.csv", sep=";", row.name=1)
# Calculate the sum for each row
row_sums <- rowSums(df)
# Print the row sums
print(row_sums)
> print(row_sums)
AH-LH AH-NLH AH-Nose AL-LH AL-NLH AL-Nose CB-LH CB-NLH
87232 69741 89689 76660 94636 108810 73814 56312
CB-Nose HR-LH HR-NLH HR-Nose KK-LH KK-NLH KK-Nose MC-LH
61740 76216 63165 55479 78550 87579 83826 73738
MC-NLH MC-Nose MR-LH MR-NLH MR-Nose P10-Nose P11-Nose P12-Nose
100338 94956 88158 63054 82361 103782 108533 90398
P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot
87059 95656 110207 67058 77606 58339 95407 87775
P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose P7-Nose P8-Nose P9-Nose
107560 79373 99571 104667 109457 101528 99565 147485
PT2-LH PT2-NLH PT2-Nose RP-LH RP-NLH RP-Nose SA-LH SA-NLH
81074 89345 77121 68946 71414 113722 53465 40966
SA-Nose XN-LH XN-NLH XN-Nose
33381 73842 76028 80630
AH-LH AH-NLH AH-Nose AL-LH AL-NLH AL-Nose CB-LH CB-NLH
79615 52143 57780 70801 89636 99428 52185 54848
CB-Nose HR-LH HR-NLH HR-Nose KK-LH KK-NLH KK-Nose MC-LH
50249 58365 45201 38747 56027 65755 53110 72355
MC-NLH MC-Nose MR-LH MR-NLH MR-Nose P10-Nose P11-Nose P12-Nose
97363 69881 68599 52386 48364 84634 78491 71877
P13-Nose P14-Nose P15-Nose P16-Foot P16-Nose P17-Foot P17-Nose P18-Foot
81694 84290 100606 62621 66484 50015 93498 73730
P18-Nose P19-Foot P19-Nose P20-Foot P20-Nose P7-Nose P8-Nose P9-Nose
93713 63755 70420 81030 105601 94352 70765 124449
PT2-LH PT2-NLH PT2-Nose RP-LH RP-NLH RP-Nose SA-LH SA-NLH
56869 74951 62339 68726 71145 111279 47754 36180
SA-Nose XN-LH XN-NLH XN-Nose
26130 55647 69993 50727
#wc -l cutadapted_yycH/filtered_R1$ vim Extraction-control-2_R1.fastq.gz #-->2696
#Read in 1138 paired-sequences, output 674 (59.2%) filtered paired-sequences.
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 -->600 sequences
#"Extraction-control-2";0;61;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;591;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0;0 #-->after chimera-removing, only 107 sequences
#Processing: Extraction-control-2
#Sample 1 - 674 reads in 209 unique sequences (What does the unique sequences mean???). #merged sequences are 209
#Sample 1 - 674 reads in 280 unique sequences.
#"";"input_read_count"*; "filtered_and_trimmed_read_count";"merged_after_dada2_read_count"; "non-chimeric_read_count"*
#"Extraction-control-2"; 1138*; 674;652; 652*
#biom convert -i table.txt -o table.from_txt_json.biom --table-type="OTU table" --to-json
#summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
#summarize_taxa.py -i otu_table.biom -o ./tax
#g216_track.csv
"A10-1";36009;493;367;367
"A10-2";175221;82727;30264;27890
"A10-3";110170;36812;13715;13065
"A10-4";142323;64398;24306;21628
#yycH_track.csv
"A10-1";37158;549;0;0 pandaseq:36791
"A10-2";82145;23953;6180;5956
"A10-3";53438;12480;2944;2944
"A10-4";64516;18361;12350;11797
#16S_track.csv
"A10-1";46149(in cutadapted_16S);13(in filtered_R1);8;8
"A10-2";197875;2218;1540;1391
"A10-3";230646;2429;1819;1752
"A10-4";175759;2001;1439;1366
#zcat A10-1_R1.fastq.gz | echo $((`wc -l`/4))
#121007 > 37158 + 36009 + 46149 = 119316
#458392 > 82145 + 175221 + 197875 = 455241
2.5. (IGNORED) stitching and removing chimeras (see ~/DATA/Data_Holger_Epidome/epidome/scripts/Combine_and_Remove_Chimeras_yycH.R)
#my_Combine_and_Remove_Chimeras_g216.R is a part of my_EPIDOME_yycH_runwise_pipeline.R (see lines 53-55) --> IGNORED!
2.6. Classification: epidome/scripts/ASV_blast_classification.py
#Input: g216_seqtab_nochim.csv using DATABASE epidome/DB/g216_ref_aln.fasta
#Output: g216_seqtab_ASV_seqs.fasta, g216_seqtab_ASV_blast.txt and g216_seqtab.csv.classified.csv
python3 epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
python3 epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 99.5
#old: python3 epidome/scripts/ASV_blast_classification.py yycH_seqtab.csv yycH_seqtab.csv.ASV_seqs.fasta epidome/DB/yycH_ref_aln.fasta yycH_seqtab.csv.ASV_blast.txt yycH_seqtab.csv.classified.csv 99.5
#old: python3 epidome/scripts/ASV_blast_classification_combined.py -p1 190920_run1_yycH_seqtab_from_dada2.csv -p2 190920_run1_G216_seqtab_from_dada2.csv -p1_ref epidome/DB/yycH_ref_aln.fasta -p2_ref epidome/DB/g216_ref_aln.fasta
##rename "seqseq2" --> seq2
#sed -i -e s/seq//g 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#sed -i -e s/seqseq/seq/g 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_seqs.fasta
#diff 190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.ASV_blast.txt
#diff 190920_run1_yycH_seqtab_from_dada2.csv.classified.csv epidome/example_data/190920_run1_yycH_seqtab_from_dada2.csv.classified.csv
## WHY: 667 seqs in old calculation, but in our calculation only 108 seqs
## They took *_seqtab_from_dada2.csv, but we took *_seqtab_nochim.csv. (653 vs 108 records!)
##AAAT";"seq37,36";0;
sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt #length=476
sed -i -e s/seq//g g216_seqtab_ASV_blast.txt #length=448
#;-->""
sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
#;,seq --> ,seq
#;"; --> ";
sed -i -e s/";,seq"/",seq"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";,seq"/",seq"/g g216_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv
#"ASV";"Seq_number";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
#"ASV";"Seq_number";"Extraction_control_1";"Extraction_control_2";"P01_nose_1";"P01_nose_2";"P01_skin_1";"P01_skin_2";"P02_nose_1";"P02_nose_2";"P02_skin_1";"P02_skin_2";"P03_nose_1";"P03_nose_2";"P03_skin_1";"P03_skin_2";"P04_nose_1";"P04_nose_2";"P04_skin_1";"P04_skin_2";"P05_nose_1";"P05_nose_2";"P05_skin_1";"P05_skin_2";"P06_nose_1";"P06_nose_2";"P06_skin_1";"P06_skin_2";"P07_nose_1";"P07_nose_2";"P07_skin_1";"P07_skin_2";"P08_nose_1";"P08_nose_2";"P08_skin_1";"P08_skin_2";"P09_nose_1";"P09_nose_2";"P09_skin_1";"P09_skin_2";"P10_nose_1";"P10_nose_2";"P10_skin_1";"P10_skin_2";"P11_nose_1";"P11_nose_2";"P11_skin_1";"P11_skin_2";"even-mock3-1_S258_L001";"even-mock3-2_S282_L001";"even-mock3-3_S199_L001";"staggered-mock3-1_S270_L001";"staggered-mock3-2_S211_L001";"staggered-mock3-3_S223_L001"
grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv
https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
#DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
seq24,seq21 --> seq24,21
#(OPTIONAL) TO reduce the unclassified, rename seq31,30 --> seq in g216, seq37,36 --> seq in yycH.
2.7. draw plot from three amplicons: cutadapted_g216, cutadapted_yycH, and cutadapted_16S
#Taxonomic database setup and classification
#- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB.
#- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
#- ST classification of samples was performed using the g216 target sequence as the primary identifier.
#- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”.
#~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls
#under r4-base
source("epidome/scripts/epidome_functions.R")
ST_amplicon_table = read.table("epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")
#"ST" "Group" "epi01_ASV" "epi02_ASV" "freq"
#"8888" "324" 113 1 1 2
#"815097" "5" 48 40 2 55
#"846906" "225" 61 3 3 1
#"846960" "-" 62 3 3 1
#"847064" "225" 63 3 3 2
#"847222" "225" 65 3 3 3
#"865555" "278" 37 5 3 4
epi01_table = read.table("g216_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified_noNA.csv",sep = "\t",header=TRUE,row.names=1)
#> sum(epi01_table$AH.LH)
#[1] 78872
#> sum(epi02_table$AH.LH)
#[1] 86949
#construct metadata.txt as follows.
#"sample.ID" "patient.ID" "sample.site" "sample.type" "patient.sample.site"
#P7.Nose P7 Nose swab P7.Nose.swab
#P8.Nose P8 Nose swab P8.Nose.swab
#P9.Nose P9 Nose swab P9.Nose.swab
#P10.Nose P10 Nose swab P10.Nose.swab
#P11.Nose P11 Nose swab P11.Nose.swab
#P12.Nose P12 Nose swab P12.Nose.swab
#P13.Nose P13 Nose swab P13.Nose.swab
metadata_table = read.table("metadata.txt",header=TRUE,row.names=1)
metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK", "MC","MR","PT2","RP","SA","XN"))
epidome_object = setup_epidome_object(epi01_table,epi02_table,metadata_table = metadata_table)
#Image1
primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
png("image1.png")
primer_compare$plot
dev.off()
eo_ASV_combined = combine_ASVs_epidome(epidome_object)
eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)
count_table = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#count_df_ordered = count_table[order(rowSums(count_table),decreasing = T),]
#install.packages("pls")
#library(pls)
#install.packages("reshape")
#library(reshape)
#install.packages("vegan")
library('vegan')
library(scales)
library(RColorBrewer)
#Image2
#TODO: find out what are the combination 21006 in Aachen?
source("epidome/scripts/epidome_functions.R")
#count_table = count_table[-29,]
#row.names(count_table) <- c("-", "ST297", "ST170", "ST73", "ST225", "ST673", "ST215", "ST19", "Unclassified")
#row.names(count_table) <- c("NA", "-", "X297", "X170", "X73", "X225", "X673", "X215", "X19", "Unclassified")
row.names(count_table) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")
#colnames(count_table) <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3","LM.Nose","LM.Foot","LZ.Foot","LZ.Nose","NG.Foot","NG.Nose","VK.Foot","VK.Nose","AK.Foot","AK.Nose","MS.Foot","MS.Nose","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
#col_order <- c("A2.1","A2.2","A2.3","A3.1","A3.2","A3.3","A4.1","A4.2","A4.3","A4.4","A5.1","A5.2","A5.3","A5.4","A5.5","A5.6","A5.7","A10.1","A10.2","A10.3","A10.4","A17.1","A17.2","A17.3","A21.1","A21.2","A21.3","A22.1","A22.2","A22.3","A24.1","A24.2","A24.3","A25.1","A25.2","A25.3","A27.1","A27.2","A27.3","A28.1","A28.2","A28.3", "LM.Nose","LM.Foot", "LZ.Nose","LZ.Foot", "NG.Nose","NG.Foot", "VK.Nose","VK.Foot", "AK.Nose","AK.Foot", "MS.Nose","MS.Foot", "AH.Nose","AH.Foot", "AY_Nose","AY.Foot", "JS.Nose","JS.Foot", "PC.Nose","PC.Foot", "SB.Nose","SB.Foot")
#count_table_reordered <- count_table[,col_order]
count_table_reordered <- count_table
write.csv(file="count_table.txt", count_table_reordered)
#NOTE to change rowname from '-' to 'Novel'
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
#p = make_barplot_epidome(count_table_reordered,reorder=TRUE,normalize=TRUE)
png("Barplot_All.png", width=1600, height=900)
p
dev.off()
#Image3
eo_clinical = prune_by_variable_epidome(epidome_object,"sample.type",c("swab"))
#eo_Aachen = prune_by_variable_epidome(epidome_object,"sample.type",c("Aachen"))
epidome_object_clinical_norm = normalize_epidome_object(eo_clinical) ### Normalize counts to percent
#epidome_object_Aachen_norm = normalize_epidome_object(eo_Aachen)
png("PCA_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(eo_filtered,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of all samples colored by patient ID")
dev.off()
png("PCA_Clinical_by_patientID.png", width=1200, height=800)
PCA_patient_colored = plot_PCA_epidome(epidome_object_clinical_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
PCA_patient_colored + ggtitle("PCA plot of clinical samples colored by patient ID")
dev.off()
#png("PCA_Aachen_by_patientID.png", width=1200, height=800)
#PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "patient.ID",colors=c(), plot_ellipse = FALSE)
#PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by patient ID")
#dev.off()
#png("PCA_Aachen_by_sampleSite.png", width=1200, height=800)
#PCA_sample_site_colored = plot_PCA_epidome(epidome_object_Aachen_norm,color_variable = "sample.site",colors = c("Red","Blue"),plot_ellipse = FALSE)
#PCA_sample_site_colored + ggtitle("PCA plot of nose and foot samples colored by sampling site")
#dev.off()
#Image4
eo_filter_lowcount = filter_lowcount_samples_epidome(epidome_object,p1_threshold = 500,p2_threshold = 500)
#-->[1] "1 low count samples removed from data: RP.Nose"
eo_filter_ASVs = epidome_filtered_ASVs = filter_lowcount_ASVs_epidome(epidome_object,percent_threshold = 1)
epidome_object_normalized = normalize_epidome_object(epidome_object)
epidome_object_ASV_combined = combine_ASVs_epidome(epidome_object)
epidome_object_clinical = prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("swab"))
#epidome_object_Aachen= prune_by_variable_epidome(epidome_object,variable_name = "sample.type",variable_values = c("Aachen"))
eo_ASV_combined = combine_ASVs_epidome(epidome_object_clinical)
count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
png("Barplot_Clinical.png", width=1200, height=800)
p
dev.off()
#eo_ASV_combined = combine_ASVs_epidome(epidome_object_Aachen)
#count_table_reordered = classify_epidome(eo_ASV_combined,ST_amplicon_table)
#colnames(count_table_reordered) <- c("LM.Nose","LM.Foot","LZ.Nose","LZ.Foot","NG.Nose","NG.Foot","VK.Nose","VK.Foot","AK.Nose","AK.Foot","MS.Nose","MS.Foot","AH.Nose","AH.Foot","AY_Nose","AY.Foot","JS.Nose","JS.Foot","PC.Nose","PC.Foot","SB.Nose","SB.Foot")
#p = make_barplot_epidome(count_table_reordered,reorder=FALSE,normalize=TRUE)
#png("Barplot_Aachen.png", width=1200, height=800)
#p
#dev.off()
3.0. The methods for 16S
3.1. stitch
mkdir pandaseq.out
#-p CCTACGGGNGGCWGCAG -q GACTACHVGGGTATCTAATCC
for file in cutadapted_16S/filtered_R1/*_R1.fastq.gz; do echo "pandaseq -f ${file} -r ${file/_R1/_R2} -l 300 -w pandaseq.out/$(echo $file | cut -d'/' -f3 | cut -d'_' -f1)_merged.fasta >> LOG_pandaseq"; done
pandaseq -f cutadapted_16S/filtered_R1/AH-LH_R1.fastq.gz -r cutadapted_16S/filtered_R2/AH-LH_R2.fastq.gz -l 300 -w pandaseq.out/AH-LH_merged.fasta >> LOG_pandaseq
...
grep ">" AH-LH_merged.fasta | wc -l
...
jhuang@hamburg:~/DATA/Data_Luise_Epidome_batch3/core_diversity_e33778$ grep "AH.LH" biom_table_summary.txt
AH.LH: 75.566,000
...
3.2. create two QIIME mapping files
validate_mapping_file.py -m map2.txt
3.3. combine files into a labeled file
add_qiime_labels.py -i pandaseq.out -m map2_corrected.txt -c FileInput -o combined_fasta
3.4. remove chimeric sequences using usearch
cd combined_fasta
pyfasta split -n 100 combined_seqs.fna
for i in {000..099}; do echo "identify_chimeric_seqs.py -i combined_fasta/combined_seqs.fna.${i} -m usearch61 -o usearch_checked_combined.${i}/ -r ~/REFs/gg_97_otus_4feb2011_fw_rc.fasta --threads=14;" >> uchime_commands.sh; done
mv uchime_commands.sh ..
./uchime_commands.sh
cat usearch_checked_combined.000/chimeras.txt usearch_checked_combined.001/chimeras.txt usearch_checked_combined.002/chimeras.txt usearch_checked_combined.003/chimeras.txt usearch_checked_combined.004/chimeras.txt usearch_checked_combined.005/chimeras.txt usearch_checked_combined.006/chimeras.txt usearch_checked_combined.007/chimeras.txt usearch_checked_combined.008/chimeras.txt usearch_checked_combined.009/chimeras.txt usearch_checked_combined.010/chimeras.txt usearch_checked_combined.011/chimeras.txt usearch_checked_combined.012/chimeras.txt usearch_checked_combined.013/chimeras.txt usearch_checked_combined.014/chimeras.txt usearch_checked_combined.015/chimeras.txt usearch_checked_combined.016/chimeras.txt usearch_checked_combined.017/chimeras.txt usearch_checked_combined.018/chimeras.txt usearch_checked_combined.019/chimeras.txt usearch_checked_combined.020/chimeras.txt usearch_checked_combined.021/chimeras.txt usearch_checked_combined.022/chimeras.txt usearch_checked_combined.023/chimeras.txt usearch_checked_combined.024/chimeras.txt usearch_checked_combined.025/chimeras.txt usearch_checked_combined.026/chimeras.txt usearch_checked_combined.027/chimeras.txt usearch_checked_combined.028/chimeras.txt usearch_checked_combined.029/chimeras.txt usearch_checked_combined.030/chimeras.txt usearch_checked_combined.031/chimeras.txt usearch_checked_combined.032/chimeras.txt usearch_checked_combined.033/chimeras.txt usearch_checked_combined.034/chimeras.txt usearch_checked_combined.035/chimeras.txt usearch_checked_combined.036/chimeras.txt usearch_checked_combined.037/chimeras.txt usearch_checked_combined.038/chimeras.txt usearch_checked_combined.039/chimeras.txt usearch_checked_combined.040/chimeras.txt usearch_checked_combined.041/chimeras.txt usearch_checked_combined.042/chimeras.txt usearch_checked_combined.043/chimeras.txt usearch_checked_combined.044/chimeras.txt usearch_checked_combined.045/chimeras.txt usearch_checked_combined.046/chimeras.txt usearch_checked_combined.047/chimeras.txt usearch_checked_combined.048/chimeras.txt usearch_checked_combined.049/chimeras.txt usearch_checked_combined.050/chimeras.txt usearch_checked_combined.051/chimeras.txt usearch_checked_combined.052/chimeras.txt usearch_checked_combined.053/chimeras.txt usearch_checked_combined.054/chimeras.txt usearch_checked_combined.055/chimeras.txt usearch_checked_combined.056/chimeras.txt usearch_checked_combined.057/chimeras.txt usearch_checked_combined.058/chimeras.txt usearch_checked_combined.059/chimeras.txt usearch_checked_combined.060/chimeras.txt usearch_checked_combined.061/chimeras.txt usearch_checked_combined.062/chimeras.txt usearch_checked_combined.063/chimeras.txt usearch_checked_combined.064/chimeras.txt usearch_checked_combined.065/chimeras.txt usearch_checked_combined.066/chimeras.txt usearch_checked_combined.067/chimeras.txt usearch_checked_combined.068/chimeras.txt usearch_checked_combined.069/chimeras.txt usearch_checked_combined.070/chimeras.txt usearch_checked_combined.071/chimeras.txt usearch_checked_combined.072/chimeras.txt usearch_checked_combined.073/chimeras.txt usearch_checked_combined.074/chimeras.txt usearch_checked_combined.075/chimeras.txt usearch_checked_combined.076/chimeras.txt usearch_checked_combined.077/chimeras.txt usearch_checked_combined.078/chimeras.txt usearch_checked_combined.079/chimeras.txt usearch_checked_combined.080/chimeras.txt usearch_checked_combined.081/chimeras.txt usearch_checked_combined.082/chimeras.txt usearch_checked_combined.083/chimeras.txt usearch_checked_combined.084/chimeras.txt usearch_checked_combined.085/chimeras.txt usearch_checked_combined.086/chimeras.txt usearch_checked_combined.087/chimeras.txt usearch_checked_combined.088/chimeras.txt usearch_checked_combined.089/chimeras.txt usearch_checked_combined.090/chimeras.txt usearch_checked_combined.091/chimeras.txt usearch_checked_combined.092/chimeras.txt usearch_checked_combined.093/chimeras.txt usearch_checked_combined.094/chimeras.txt usearch_checked_combined.095/chimeras.txt usearch_checked_combined.096/chimeras.txt usearch_checked_combined.097/chimeras.txt usearch_checked_combined.098/chimeras.txt usearch_checked_combined.099/chimeras.txt > chimeras.txt
filter_fasta.py -f combined_fasta/combined_seqs.fna -o combined_fasta/combined_nonchimera_seqs.fna -s chimeras.txt -n;
rm -rf usearch_checked_combined.0*
grep ">AH.LH_" combined_nonchimera_seqs.fna | wc -l
...
3.5. create OTU picking parameter file, and run the QIIME open reference picking pipeline
echo "pick_otus:similarity 0.97" > clustering_params.txt
echo "assign_taxonomy:similarity 0.97" >> clustering_params.txt
echo "parallel_align_seqs_pynast:template_fp /home/jhuang/REFs/SILVA_132_QIIME_release/core_alignment/80_core_alignment.fna" >> clustering_params.txt
echo "assign_taxonomy:reference_seqs_fp /home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna" >> clustering_params.txt
echo "assign_taxonomy:id_to_taxonomy_fp /home/jhuang/REFs/SILVA_132_QIIME_release/taxonomy/16S_only/99/consensus_taxonomy_7_levels.txt" >> clustering_params.txt
echo "alpha_diversity:metrics chao1,observed_otus,shannon,PD_whole_tree" >> clustering_params.txt
#with usearch61 for reference picking and usearch61_ref for de novo OTU picking
pick_open_reference_otus.py -r/home/jhuang/REFs/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/99/silva_132_99_16S.fna -i combined_fasta/combined_nonchimera_seqs.fna -o clustering/ -p clustering_params.txt --parallel
3.6. (optional), for control data
summarize_taxa_through_plots.py -i clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -o plots/taxa_summary -s
mv usearch_checked_combined usearch_checked_combined_ctrl
mv combined_fasta combined_fasta_ctrl
mv clustering clustering_ctrl
mv plots plots_ctrl
3.7. for other data: core diversity analyses
core_diversity_analyses.py -o./core_diversity_e33778 -i./clustering/otu_table_mc2_w_tax_no_pynast_failures.biom -m./map2_corrected.txt -t./clustering/rep_set.tre -e33778 -p./clustering_params.txt
4.0. using R-code to summarize all results.
rmarkdown::render('Phyloseq.Rmd',output_file='Phyloseq.html')