gene_x 0 like s 567 view s
Tags: plot, python, R, pipeline
(namely epidome-processing step 2.6) Classification using 98% similarity.
#Input Files: yycH_seqtab_nochim.csv and ../epidome/DB/yycH_ref_aln.fasta
#Output Files: yycH_seqtab_ASV_seqs.fasta, yycH_seqtab_ASV_blast.txt, and yycH_seqtab.csv.classified.csv
python3 ../epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta ../epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 98
python3 ../epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta ../epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 98
Manually check the reults, ensuring no seq34;seq35 occurs.
#Delete all records in *_seqtab.csv.classified.csv the alignment length < 448 (delete records 110 and 111) in g216, or < 440 (del record 90) in yycH.
#Confirm the results
jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 g216_seqtab_ASV_blast.txt | sort -u
446
447
448
jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 yycH_seqtab_ASV_blast.txt | sort -u
475
476
#It is not neccesary to delete correponding records 90 in yycH_* and g216_seqtab_ASV_blast.txt, since the two files are not used.
#sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt #length=476
#sed -i -e s/seq//g g216_seqtab_ASV_blast.txt #length=448
##;-->""
#sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
#sed -i -e s/';'//g g216_seqtab_ASV_blast.txt
cp g216_seqtab.csv.classified.csv g216_seqtab.csv.classified.backup
cp yycH_seqtab.csv.classified.csv yycH_seqtab.csv.classified.backup
# (Option 1): Processing *.classified.csv, ensuring the format seq24,21 or seq24.
#https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
##DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
#Final goal: "seqseq31;,seq30;" --> "seq31,30"
#;,seq --> ,
sed -i -e s/";,seq"/","/g g216_seqtab.csv.classified.csv
sed -i -e s/";,seq"/","/g yycH_seqtab.csv.classified.csv
#;"; --> ";
sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
#confirm!
cut -d';' -f3 g216_seqtab.csv.classified.csv
cut -d';' -f3 yycH_seqtab.csv.classified.csv
# (Option 2): To reduce the unclassified, rename seq31,seq30 --> seq31 in g216, seq37,seq36 --> seq36 in yycH.
sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq5,seq2"/"seq2"/g sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq5,seq2"/"seq2"/g g216_seqtab.csv.classified.csv
grep ",seq" g216_seqtab.csv.classified.csv #should return no record.
#sed -i -e s/"seq22,20"/"seq20"/g g216_seqtab.csv.classified.csv
#sed -i -e s/"seq24,21"/"seq21"/g g216_seqtab.csv.classified.csv
#sed -i -e s/"seq21,17"/"seq21"/g g216_seqtab.csv.classified.csv
> epidome_object$p1_seqs #vim g216_seqtab.csv.classified_noNA.csv
[1] "seq28" "seq5" "seq40" "seq31,30"(seq31) "seq14"
[6] "seq20" "seq22" "seq26" "seq37" "seq21"
[11] "seq29" "seq24" "seq1" "seq26" "seq40"
[16] "seq21" "seq18" "seq40" "seq16" "seq37"
[21] "seq5,2" "seq18" "seq11" "seq14" "seq26"
[26] "seq31,30"(seq31) "seq37" "seq26" "seq1" "seq21"
[31] "seq3" "seq40" "seq26" "seq28" "seq40"
[36] "seq10" "seq5" "seq40,31,30"(seq40) "seq37" "seq37"
[41] "seq37" "seq40" "seq28" "seq26" "seq37"
[46] "seq22" "seq28" "seq19" "seq5,2" "seq26"
[51] "seq40" "seq40" "seq26" "seq40" "seq28"
[56] "seq28" "seq28" "seq37" "seq26" "seq40"
[61] "seq26" "seq37" "seq28" "seq28" "seq1"
[66] "seq28" "seq5,2"(seq5) "seq40" "seq21" "seq28"
[71] "seq29"
#"seq37" "seq22" "seq22,20"(seq20) "seq31,30"(seq31)
[76] "seq22" "seq20" "seq28" "seq37" "seq14"
[81] "seq28" "seq20" "seq40" "seq28" "seq24,21"
[86] "seq40" "seq14" "seq31,30"(seq31) "seq1" "seq1"
[91] "seq31,30"(seq31) "seq1" "seq40" "seq20" "seq1"
[96] "seq21" "seq26" "seq20" "seq20" "seq21,17"
[101] "seq37" "seq26" "seq31,30"(seq31) "seq31,30"(seq31) "seq40"
[106] "seq5" "seq12"
sed -i -e s/"seq37,seq36"/"seq36"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq58,seq25,seq21,seq10"/"seq21"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq23,seq22"/"seq23"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq65,seq63,seq61,seq60"/"seq63"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq65,seq63,seq53"/"seq63"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq61,seq55"/"seq55"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq34,seq28,seq25,seq10,seq9"/"seq34"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq25,seq10,seq9"/"seq9"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq25,seq10"/"seq10"/g yycH_seqtab.csv.classified.csv
grep ",seq" yycH_seqtab.csv.classified.csv #should return no record.
> epidome_object$p2_seqs
[1] "34" "43" "seq37,seq36"-->36 "42"
[5] "26" "63" "3" "2"
[9] "11" "seq65,seq63,seq53"-->63 "14" "38"
[13] "55" "3" "65" "seq58,seq25,seq21,seq10"-->21
[17] "seq23,seq22"-->23 "53" "56" "34"
[21] "62" "14" "35" "seq61,seq55"-->55
[25] "52" "seq25,seq10"-->10 "seq65,seq63,seq61,seq60"-->63 "21"
[29] "55" "15" "34" "49"
[33] "19" "63" "63" "34"
[37] "43" "12" "34" "seq34,seq28,seq25,seq10,seq9"-->34
[41] "43" "61" "25" "34"
[45] "34" "5" "18" "43"
[49] "62" "34" "19" "63"
[53] "7"
--------------
"seq37,seq36"-->36 "seq25,seq10,seq9"-->9 "56"
[57] "11" "49" "10" "63"
[61] "seq37,seq36"-->36 "14" "55"
#-------------
"55"
[65] "14" "37,36" "43" "45"
[69] "34" "34" "43" "43"
[73] "25" "14" "65,64,61,56" "15,9"
[77] "43" "38" "43" "53"
[81] "11" "9" "34" "15"
[85] "9" "23,22" "62" "43"
[89] "37,36"
# Common step after option 1 or option 2.
grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv
#filtering the small ASV in which the number < 500: INPUT: *_noNA.csv; OUTPUT: *_seqtab.csv.classified_filtered.csv
python3 g216_filtering.py
import pandas as pd
# Load the file
data = pd.read_csv('g216_seqtab.csv.classified_noNA.csv', delimiter=';')
# Compute row sums excluding the first 2 columns
data['row_sum'] = data.iloc[:, 2:].sum(axis=1)
print(data.iloc[:, 2:])
print(data.iloc[:, 2:].sum(axis=1))
# Filter out rows where the sum is less than 100
filtered_data = data[data['row_sum'] >= 500].drop(columns=['row_sum'])
# Save the resulting dataframe to a new file
filtered_data.to_csv('g216_seqtab.csv.classified_filtered.csv', sep=';', index=False)
python3 yycH_filtering.py
draw plot from three amplicons: cutadapted_g216, cutadapted_yycH
#Taxonomic database setup and classification
#- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB.
#- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
#- ST classification of samples was performed using the g216 target sequence as the primary identifier.
#- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”.
#install.packages("pls")
#library(pls)
#install.packages("reshape")
#library(reshape)
#install.packages("vegan")
library('vegan')
library(scales)
library(RColorBrewer)
#~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls
setwd("/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98_2")
#under r4-base
source("../epidome/scripts/epidome_functions.R")
ST_amplicon_table = read.table("../epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")
epi01_table = read.table("g216_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
#> sum(epi01_table$AH.LH)
#[1] 78872
#> sum(epi02_table$AH.LH)
#[1] 86949
metadata_table = read.table("../metadata.txt",header=TRUE,row.names=1)
metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK", "MC","MR","PT2","RP","SA","XN"))
#epidome_object = setup_epidome_object(epi01_table,epi02_table, metadata_table=metadata_table)
primer1_table <- epi01_table
primer2_table <- epi02_table
#setup_epidome_object <- function(primer1_table,primer2_table,metadata_table) {
#DEBUG: 3:ncol --> 2:ncol
primer1_counts = primer1_table[,2:ncol(primer1_table)]
primer2_counts = primer2_table[,2:ncol(primer2_table)]
primer1_all_sample_names = colnames(primer1_counts)
primer2_all_sample_names = colnames(primer2_counts)
samples_with_both_primers = primer1_all_sample_names[which(primer1_all_sample_names %in% primer2_all_sample_names)]
samples_missing_primer1_data = primer2_all_sample_names[which(!primer2_all_sample_names %in% primer1_all_sample_names)]
samples_missing_primer2_data = primer1_all_sample_names[which(!primer1_all_sample_names %in% primer2_all_sample_names)]
primer1_seqs = as.vector(primer1_table$Seq_number)
primer2_seqs = as.vector(primer2_table$Seq_number)
primer1_seqs[which(is.na(primer1_seqs))] = "seqUnclassified"
primer2_seqs[which(is.na(primer2_seqs))] = "seqUnclassified"
primer1_counts = primer1_table[,which(colnames(primer1_table) %in% samples_with_both_primers)]
primer2_counts = primer2_table[,which(colnames(primer2_table) %in% samples_with_both_primers)]
if (!missing(metadata_table)) {
metadata_names = rownames(metadata_table)
samples_missing_metadata = samples_with_both_primers[which(!samples_with_both_primers %in% metadata_names)]
samples_missing_primer_data = metadata_names[which(!metadata_names %in% samples_with_both_primers)]
include_names = metadata_names[which(metadata_names %in% samples_with_both_primers)]
metadata_include = match(include_names,metadata_names)
metadata_include = metadata_include[which(!is.na(metadata_include))]
metadata_table = metadata_table[metadata_include,]
#primer1_include = match(colnames(primer1_counts),include_names)
primer1_include = match(include_names,colnames(primer1_counts))
primer1_include = primer1_include[which(!is.na(primer1_include))]
#primer2_include = match(colnames(primer2_counts),include_names)
primer2_include = match(include_names,colnames(primer2_counts))
primer2_include = primer2_include[which(!is.na(primer2_include))]
epi1_table = primer1_counts[,primer1_include]
epi2_table = primer2_counts[,primer2_include]
epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'metadata'=metadata_table,'sample_names'=include_names,'meta_variables'=colnames(metadata_table))
print(paste0("Metadata loaded with ",length(metadata_names)," samples and ",ncol(metadata_table)," variables"))
print(paste0(length(samples_missing_metadata)," samples are found in both primer sets but not in metadata: ",paste0(samples_missing_metadata,collapse = " ")))
print(paste0(length(samples_missing_primer_data)," samples are found in metadata but is missing from one or both primer sets: ",paste0(samples_missing_primer_data,collapse = " ")))
print(paste0(length(include_names)," samples are found in metadata and both tables and are included in epidome object"))
} else {
epi1_table = primer1_counts[,match(colnames(primer1_counts),samples_with_both_primers)]
epi2_table = primer2_counts[,match(colnames(primer2_counts),samples_with_both_primers)]
epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'sample_names'=samples_with_both_primers)
print(paste0("No metadata loaded"))
print(paste0(length(samples_missing_primer2_data)," samples are found in p1 table but not in p2 table: ",paste0(samples_missing_primer2_data,collapse = " ")))
print(paste0(length(samples_missing_primer1_data)," samples are found in p2 table but not in p1 table: ",paste0(samples_missing_primer1_data,collapse = " ")))
print(paste0(length(samples_with_both_primers)," samples are found in both tables and are included in epidome object"))
}
#return(epidome_object)
#}
str(epidome_object)
#List of 7
# $ p1_seqs : chr [1:57] "seq28" "seq5" "seq40" "seq31,30" ...
# $ p1_table :'data.frame': 57 obs. of 51 variables:
# $ p2_seqs : chr [1:53] "seq34" "seq43" "seq37,36" "seq42" ...
# $ p2_table :'data.frame': 53 obs. of 51 variables:
# $ metadata :'data.frame': 51 obs. of 4 variables:
# $ sample_names : chr [1:51] "P7.Nose" "P8.Nose" "P9.Nose" "P10.Nose" ...
# $ meta_variables: chr [1:4] "patient.ID" "sample.site" "sample.type" "patient.sample.site"
unique(sort(epidome_object$p1_seqs)) # 57
#[1] "seq28" "seq5" "seq40" "seq31,30" "seq14"
#[6] "seq20" "seq22" "seq26" "seq37" "seq21"
#[11] "seq29" "seq24" "seq1" "seq26" "seq40"
#[16] "seq21" "seq18" "seq40" "seq16" "seq37"
#[21] "seq5,2" "seq18" "seq11" "seq14" "seq26"
#[26] "seq31,30" "seq37" "seq26" "seq1" "seq21"
#[31] "seq3" "seq40" "seq26" "seq28" "seq40"
#[36] "seq10" "seq5" "seq40,31,30" "seq37" "seq37"
#[41] "seq37" "seq40" "seq28" "seq26" "seq37"
#[46] "seq22" "seq28" "seq19" "seq5,2" "seq26"
#[51] "seq40" "seq40" "seq26" "seq40" "seq28"
#[56] "seq28" "seq28"
unique(sort(epidome_object$p2_seqs))
#Image1
primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
png("image1.png")
primer_compare$plot
dev.off()
eo_ASV_combined = combine_ASVs_epidome(epidome_object)
#> str(eo_ASV_combined)
#List of 7
#$ p1_seqs : chr [1:21] "seq28" "seq5" "seq40" "seq31,30" ...
#$ p1_table :'data.frame': 21 obs. of 51 variables:
#> eo_ASV_combined$p1_seqs #21
#[1] "seq28" "seq5" "seq40" "seq31,30" "seq14"
#[6] "seq20" "seq22" "seq26" "seq37" "seq21"
#[11] "seq29" "seq24" "seq1" "seq18" "seq16"
#[16] "seq5,2" "seq11" "seq3" "seq10" "seq40,31,30"
#[21] "seq19"
#eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)
#Note that we run the following code instead run the method from the script due to modified code:
count_table_checked = classify_epidome(eo_ASV_combined,ST_amplicon_table)
write.csv(count_table_checked, file="count_table_checked.csv")
strict_classifier=FALSE
#classify_epidome = function(epidome_object,ST_amplicon_table,strict_classifier=FALSE) {
#Adapt the code to the true epi01 and epi02 values, please give the complete code:
epidome_object_norm = normalize_epidome_object(eo_ASV_combined)
p1_seqs = unlist(lapply(as.vector(eo_ASV_combined$p1_seqs),function(x) substr(x,4,nchar(x))))
#> p1_seqs
#[1] "28" "5" "40" "31,30" "14" "20"
#[7] "22" "26" "37" "21" "29" "24"
#[13] "1" "26" "40" "21" "18" "40"
#[19] "16" "37" "5,2" "18" "11" "14"
#[25] "26" "31,30" "37" "26" "1" "21"
#[31] "3" "40" "26" "28" "40" "10"
#[37] "5" "40,31,30" "37" "37" "37" "40"
#[43] "28" "26" "37" "22" "28" "19"
#[49] "5,2" "26" "40" "40" "26" "40"
#[55] "28" "28" "28"
p2_seqs = unlist(lapply(as.vector(eo_ASV_combined$p2_seqs),function(x) substr(x,4,nchar(x))))
n_samples = length(eo_ASV_combined$sample_names)
n_p1_seqs = length(p1_seqs)
count_table = matrix(nrow = 0, ncol = n_samples,dimnames = list('ST'=c(),'Samples'=eo_ASV_combined$sample_names))
match_type_table = matrix(nrow = n_p1_seqs, ncol = n_samples)
count_table_names = c()
unclassified_count_vec = rep(0,n_samples)
g1_unclassified_count_vec = rep(0,n_samples)
for (i in 1:n_p1_seqs) {
#i=4: "31,30" --> length(p1_seq_split)>1 --> possible_STs==("-","-","8","215") --> Unclassified!
p1_seq = p1_seqs[i]
p1_seq_split = strsplit(p1_seq,',')[[1]]
if (length(p1_seq_split)>1) {
possible_STs = as.vector(ST_amplicon_table$ST)[which(ST_amplicon_table$epi01_ASV %in% p1_seq_split)]
if (length(possible_STs)==1) {
p1_seq = p1_seq_split[1]
} else {
p1_seq = "Unclassified"
}
}
if (p1_seq != "Unclassified") {
p1_seq_ST_table = ST_amplicon_table[which(ST_amplicon_table$epi01_ASV==p1_seq),]
unique_p1_ASVs = unique(as.vector(p1_seq_ST_table$ST))
p2_ASVs_matching_p1 = p1_seq_ST_table$epi02_ASV
count_vec = rep(0,n_samples)
for (j in 1:n_samples) {
p1_percent = epidome_object_norm$p1_table[i,j]
p1_count = eo_ASV_combined$p1_table[i,j]
if (p1_percent > 0.01) {
p2_seqs_present_within_difference_threshold_idx = which(epidome_object_norm$p2_table[,j]>(p1_percent-10) & epidome_object_norm$p2_table[,j]>1)
p2_seqs_present_ASVs = p2_seqs[p2_seqs_present_within_difference_threshold_idx]
p2_seqs_present_ASVs_matching_p1 = p2_seqs_present_ASVs[which(p2_seqs_present_ASVs %in% p2_ASVs_matching_p1)]
p2_percent = epidome_object_norm$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
p2_count = eo_ASV_combined$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
if (length(p2_seqs_present_ASVs_matching_p1) == 0) {
ST_order = order(p1_seq_ST_table$freq,decreasing = T)
classification_group = as.vector(p1_seq_ST_table$ST)[ST_order[1]]
if (classification_group %in% count_table_names) {
classification_idx = which(count_table_names == classification_group)
count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
} else {
count_vec = rep(0,n_samples)
count_vec[j] = p1_count
count_table = rbind(count_table,count_vec)
count_table_names = c(count_table_names,classification_group)
}
#match_type_table[i,j] = "epi01 match without epi02 match"
# Replace the match type with actual epi01 and epi02 values
match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:NA", " ", classification_group)
}
else if (length(p2_seqs_present_ASVs_matching_p1) == 1) {
p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV==p2_seqs_present_ASVs_matching_p1[1]),]
ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
if (classification_group %in% count_table_names) {
classification_idx = which(count_table_names == classification_group)
count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
} else {
count_vec = rep(0,n_samples)
count_vec[j] = p1_count
count_table = rbind(count_table,count_vec)
count_table_names = c(count_table_names,classification_group)
}
#match_type_table[i,j] = "Unique epi01 epi02 combination"
# Replace the match type with actual epi01 and epi02 values
match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", p2_seqs_present_ASVs_matching_p1[1], " ", classification_group)
} else {
if (strict_classifier) {
unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
} else {
p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV %in% p2_seqs_present_ASVs_matching_p1),]
ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
if (classification_group %in% count_table_names) {
classification_idx = which(count_table_names == classification_group)
count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
} else {
count_vec = rep(0,n_samples)
count_vec[j] = p1_count
count_table = rbind(count_table,count_vec)
count_table_names = c(count_table_names,classification_group)
}
}
##unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
#match_type_table[i,j] = c("Non unique epi01 epi02 combination")
# Concatenate all found epi02 values and replace the match type
epi02_values = paste(p2_seqs_present_ASVs_matching_p1, collapse = ",")
match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", epi02_values, " ", classification_group)
}
} else {
match_type_table[i,j] = "Low counts"
unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
}
}
} else {
count_vec = as.numeric(as.vector(eo_ASV_combined$p1_table[i,]))
unclassified_count_vec = unclassified_count_vec+count_vec
g1_unclassified_count_vec = g1_unclassified_count_vec+count_vec
match_type_vec = rep("Unclassified epi01 match",n_samples)
match_type_table[i,] = match_type_vec
}
}
count_table = rbind(count_table,unclassified_count_vec)
rownames(count_table) = c(count_table_names,"Unclassified")
count_table_df <- as.data.frame(count_table)
write.csv(count_table_df, file="count_table_df.csv")
#TODO: delete the last record "Unclassified" due to the few reads!
#count_table_df = count_table_df[-32,]
# --> FOR STEP4: prepare the file match_type_table.csv for treeHeatmap drawing.
write.csv(match_type_table, file = "match_type_table.csv", row.names = TRUE)
count_table_df_ordered = count_table_df[order(rowSums(count_table_df),decreasing = T),]
row_sums <- apply(count_table_df_ordered, MARGIN=1, FUN=sum)
#TODO: the ST in which the counts <= 500 not shown!
#NOTE: ST225 is "epi01:3 epi02:3 225"!
# Since "epi01:3 epi02:NA 225","epi01:3 epi02:NA 225" are 225 disappeared in the match_type_table.csv_____
#Note that epi01 is g216, epi02 is yycH.
# 215 - 8 59 130 297
# 611637 401179 341187 337409 316389 316273
# 331 2 73 384 200 5
# 178616 125734 99899 97885 93613 92385
# 14 218 23 278 487 290
# 71417 62467 60192 57213 53630 51454
# 87 89 100 329 153 83
# 49000 23390 21880 21635 19453 10539
# 19 170 225* 570 184 88
# 9465 1839 1487 863 437 318
# 114 Unclassified
# 108 11
#> row_sums
# 215 - 59 Unclassified 297 130
# 611915 442192 359708 357868 335880 315734
# 5 73 331 384 200 14
# 145578 134301 127734 126465 94917 85440
# 2 218 23 278 290 87
# 85438 62492 60192 57213 51363 49000
# 89 100 329 210 83 19
# 23390 21880 21635 12663 10642 9465
# 170 225 570 88 114 10
# 1839 1487 863 318 108 91
# > unique(sort(eo_ASV_combined$p1_seqs))
# [1] "seq1" "seq10" "seq11" "seq14" "seq16" "seq18" "seq19" "seq2" "seq20"
# [10] "seq21" "seq22" "seq24" "seq26" "seq28" "seq29" "seq3" "seq31" "seq37"
# [19] "seq40" "seq5"
# > unique(sort(eo_ASV_combined$p2_seqs))
# [1] "seq10" "seq11" "seq12" "seq14" "seq15" "seq18" "seq19" "seq2" "seq21"
# [10] "seq23" "seq25" "seq26" "seq3" "seq34" "seq35" "seq36" "seq38" "seq42"
# [19] "seq43" "seq49" "seq5" "seq52" "seq53" "seq55" "seq56" "seq61" "seq62"
# [28] "seq63" "seq65" "seq7"
#row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST184","ST5","ST59","ST83","ST487","ST114","ST8","ST297","ST153","ST384","ST2","ST14","ST89","ST570","-","ST290","ST331","ST73","ST88","ST100","ST87","ST23","ST218","ST329","ST19","ST225","ST170")
#row.names(count_table_df) <- c("215","130","278","200","184","5","59","83","487","114","8","297","153","384","2","14","89","570","-","290","331","73","88","100","87","23","218","329","19","225","170")
#row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")
col_order <- c("P7.Nose","P8.Nose","P9.Nose","P10.Nose","P11.Nose","P12.Nose","P13.Nose","P14.Nose","P15.Nose", "P16.Foot","P17.Foot","P18.Foot","P19.Foot","P20.Foot","P16.Nose","P17.Nose","P18.Nose","P19.Nose","P20.Nose", "AH.LH","AH.NLH","AH.Nose", "AL.LH","AL.NLH","AL.Nose", "CB.LH","CB.NLH","CB.Nose", "HR.LH","HR.NLH","HR.Nose", "KK.LH","KK.NLH","KK.Nose", "MC.LH","MC.NLH","MC.Nose", "MR.LH","MR.NLH","MR.Nose", "PT2.LH","PT2.NLH","PT2.Nose", "RP.LH","RP.NLH","RP.Nose", "SA.LH","SA.NLH","SA.Nose", "XN.LH","XN.NLH","XN.Nose")
count_table_df_reordered <- count_table_df_ordered[,col_order]
p = make_barplot_epidome(count_table_df_reordered,reorder=FALSE,normalize=TRUE)
#p = make_barplot_epidome(count_table_df_reordered,reorder=TRUE,normalize=TRUE)
png("Barplot_All.png", width=1600, height=900)
p
dev.off()
#Change rowname from '-' to 'Novel'
rownames(count_table_df_reordered)[rownames(count_table_df_reordered) == "-"] <- "Novel"
write.csv(file="count_table_df_reordered.txt", count_table_df_reordered)
prepare data for plotTreeHeatmap: table+tree.
#replace "," to \n in match_type_table.csv
sed 's/\",\"/\n/g' match_type_table.csv > match_type_table.csv_
grep "epi02" match_type_table.csv_ > match_type_table.csv__
grep -v "NA" match_type_table.csv__ > match_type_table.csv___
#(Deprecated) awk -F' ' '{ split($2, a, ":"); split(a[2], b, ","); print $1 " epi02:" b[1] " " $3 }' match_type_table.csv___ > match_type_table.csv____
#(Deprecated) sed -i -e 's/\"//g' match_type_table.csv____
#(Deprecated) sort match_type_table.csv____ -u > match_type_table.csv_____
#Manually select and merge items from match_type_table.csv___ and save it to match_type_table.csv____ (see content as below)!
epi01:1 epi02:36 2
epi01:3 epi02:3 225
epi01:5 epi02:34 130
epi01:5 epi02:3 278
epi01:5 epi02:43 200
epi01:5 epi02:36 184
epi01:11 epi02:19 19
epi01:14 epi02:34 297
epi01:14 epi02:36 153
epi01:16 epi02:56 329
epi01:16 epi02:55 329
epi01:18 epi02:14 218
epi01:19 epi02:43 170
epi01:2 epi02:15 -
epi01:20 epi02:36 2
epi01:20 epi02:43 14
epi01:20 epi02:42 89
epi01:20 epi02:34 384
epi01:20 epi02:11 570
epi01:21 epi02:34 100
epi01:21 epi02:63 290
epi01:24 epi02:38 23
epi01:24 epi02:2 87
epi01:26 epi02:34 -
epi01:26 epi02:53 290
epi01:26 epi02:63 331
epi01:28 epi02:43 215
epi01:29 epi02:43 215
epi01:29 epi02:55 -
epi01:31 epi02:43 8
epi01:37 epi02:36 2
epi01:37 epi02:26 88
epi01:37 epi02:43 73
epi01:40 epi02:2 5
epi01:40 epi02:34 5
epi01:40 epi02:34 59
epi01:40 epi02:26 114
epi01:40 epi02:43 83
epi01:40 epi02:36 487
#---- v2 ('-' three times, ST215, ST290, ST329, ST59 two times) --> delete 'epi01:28 epi02:43 215' ----
epi01:1 epi02:34 -
epi01:3 epi02:3 225
epi01:5 epi02:3 278
epi01:5 epi02:34 130
epi01:5 epi02:43 200
epi01:11 epi02:19 19
epi01:14 epi02:34 297
epi01:16 epi02:55 329
epi01:16 epi02:56 329
epi01:18 epi02:14 218
epi01:19 epi02:43 170
epi01:20 epi02:42 89
epi01:20 epi02:43 14
epi01:20 epi02:34 384
epi01:20 epi02:11 570
epi01:20 epi02:3 210
epi01:21 epi02:38 10
epi01:21 epi02:34 100
epi01:21 epi02:63 290
epi01:24 epi02:38 23
epi01:24 epi02:2 87
epi01:26 epi02:34 -
epi01:26 epi02:53 290
epi01:26 epi02:63 331
epi01:29 epi02:43 215
epi01:29 epi02:55 -
epi01:37 epi02:62 2
epi01:37 epi02:26 88
epi01:37 epi02:43 73
epi01:37 epi02:34 59
epi01:40 epi02:2 5
epi01:40 epi02:26 114
epi01:40 epi02:43 83
epi01:40 epi02:34 59
python3 concat_epi01_epi02.py
# Define the function to extract fasta sequences
def fasta_to_dict(filename):
sequences = {}
header = None
sequence = []
with open(filename, 'r') as file:
for line in file:
line = line.strip()
if line.startswith('>'):
if header:
sequences[header] = ''.join(sequence)
header = line[1:]
sequence = []
else:
sequence.append(line)
if header:
sequences[header] = ''.join(sequence)
return sequences
# Read combinations from match_type_table.csv____
combinations = []
with open("match_type_table.csv____", 'r') as csv_file:
for line in csv_file:
parts = line.strip().split()
epi01 = parts[0].split(":")[1]
epi02 = parts[1].split(":")[1]
st = parts[2]
combinations.append((epi01, epi02, st))
# Read both fasta files into dictionaries
g216_sequences = fasta_to_dict("../epidome/DB/epi01_ref_aln.fasta")
yycH_sequences = fasta_to_dict("../epidome/DB/epi02_ref_aln.fasta")
output_filename = "concatenated_sequences.fasta"
with open(output_filename, 'w') as output_file:
for epi01, epi02, st in combinations:
if epi01 in g216_sequences and epi02 in yycH_sequences:
concatenated_seq = g216_sequences[epi01] + "NNN" + yycH_sequences[epi02]
header = f">g216-{epi01}_yycH-{epi02}_ST{st}"
output_file.write(f"{header}\n{concatenated_seq}\n")
print(f"Concatenated sequences saved to {output_filename}")
sed -i -e 's/ST-/-/g' concatenated_sequences.fasta
muscle -in concatenated_sequences.fasta -out aligned.fasta
muscle -clw -in concatenated_sequences.fasta -out aligned.clw
FastTree -nt < aligned.fasta > concatenated_sequences.tree
# Run plotTree.R to draw ggtree_and_gheatmap.png.
library(ggtree)
library(ggplot2)
setwd("/home/jhuang/DATA/Data_Luise_Epidome_batch3/run_2023_98_2/")
# -- edit tree --
#https://icytree.org/
info <- read.csv("typing_34.csv")
info$name <- info$Isolate
tree <- read.tree("concatenated_sequences.tree")
#cols <- c(infection='purple2', commensalism='skyblue2')
library(dplyr)
heatmapData2 <- info %>% select(Isolate, g216, ST)
rn <- heatmapData2$Isolate
heatmapData2$Isolate <- NULL
heatmapData2 <- as.data.frame(sapply(heatmapData2, as.character))
rownames(heatmapData2) <- rn
#https://bookdown.org/hneth/ds4psy/D-3-apx-colors-basics.html
#"g216:16","g216:18","g216:19","g216:20"(4),"g216:21"(2),"g216:24"(1),"g216:26"(2)
heatmap.colours <- c("darkgrey", "palegreen","seagreen","seagreen","seagreen4","seagreen4","yellowgreen", "orange4","orange4","orange4","orange4","orange4", "dodgerblue3","blue4","dodgerblue3","dodgerblue3", "azure4", "magenta2","maroon2","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1", "mediumorchid3","mediumorchid3","mediumorchid3","mediumpurple1","mediumpurple4","mediumpurple4", "red4", "olivedrab3","olivedrab4","seagreen","seagreen4","yellowgreen", "orange4", "blue4","dodgerblue3", "magenta2","maroon2","mediumorchid1", "mediumorchid3","mediumpurple1","mediumpurple4", "red4")
names(heatmap.colours) <- c("-", "ST225","ST130","ST278","ST19","ST200", "ST297", "ST5","ST59","ST114","ST83","ST487", "ST2","ST215","ST73","ST88", "ST8", "ST329","ST170", "ST89","ST14","ST570","ST384","ST210", "ST10","ST100","ST290","ST23","ST87", "ST331", "ST218", "g216:1","g216:3","g216:5","g216:11","g216:14", "g216:40", "g216:29","g216:37", "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26", "g216:18")
#mydat$Regulation <- factor(mydat$Regulation, levels=c("up","down"))
#circular
#geom_tippoint(aes(color=Type)) + scale_color_manual(values=cols) +
p <- ggtree(tree, layout='circular', branch.length='none') %<+% info + geom_tiplab2(aes(label=name), offset=1)
#, geom='text', align=TRUE, linetype=NA, hjust=1.8,check.overlap=TRUE, size=3.3
#difference between geom_tiplab and geom_tiplab2?
#+ theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) + theme(axis.text = element_text(size = 20)) + scale_size(range = c(1, 20))
#font.size=10,
png("ggtree.png", width=1260, height=1260)
#svg("ggtree.svg", width=1260, height=1260)
p
dev.off()
desired_order <- c("-", "ST225","ST130","ST278","ST19","ST200", "ST297", "ST5","ST59","ST114","ST83","ST487", "ST2","ST215","ST73","ST88", "ST8", "ST329","ST170", "ST89","ST14","ST570","ST384","ST210", "ST10","ST100","ST290","ST23","ST87", "ST331", "ST218", "g216:1","g216:3","g216:5","g216:11","g216:14", "g216:40", "g216:29","g216:37", "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26", "g216:18")
png("ggtree_and_gheatmap.png", width=1290, height=1000)
#svg("ggtree_and_gheatmap.svg", width=17, height=15)
gheatmap(p, heatmapData2, width=0.15,colnames_position="top", colnames_angle=90, colnames_offset_y = 0.1, hjust=0.5, font.size=4, offset = 9) + scale_fill_manual(values=heatmap.colours, breaks=desired_order) + theme(legend.text = element_text(size = 14)) + theme(legend.title = element_text(size = 14)) + guides(fill=guide_legend(title=""), color = guide_legend(override.aes = list(size = 5)))
dev.off()
prepre the table for the alpha diversity calculation
To calculate the alpha diversity values for each row based on the provided metrics (chao1, observed_otus, shannon, PD_whole_tree), you'll typically use specialized software packages such as QIIME or R packages like vegan. Here, I can provide a high-level approach and calculations for some of the metrics. However, some of the metrics like PD_whole_tree require phylogenetic trees, which can't be derived from the table you provided.
import pandas as pd
import numpy as np
# Read the data from CSV
df = pd.read_csv('count_table_df_reordered.csv', index_col=0)
def chao1(s):
s = np.array(s)
n = np.sum(s)
s_obs = np.count_nonzero(s)
s1 = np.sum(s == 1)
s2 = np.sum(s == 2)
return s_obs + ((s1 ** 2) / (2 * s2)) if s2 != 0 else s_obs
def observed_sts(s):
return np.count_nonzero(s)
def shannon2(s):
s = np.array(s)
p = s / np.sum(s)
return -np.sum(p * np.log(p + 1e-10)) # added small value to avoid log(0)
def shannon(s):
s = np.array(s)
total = np.sum(s)
p = s / total # Convert absolute counts to proportions
return -np.sum(p * np.log(p + 1e-10)) # added small value to avoid log(0)
# Calculate the metrics for each sample (column)
results = {
'sample': [],
'chao1': [],
'observed_sts': [],
'shannon': []
}
for column in df.columns:
results['sample'].append(column)
results['chao1'].append(chao1(df[column]))
results['observed_sts'].append(observed_sts(df[column]))
results['shannon'].append(shannon(df[column]))
# Convert results to DataFrame and save to a new CSV
result_df = pd.DataFrame(results)
result_df.to_csv('alpha_diversity_metrics_samples.csv', index=False)
For PD_whole_tree, you would require a phylogenetic tree of the OTUs and then compute the sum of branch lengths. This is more complex and requires specialized software and the phylogenetic tree information, which isn't present in the table you provided.
The input data being absolute counts versus relative abundance does influence certain alpha diversity metrics. Let's re-evaluate:
Observed OTUs: Whether the data is in absolute counts or relative abundance doesn't impact the "Observed OTUs" metric. It's simply counting how many OTUs have a non-zero count.
Chao1: Chao1 is intended for absolute counts. The metric is based on the number of singletons and doubletons, which are derived from raw counts and not relative abundances. Hence, the calculation remains accurate.
Shannon Diversity Index: The Shannon Diversity Index is based on the relative abundances of OTUs, not their absolute counts. The formula requires the proportion of each species (or OTU) in the sample. Thus, you'd need to convert absolute counts to proportions (or relative abundance) before calculating this metric.
calculate the alpha diversity in Phyloseq.Rmd. The code for the calculation is as follows.
\pagebreak
# Alpha diversity for ST
Plot Shannon index and observed STs.
Regroup together samples from the same group.
```{r, echo=TRUE, warning=FALSE}
hmp.div_st <- read.csv("alpha_diversity_metrics_samples.csv", sep=",")
colnames(hmp.div_st) <- c("sample", "chao1", "observed_sts", "shannon")
row.names(hmp.div_st) <- hmp.div_st$sample
div.df <- merge(hmp.div_st, hmp.meta, by.x="sample", by.y = "sam_name")
div.df2 <- div.df[, c("Group", "shannon", "observed_sts")]
colnames(div.df2) <- c("Group", "Shannon", "ST")
#colnames(div.df2)
options(max.print=999999)
stat.test.Shannon <- compare_means(
Shannon ~ Group, data = div.df2,
method = "t.test"
)
knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
stat.test.ST <- compare_means(
ST ~ Group, data = div.df2,
method = "t.test"
)
knitr::kable(stat.test.ST) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
div_df_melt <- reshape2::melt(div.df2)
p <- ggboxplot(div_df_melt, x = "Group", y = "value",
facet.by = "variable",
scales = "free",
width = 0.5,
fill = "gray", legend= "right")
#ggpar(p, xlab = FALSE, ylab = FALSE)
lev <- levels(factor(div_df_melt$Group)) # get the variables
L.pairs <- combn(seq_along(lev), 2, simplify = FALSE, FUN = function(i) lev[i]) #%>% filter(p.signif != "ns")
my_stat_compare_means <- function (mapping = NULL, data = NULL, method = NULL, paired = FALSE,
method.args = list(), ref.group = NULL, comparisons = NULL,
hide.ns = FALSE, label.sep = ", ", label = NULL, label.x.npc = "left",
label.y.npc = "top", label.x = NULL, label.y = NULL, tip.length = 0.03,
symnum.args = list(), geom = "text", position = "identity",
na.rm = FALSE, show.legend = NA, inherit.aes = TRUE, ...)
{
if (!is.null(comparisons)) {
method.info <- ggpubr:::.method_info(method)
method <- method.info$method
method.args <- ggpubr:::.add_item(method.args, paired = paired)
if (method == "wilcox.test")
method.args$exact <- FALSE
pms <- list(...)
size <- ifelse(is.null(pms$size), 0.3, pms$size)
color <- ifelse(is.null(pms$color), "black", pms$color)
map_signif_level <- FALSE
if (is.null(label))
label <- "p.format"
if (ggpubr:::.is_p.signif_in_mapping(mapping) | (label %in% "p.signif")) {
if (ggpubr:::.is_empty(symnum.args)) {
map_signif_level <- c(`****` = 1e-04, `***` = 0.001,
`**` = 0.01, `*` = 0.05, ns = 1)
} else {
map_signif_level <- symnum.args
}
if (hide.ns)
names(map_signif_level)[5] <- " "
}
step_increase <- ifelse(is.null(label.y), 0.12, 0)
ggsignif::geom_signif(comparisons = comparisons, y_position = label.y,
test = method, test.args = method.args, step_increase = step_increase,
size = size, color = color, map_signif_level = map_signif_level,
tip_length = tip.length, data = data)
} else {
mapping <- ggpubr:::.update_mapping(mapping, label)
layer(stat = StatCompareMeans, data = data, mapping = mapping,
geom = geom, position = position, show.legend = show.legend,
inherit.aes = inherit.aes, params = list(label.x.npc = label.x.npc,
label.y.npc = label.y.npc, label.x = label.x,
label.y = label.y, label.sep = label.sep, method = method,
method.args = method.args, paired = paired, ref.group = ref.group,
symnum.args = symnum.args, hide.ns = hide.ns,
na.rm = na.rm, ...))
}
}
p3 <- p +
stat_compare_means(
method="t.test",
comparisons = list(c("P16-P20.Foot", "P16-P20.Nose"), c("AH-XN.LH", "AH-XN.Nose"), c("AH-XN.NLH", "AH-XN.Nose")),
label = "p.signif",
symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),
) +
theme(axis.text.x = element_text(angle = 30, hjust = 1)) # add this line to rotate x-axis text
print(p3)
ggsave("./figures/alpha_diversity_ST_Group2.png", device="png", height = 10, width = 12)
ggsave("./figures/alpha_diversity_ST_Group2.svg", device="svg", height = 10, width = 12)
```
点赞本文的读者
还没有人对此文章表态
没有评论
RNA-seq skin organoids on GRCh38+chrHsv1 (final)
MicrobiotaProcess Group2 vs Group6 (v1)
Small RNA sequencing processing in the example of smallRNA_7
© 2023 XGenes.com Impressum