-
(namely epidome-processing step 2.6) Classification using 98% similarity.
#Input Files: yycH_seqtab_nochim.csv and ../epidome/DB/yycH_ref_aln.fasta #Output Files: yycH_seqtab_ASV_seqs.fasta, yycH_seqtab_ASV_blast.txt, and yycH_seqtab.csv.classified.csv python3 ../epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta ../epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 98 python3 ../epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta ../epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 98
-
Manually check the reults, ensuring no seq34;seq35 occurs.
#Delete all records in *_seqtab.csv.classified.csv the alignment length < 448 (delete records 110 and 111) in g216, or < 440 (del record 90) in yycH. #Confirm the results jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 g216_seqtab_ASV_blast.txt | sort -u 446 447 448 jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 yycH_seqtab_ASV_blast.txt | sort -u 475 476 #It is not neccesary to delete correponding records 90 in yycH_* and g216_seqtab_ASV_blast.txt, since the two files are not used. #sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt #length=476 #sed -i -e s/seq//g g216_seqtab_ASV_blast.txt #length=448 ##;-->"" #sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt #sed -i -e s/';'//g g216_seqtab_ASV_blast.txt cp g216_seqtab.csv.classified.csv g216_seqtab.csv.classified.backup cp yycH_seqtab.csv.classified.csv yycH_seqtab.csv.classified.backup # (Option 1): Processing *.classified.csv, ensuring the format seq24,21 or seq24. #https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv ##DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner. #Final goal: "seqseq31;,seq30;" --> "seq31,30" #;,seq --> , sed -i -e s/";,seq"/","/g g216_seqtab.csv.classified.csv sed -i -e s/";,seq"/","/g yycH_seqtab.csv.classified.csv #;"; --> "; sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv #confirm! cut -d';' -f3 g216_seqtab.csv.classified.csv cut -d';' -f3 yycH_seqtab.csv.classified.csv # (Option 2): To reduce the unclassified, rename seq31,seq30 --> seq31 in g216, seq37,seq36 --> seq36 in yycH. sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv sed -i -e s/"seq5,seq2"/"seq2"/g sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv sed -i -e s/"seq5,seq2"/"seq2"/g g216_seqtab.csv.classified.csv grep ",seq" g216_seqtab.csv.classified.csv #should return no record. #sed -i -e s/"seq22,20"/"seq20"/g g216_seqtab.csv.classified.csv #sed -i -e s/"seq24,21"/"seq21"/g g216_seqtab.csv.classified.csv #sed -i -e s/"seq21,17"/"seq21"/g g216_seqtab.csv.classified.csv > epidome_object$p1_seqs #vim g216_seqtab.csv.classified_noNA.csv [1] "seq28" "seq5" "seq40" "seq31,30"(seq31) "seq14" [6] "seq20" "seq22" "seq26" "seq37" "seq21" [11] "seq29" "seq24" "seq1" "seq26" "seq40" [16] "seq21" "seq18" "seq40" "seq16" "seq37" [21] "seq5,2" "seq18" "seq11" "seq14" "seq26" [26] "seq31,30"(seq31) "seq37" "seq26" "seq1" "seq21" [31] "seq3" "seq40" "seq26" "seq28" "seq40" [36] "seq10" "seq5" "seq40,31,30"(seq40) "seq37" "seq37" [41] "seq37" "seq40" "seq28" "seq26" "seq37" [46] "seq22" "seq28" "seq19" "seq5,2" "seq26" [51] "seq40" "seq40" "seq26" "seq40" "seq28" [56] "seq28" "seq28" "seq37" "seq26" "seq40" [61] "seq26" "seq37" "seq28" "seq28" "seq1" [66] "seq28" "seq5,2"(seq5) "seq40" "seq21" "seq28" [71] "seq29" #"seq37" "seq22" "seq22,20"(seq20) "seq31,30"(seq31) [76] "seq22" "seq20" "seq28" "seq37" "seq14" [81] "seq28" "seq20" "seq40" "seq28" "seq24,21" [86] "seq40" "seq14" "seq31,30"(seq31) "seq1" "seq1" [91] "seq31,30"(seq31) "seq1" "seq40" "seq20" "seq1" [96] "seq21" "seq26" "seq20" "seq20" "seq21,17" [101] "seq37" "seq26" "seq31,30"(seq31) "seq31,30"(seq31) "seq40" [106] "seq5" "seq12" sed -i -e s/"seq37,seq36"/"seq36"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq58,seq25,seq21,seq10"/"seq21"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq23,seq22"/"seq23"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq65,seq63,seq61,seq60"/"seq63"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq65,seq63,seq53"/"seq63"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq61,seq55"/"seq55"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq34,seq28,seq25,seq10,seq9"/"seq34"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq25,seq10,seq9"/"seq9"/g yycH_seqtab.csv.classified.csv sed -i -e s/"seq25,seq10"/"seq10"/g yycH_seqtab.csv.classified.csv grep ",seq" yycH_seqtab.csv.classified.csv #should return no record. > epidome_object$p2_seqs [1] "34" "43" "seq37,seq36"-->36 "42" [5] "26" "63" "3" "2" [9] "11" "seq65,seq63,seq53"-->63 "14" "38" [13] "55" "3" "65" "seq58,seq25,seq21,seq10"-->21 [17] "seq23,seq22"-->23 "53" "56" "34" [21] "62" "14" "35" "seq61,seq55"-->55 [25] "52" "seq25,seq10"-->10 "seq65,seq63,seq61,seq60"-->63 "21" [29] "55" "15" "34" "49" [33] "19" "63" "63" "34" [37] "43" "12" "34" "seq34,seq28,seq25,seq10,seq9"-->34 [41] "43" "61" "25" "34" [45] "34" "5" "18" "43" [49] "62" "34" "19" "63" [53] "7" -------------- "seq37,seq36"-->36 "seq25,seq10,seq9"-->9 "56" [57] "11" "49" "10" "63" [61] "seq37,seq36"-->36 "14" "55" #------------- "55" [65] "14" "37,36" "43" "45" [69] "34" "34" "43" "43" [73] "25" "14" "65,64,61,56" "15,9" [77] "43" "38" "43" "53" [81] "11" "9" "34" "15" [85] "9" "23,22" "62" "43" [89] "37,36" # Common step after option 1 or option 2. grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv #filtering the small ASV in which the number < 500: INPUT: *_noNA.csv; OUTPUT: *_seqtab.csv.classified_filtered.csv python3 g216_filtering.py import pandas as pd # Load the file data = pd.read_csv('g216_seqtab.csv.classified_noNA.csv', delimiter=';') # Compute row sums excluding the first 2 columns data['row_sum'] = data.iloc[:, 2:].sum(axis=1) print(data.iloc[:, 2:]) print(data.iloc[:, 2:].sum(axis=1)) # Filter out rows where the sum is less than 100 filtered_data = data[data['row_sum'] >= 500].drop(columns=['row_sum']) # Save the resulting dataframe to a new file filtered_data.to_csv('g216_seqtab.csv.classified_filtered.csv', sep=';', index=False) python3 yycH_filtering.py
-
draw plot from three amplicons: cutadapted_g216, cutadapted_yycH
#Taxonomic database setup and classification #- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB. #- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts). #- ST classification of samples was performed using the g216 target sequence as the primary identifier. #- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”. #install.packages("pls") #library(pls) #install.packages("reshape") #library(reshape) #install.packages("vegan") library('vegan') library(scales) library(RColorBrewer) #~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls setwd("/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98_2") #under r4-base source("../epidome/scripts/epidome_functions.R") ST_amplicon_table = read.table("../epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t") epi01_table = read.table("g216_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1) epi02_table = read.table("yycH_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1) #> sum(epi01_table$AH.LH) #[1] 78872 #> sum(epi02_table$AH.LH) #[1] 86949 metadata_table = read.table("../metadata.txt",header=TRUE,row.names=1) metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK", "MC","MR","PT2","RP","SA","XN")) #epidome_object = setup_epidome_object(epi01_table,epi02_table, metadata_table=metadata_table) primer1_table <- epi01_table primer2_table <- epi02_table #setup_epidome_object <- function(primer1_table,primer2_table,metadata_table) { #DEBUG: 3:ncol --> 2:ncol primer1_counts = primer1_table[,2:ncol(primer1_table)] primer2_counts = primer2_table[,2:ncol(primer2_table)] primer1_all_sample_names = colnames(primer1_counts) primer2_all_sample_names = colnames(primer2_counts) samples_with_both_primers = primer1_all_sample_names[which(primer1_all_sample_names %in% primer2_all_sample_names)] samples_missing_primer1_data = primer2_all_sample_names[which(!primer2_all_sample_names %in% primer1_all_sample_names)] samples_missing_primer2_data = primer1_all_sample_names[which(!primer1_all_sample_names %in% primer2_all_sample_names)] primer1_seqs = as.vector(primer1_table$Seq_number) primer2_seqs = as.vector(primer2_table$Seq_number) primer1_seqs[which(is.na(primer1_seqs))] = "seqUnclassified" primer2_seqs[which(is.na(primer2_seqs))] = "seqUnclassified" primer1_counts = primer1_table[,which(colnames(primer1_table) %in% samples_with_both_primers)] primer2_counts = primer2_table[,which(colnames(primer2_table) %in% samples_with_both_primers)] if (!missing(metadata_table)) { metadata_names = rownames(metadata_table) samples_missing_metadata = samples_with_both_primers[which(!samples_with_both_primers %in% metadata_names)] samples_missing_primer_data = metadata_names[which(!metadata_names %in% samples_with_both_primers)] include_names = metadata_names[which(metadata_names %in% samples_with_both_primers)] metadata_include = match(include_names,metadata_names) metadata_include = metadata_include[which(!is.na(metadata_include))] metadata_table = metadata_table[metadata_include,] #primer1_include = match(colnames(primer1_counts),include_names) primer1_include = match(include_names,colnames(primer1_counts)) primer1_include = primer1_include[which(!is.na(primer1_include))] #primer2_include = match(colnames(primer2_counts),include_names) primer2_include = match(include_names,colnames(primer2_counts)) primer2_include = primer2_include[which(!is.na(primer2_include))] epi1_table = primer1_counts[,primer1_include] epi2_table = primer2_counts[,primer2_include] epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'metadata'=metadata_table,'sample_names'=include_names,'meta_variables'=colnames(metadata_table)) print(paste0("Metadata loaded with ",length(metadata_names)," samples and ",ncol(metadata_table)," variables")) print(paste0(length(samples_missing_metadata)," samples are found in both primer sets but not in metadata: ",paste0(samples_missing_metadata,collapse = " "))) print(paste0(length(samples_missing_primer_data)," samples are found in metadata but is missing from one or both primer sets: ",paste0(samples_missing_primer_data,collapse = " "))) print(paste0(length(include_names)," samples are found in metadata and both tables and are included in epidome object")) } else { epi1_table = primer1_counts[,match(colnames(primer1_counts),samples_with_both_primers)] epi2_table = primer2_counts[,match(colnames(primer2_counts),samples_with_both_primers)] epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'sample_names'=samples_with_both_primers) print(paste0("No metadata loaded")) print(paste0(length(samples_missing_primer2_data)," samples are found in p1 table but not in p2 table: ",paste0(samples_missing_primer2_data,collapse = " "))) print(paste0(length(samples_missing_primer1_data)," samples are found in p2 table but not in p1 table: ",paste0(samples_missing_primer1_data,collapse = " "))) print(paste0(length(samples_with_both_primers)," samples are found in both tables and are included in epidome object")) } #return(epidome_object) #} str(epidome_object) #List of 7 # $ p1_seqs : chr [1:57] "seq28" "seq5" "seq40" "seq31,30" ... # $ p1_table :'data.frame': 57 obs. of 51 variables: # $ p2_seqs : chr [1:53] "seq34" "seq43" "seq37,36" "seq42" ... # $ p2_table :'data.frame': 53 obs. of 51 variables: # $ metadata :'data.frame': 51 obs. of 4 variables: # $ sample_names : chr [1:51] "P7.Nose" "P8.Nose" "P9.Nose" "P10.Nose" ... # $ meta_variables: chr [1:4] "patient.ID" "sample.site" "sample.type" "patient.sample.site" unique(sort(epidome_object$p1_seqs)) # 57 #[1] "seq28" "seq5" "seq40" "seq31,30" "seq14" #[6] "seq20" "seq22" "seq26" "seq37" "seq21" #[11] "seq29" "seq24" "seq1" "seq26" "seq40" #[16] "seq21" "seq18" "seq40" "seq16" "seq37" #[21] "seq5,2" "seq18" "seq11" "seq14" "seq26" #[26] "seq31,30" "seq37" "seq26" "seq1" "seq21" #[31] "seq3" "seq40" "seq26" "seq28" "seq40" #[36] "seq10" "seq5" "seq40,31,30" "seq37" "seq37" #[41] "seq37" "seq40" "seq28" "seq26" "seq37" #[46] "seq22" "seq28" "seq19" "seq5,2" "seq26" #[51] "seq40" "seq40" "seq26" "seq40" "seq28" #[56] "seq28" "seq28" unique(sort(epidome_object$p2_seqs)) #Image1 primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type") png("image1.png") primer_compare$plot dev.off() eo_ASV_combined = combine_ASVs_epidome(epidome_object) #> str(eo_ASV_combined) #List of 7 #$ p1_seqs : chr [1:21] "seq28" "seq5" "seq40" "seq31,30" ... #$ p1_table :'data.frame': 21 obs. of 51 variables: #> eo_ASV_combined$p1_seqs #21 #[1] "seq28" "seq5" "seq40" "seq31,30" "seq14" #[6] "seq20" "seq22" "seq26" "seq37" "seq21" #[11] "seq29" "seq24" "seq1" "seq18" "seq16" #[16] "seq5,2" "seq11" "seq3" "seq10" "seq40,31,30" #[21] "seq19" #eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500) #Note that we run the following code instead run the method from the script due to modified code: count_table_checked = classify_epidome(eo_ASV_combined,ST_amplicon_table) write.csv(count_table_checked, file="count_table_checked.csv") strict_classifier=FALSE #classify_epidome = function(epidome_object,ST_amplicon_table,strict_classifier=FALSE) { #Adapt the code to the true epi01 and epi02 values, please give the complete code: epidome_object_norm = normalize_epidome_object(eo_ASV_combined) p1_seqs = unlist(lapply(as.vector(eo_ASV_combined$p1_seqs),function(x) substr(x,4,nchar(x)))) #> p1_seqs #[1] "28" "5" "40" "31,30" "14" "20" #[7] "22" "26" "37" "21" "29" "24" #[13] "1" "26" "40" "21" "18" "40" #[19] "16" "37" "5,2" "18" "11" "14" #[25] "26" "31,30" "37" "26" "1" "21" #[31] "3" "40" "26" "28" "40" "10" #[37] "5" "40,31,30" "37" "37" "37" "40" #[43] "28" "26" "37" "22" "28" "19" #[49] "5,2" "26" "40" "40" "26" "40" #[55] "28" "28" "28" p2_seqs = unlist(lapply(as.vector(eo_ASV_combined$p2_seqs),function(x) substr(x,4,nchar(x)))) n_samples = length(eo_ASV_combined$sample_names) n_p1_seqs = length(p1_seqs) count_table = matrix(nrow = 0, ncol = n_samples,dimnames = list('ST'=c(),'Samples'=eo_ASV_combined$sample_names)) match_type_table = matrix(nrow = n_p1_seqs, ncol = n_samples) count_table_names = c() unclassified_count_vec = rep(0,n_samples) g1_unclassified_count_vec = rep(0,n_samples) for (i in 1:n_p1_seqs) { #i=4: "31,30" --> length(p1_seq_split)>1 --> possible_STs==("-","-","8","215") --> Unclassified! p1_seq = p1_seqs[i] p1_seq_split = strsplit(p1_seq,',')[[1]] if (length(p1_seq_split)>1) { possible_STs = as.vector(ST_amplicon_table$ST)[which(ST_amplicon_table$epi01_ASV %in% p1_seq_split)] if (length(possible_STs)==1) { p1_seq = p1_seq_split[1] } else { p1_seq = "Unclassified" } } if (p1_seq != "Unclassified") { p1_seq_ST_table = ST_amplicon_table[which(ST_amplicon_table$epi01_ASV==p1_seq),] unique_p1_ASVs = unique(as.vector(p1_seq_ST_table$ST)) p2_ASVs_matching_p1 = p1_seq_ST_table$epi02_ASV count_vec = rep(0,n_samples) for (j in 1:n_samples) { p1_percent = epidome_object_norm$p1_table[i,j] p1_count = eo_ASV_combined$p1_table[i,j] if (p1_percent > 0.01) { p2_seqs_present_within_difference_threshold_idx = which(epidome_object_norm$p2_table[,j]>(p1_percent-10) & epidome_object_norm$p2_table[,j]>1) p2_seqs_present_ASVs = p2_seqs[p2_seqs_present_within_difference_threshold_idx] p2_seqs_present_ASVs_matching_p1 = p2_seqs_present_ASVs[which(p2_seqs_present_ASVs %in% p2_ASVs_matching_p1)] p2_percent = epidome_object_norm$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j] p2_count = eo_ASV_combined$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j] if (length(p2_seqs_present_ASVs_matching_p1) == 0) { ST_order = order(p1_seq_ST_table$freq,decreasing = T) classification_group = as.vector(p1_seq_ST_table$ST)[ST_order[1]] if (classification_group %in% count_table_names) { classification_idx = which(count_table_names == classification_group) count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count } else { count_vec = rep(0,n_samples) count_vec[j] = p1_count count_table = rbind(count_table,count_vec) count_table_names = c(count_table_names,classification_group) } #match_type_table[i,j] = "epi01 match without epi02 match" # Replace the match type with actual epi01 and epi02 values match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:NA", " ", classification_group) } else if (length(p2_seqs_present_ASVs_matching_p1) == 1) { p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV==p2_seqs_present_ASVs_matching_p1[1]),] ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T) classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]] if (classification_group %in% count_table_names) { classification_idx = which(count_table_names == classification_group) count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count } else { count_vec = rep(0,n_samples) count_vec[j] = p1_count count_table = rbind(count_table,count_vec) count_table_names = c(count_table_names,classification_group) } #match_type_table[i,j] = "Unique epi01 epi02 combination" # Replace the match type with actual epi01 and epi02 values match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", p2_seqs_present_ASVs_matching_p1[1], " ", classification_group) } else { if (strict_classifier) { unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count } else { p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV %in% p2_seqs_present_ASVs_matching_p1),] ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T) classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]] if (classification_group %in% count_table_names) { classification_idx = which(count_table_names == classification_group) count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count } else { count_vec = rep(0,n_samples) count_vec[j] = p1_count count_table = rbind(count_table,count_vec) count_table_names = c(count_table_names,classification_group) } } ##unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count #match_type_table[i,j] = c("Non unique epi01 epi02 combination") # Concatenate all found epi02 values and replace the match type epi02_values = paste(p2_seqs_present_ASVs_matching_p1, collapse = ",") match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", epi02_values, " ", classification_group) } } else { match_type_table[i,j] = "Low counts" unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count } } } else { count_vec = as.numeric(as.vector(eo_ASV_combined$p1_table[i,])) unclassified_count_vec = unclassified_count_vec+count_vec g1_unclassified_count_vec = g1_unclassified_count_vec+count_vec match_type_vec = rep("Unclassified epi01 match",n_samples) match_type_table[i,] = match_type_vec } } count_table = rbind(count_table,unclassified_count_vec) rownames(count_table) = c(count_table_names,"Unclassified") count_table_df <- as.data.frame(count_table) write.csv(count_table_df, file="count_table_df.csv") #TODO: delete the last record "Unclassified" due to the few reads! #count_table_df = count_table_df[-32,] # --> FOR STEP4: prepare the file match_type_table.csv for treeHeatmap drawing. write.csv(match_type_table, file = "match_type_table.csv", row.names = TRUE) count_table_df_ordered = count_table_df[order(rowSums(count_table_df),decreasing = T),] row_sums <- apply(count_table_df_ordered, MARGIN=1, FUN=sum) #TODO: the ST in which the counts <= 500 not shown! #NOTE: ST225 is "epi01:3 epi02:3 225"! # Since "epi01:3 epi02:NA 225","epi01:3 epi02:NA 225" are 225 disappeared in the match_type_table.csv_____ #Note that epi01 is g216, epi02 is yycH. # 215 - 8 59 130 297 # 611637 401179 341187 337409 316389 316273 # 331 2 73 384 200 5 # 178616 125734 99899 97885 93613 92385 # 14 218 23 278 487 290 # 71417 62467 60192 57213 53630 51454 # 87 89 100 329 153 83 # 49000 23390 21880 21635 19453 10539 # 19 170 225* 570 184 88 # 9465 1839 1487 863 437 318 # 114 Unclassified # 108 11 #> row_sums # 215 - 59 Unclassified 297 130 # 611915 442192 359708 357868 335880 315734 # 5 73 331 384 200 14 # 145578 134301 127734 126465 94917 85440 # 2 218 23 278 290 87 # 85438 62492 60192 57213 51363 49000 # 89 100 329 210 83 19 # 23390 21880 21635 12663 10642 9465 # 170 225 570 88 114 10 # 1839 1487 863 318 108 91 # > unique(sort(eo_ASV_combined$p1_seqs)) # [1] "seq1" "seq10" "seq11" "seq14" "seq16" "seq18" "seq19" "seq2" "seq20" # [10] "seq21" "seq22" "seq24" "seq26" "seq28" "seq29" "seq3" "seq31" "seq37" # [19] "seq40" "seq5" # > unique(sort(eo_ASV_combined$p2_seqs)) # [1] "seq10" "seq11" "seq12" "seq14" "seq15" "seq18" "seq19" "seq2" "seq21" # [10] "seq23" "seq25" "seq26" "seq3" "seq34" "seq35" "seq36" "seq38" "seq42" # [19] "seq43" "seq49" "seq5" "seq52" "seq53" "seq55" "seq56" "seq61" "seq62" # [28] "seq63" "seq65" "seq7" #row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST184","ST5","ST59","ST83","ST487","ST114","ST8","ST297","ST153","ST384","ST2","ST14","ST89","ST570","-","ST290","ST331","ST73","ST88","ST100","ST87","ST23","ST218","ST329","ST19","ST225","ST170") #row.names(count_table_df) <- c("215","130","278","200","184","5","59","83","487","114","8","297","153","384","2","14","89","570","-","290","331","73","88","100","87","23","218","329","19","225","170") #row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified") col_order <- c("P7.Nose","P8.Nose","P9.Nose","P10.Nose","P11.Nose","P12.Nose","P13.Nose","P14.Nose","P15.Nose", "P16.Foot","P17.Foot","P18.Foot","P19.Foot","P20.Foot","P16.Nose","P17.Nose","P18.Nose","P19.Nose","P20.Nose", "AH.LH","AH.NLH","AH.Nose", "AL.LH","AL.NLH","AL.Nose", "CB.LH","CB.NLH","CB.Nose", "HR.LH","HR.NLH","HR.Nose", "KK.LH","KK.NLH","KK.Nose", "MC.LH","MC.NLH","MC.Nose", "MR.LH","MR.NLH","MR.Nose", "PT2.LH","PT2.NLH","PT2.Nose", "RP.LH","RP.NLH","RP.Nose", "SA.LH","SA.NLH","SA.Nose", "XN.LH","XN.NLH","XN.Nose") count_table_df_reordered <- count_table_df_ordered[,col_order] p = make_barplot_epidome(count_table_df_reordered,reorder=FALSE,normalize=TRUE) #p = make_barplot_epidome(count_table_df_reordered,reorder=TRUE,normalize=TRUE) png("Barplot_All.png", width=1600, height=900) p dev.off() #Change rowname from '-' to 'Novel' rownames(count_table_df_reordered)[rownames(count_table_df_reordered) == "-"] <- "Novel" write.csv(file="count_table_df_reordered.txt", count_table_df_reordered)
-
prepare data for plotTreeHeatmap: table+tree.
#replace "," to \n in match_type_table.csv sed 's/\",\"/\n/g' match_type_table.csv > match_type_table.csv_ grep "epi02" match_type_table.csv_ > match_type_table.csv__ grep -v "NA" match_type_table.csv__ > match_type_table.csv___ #(Deprecated) awk -F' ' '{ split($2, a, ":"); split(a[2], b, ","); print $1 " epi02:" b[1] " " $3 }' match_type_table.csv___ > match_type_table.csv____ #(Deprecated) sed -i -e 's/\"//g' match_type_table.csv____ #(Deprecated) sort match_type_table.csv____ -u > match_type_table.csv_____ #Manually select and merge items from match_type_table.csv___ and save it to match_type_table.csv____ (see content as below)! epi01:1 epi02:36 2 epi01:3 epi02:3 225 epi01:5 epi02:34 130 epi01:5 epi02:3 278 epi01:5 epi02:43 200 epi01:5 epi02:36 184 epi01:11 epi02:19 19 epi01:14 epi02:34 297 epi01:14 epi02:36 153 epi01:16 epi02:56 329 epi01:16 epi02:55 329 epi01:18 epi02:14 218 epi01:19 epi02:43 170 epi01:2 epi02:15 - epi01:20 epi02:36 2 epi01:20 epi02:43 14 epi01:20 epi02:42 89 epi01:20 epi02:34 384 epi01:20 epi02:11 570 epi01:21 epi02:34 100 epi01:21 epi02:63 290 epi01:24 epi02:38 23 epi01:24 epi02:2 87 epi01:26 epi02:34 - epi01:26 epi02:53 290 epi01:26 epi02:63 331 epi01:28 epi02:43 215 epi01:29 epi02:43 215 epi01:29 epi02:55 - epi01:31 epi02:43 8 epi01:37 epi02:36 2 epi01:37 epi02:26 88 epi01:37 epi02:43 73 epi01:40 epi02:2 5 epi01:40 epi02:34 5 epi01:40 epi02:34 59 epi01:40 epi02:26 114 epi01:40 epi02:43 83 epi01:40 epi02:36 487 #---- v2 ('-' three times, ST215, ST290, ST329, ST59 two times) --> delete 'epi01:28 epi02:43 215' ---- epi01:1 epi02:34 - epi01:3 epi02:3 225 epi01:5 epi02:3 278 epi01:5 epi02:34 130 epi01:5 epi02:43 200 epi01:11 epi02:19 19 epi01:14 epi02:34 297 epi01:16 epi02:55 329 epi01:16 epi02:56 329 epi01:18 epi02:14 218 epi01:19 epi02:43 170 epi01:20 epi02:42 89 epi01:20 epi02:43 14 epi01:20 epi02:34 384 epi01:20 epi02:11 570 epi01:20 epi02:3 210 epi01:21 epi02:38 10 epi01:21 epi02:34 100 epi01:21 epi02:63 290 epi01:24 epi02:38 23 epi01:24 epi02:2 87 epi01:26 epi02:34 - epi01:26 epi02:53 290 epi01:26 epi02:63 331 epi01:29 epi02:43 215 epi01:29 epi02:55 - epi01:37 epi02:62 2 epi01:37 epi02:26 88 epi01:37 epi02:43 73 epi01:37 epi02:34 59 epi01:40 epi02:2 5 epi01:40 epi02:26 114 epi01:40 epi02:43 83 epi01:40 epi02:34 59 python3 concat_epi01_epi02.py # Define the function to extract fasta sequences def fasta_to_dict(filename): sequences = {} header = None sequence = [] with open(filename, 'r') as file: for line in file: line = line.strip() if line.startswith('>'): if header: sequences[header] = ''.join(sequence) header = line[1:] sequence = [] else: sequence.append(line) if header: sequences[header] = ''.join(sequence) return sequences # Read combinations from match_type_table.csv____ combinations = [] with open("match_type_table.csv____", 'r') as csv_file: for line in csv_file: parts = line.strip().split() epi01 = parts[0].split(":")[1] epi02 = parts[1].split(":")[1] st = parts[2] combinations.append((epi01, epi02, st)) # Read both fasta files into dictionaries g216_sequences = fasta_to_dict("../epidome/DB/epi01_ref_aln.fasta") yycH_sequences = fasta_to_dict("../epidome/DB/epi02_ref_aln.fasta") output_filename = "concatenated_sequences.fasta" with open(output_filename, 'w') as output_file: for epi01, epi02, st in combinations: if epi01 in g216_sequences and epi02 in yycH_sequences: concatenated_seq = g216_sequences[epi01] + "NNN" + yycH_sequences[epi02] header = f">g216-{epi01}_yycH-{epi02}_ST{st}" output_file.write(f"{header}\n{concatenated_seq}\n") print(f"Concatenated sequences saved to {output_filename}") sed -i -e 's/ST-/-/g' concatenated_sequences.fasta muscle -in concatenated_sequences.fasta -out aligned.fasta muscle -clw -in concatenated_sequences.fasta -out aligned.clw FastTree -nt < aligned.fasta > concatenated_sequences.tree # Run plotTree.R to draw ggtree_and_gheatmap.png. library(ggtree) library(ggplot2) setwd("/home/jhuang/DATA/Data_Luise_Epidome_batch3/run_2023_98_2/") # -- edit tree -- #https://icytree.org/ info <- read.csv("typing_34.csv") info$name <- info$Isolate tree <- read.tree("concatenated_sequences.tree") #cols <- c(infection='purple2', commensalism='skyblue2') library(dplyr) heatmapData2 <- info %>% select(Isolate, g216, ST) rn <- heatmapData2$Isolate heatmapData2$Isolate <- NULL heatmapData2 <- as.data.frame(sapply(heatmapData2, as.character)) rownames(heatmapData2) <- rn #https://bookdown.org/hneth/ds4psy/D-3-apx-colors-basics.html #"g216:16","g216:18","g216:19","g216:20"(4),"g216:21"(2),"g216:24"(1),"g216:26"(2) heatmap.colours <- c("darkgrey", "palegreen","seagreen","seagreen","seagreen4","seagreen4","yellowgreen", "orange4","orange4","orange4","orange4","orange4", "dodgerblue3","blue4","dodgerblue3","dodgerblue3", "azure4", "magenta2","maroon2","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1", "mediumorchid3","mediumorchid3","mediumorchid3","mediumpurple1","mediumpurple4","mediumpurple4", "red4", "olivedrab3","olivedrab4","seagreen","seagreen4","yellowgreen", "orange4", "blue4","dodgerblue3", "magenta2","maroon2","mediumorchid1", "mediumorchid3","mediumpurple1","mediumpurple4", "red4") names(heatmap.colours) <- c("-", "ST225","ST130","ST278","ST19","ST200", "ST297", "ST5","ST59","ST114","ST83","ST487", "ST2","ST215","ST73","ST88", "ST8", "ST329","ST170", "ST89","ST14","ST570","ST384","ST210", "ST10","ST100","ST290","ST23","ST87", "ST331", "ST218", "g216:1","g216:3","g216:5","g216:11","g216:14", "g216:40", "g216:29","g216:37", "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26", "g216:18") #mydat$Regulation <- factor(mydat$Regulation, levels=c("up","down")) #circular #geom_tippoint(aes(color=Type)) + scale_color_manual(values=cols) + p <- ggtree(tree, layout='circular', branch.length='none') %<+% info + geom_tiplab2(aes(label=name), offset=1) #, geom='text', align=TRUE, linetype=NA, hjust=1.8,check.overlap=TRUE, size=3.3 #difference between geom_tiplab and geom_tiplab2? #+ theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) + theme(axis.text = element_text(size = 20)) + scale_size(range = c(1, 20)) #font.size=10, png("ggtree.png", width=1260, height=1260) #svg("ggtree.svg", width=1260, height=1260) p dev.off() desired_order <- c("-", "ST225","ST130","ST278","ST19","ST200", "ST297", "ST5","ST59","ST114","ST83","ST487", "ST2","ST215","ST73","ST88", "ST8", "ST329","ST170", "ST89","ST14","ST570","ST384","ST210", "ST10","ST100","ST290","ST23","ST87", "ST331", "ST218", "g216:1","g216:3","g216:5","g216:11","g216:14", "g216:40", "g216:29","g216:37", "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26", "g216:18") png("ggtree_and_gheatmap.png", width=1290, height=1000) #svg("ggtree_and_gheatmap.svg", width=17, height=15) gheatmap(p, heatmapData2, width=0.15,colnames_position="top", colnames_angle=90, colnames_offset_y = 0.1, hjust=0.5, font.size=4, offset = 9) + scale_fill_manual(values=heatmap.colours, breaks=desired_order) + theme(legend.text = element_text(size = 14)) + theme(legend.title = element_text(size = 14)) + guides(fill=guide_legend(title=""), color = guide_legend(override.aes = list(size = 5))) dev.off()
-
prepre the table for the alpha diversity calculation
To calculate the alpha diversity values for each row based on the provided metrics (chao1, observed_otus, shannon, PD_whole_tree), you’ll typically use specialized software packages such as QIIME or R packages like vegan. Here, I can provide a high-level approach and calculations for some of the metrics. However, some of the metrics like PD_whole_tree require phylogenetic trees, which can’t be derived from the table you provided.
import pandas as pd import numpy as np # Read the data from CSV df = pd.read_csv('count_table_df_reordered.csv', index_col=0) def chao1(s): s = np.array(s) n = np.sum(s) s_obs = np.count_nonzero(s) s1 = np.sum(s == 1) s2 = np.sum(s == 2) return s_obs + ((s1 ** 2) / (2 * s2)) if s2 != 0 else s_obs def observed_sts(s): return np.count_nonzero(s) def shannon2(s): s = np.array(s) p = s / np.sum(s) return -np.sum(p * np.log(p + 1e-10)) # added small value to avoid log(0) def shannon(s): s = np.array(s) total = np.sum(s) p = s / total # Convert absolute counts to proportions return -np.sum(p * np.log(p + 1e-10)) # added small value to avoid log(0) # Calculate the metrics for each sample (column) results = { 'sample': [], 'chao1': [], 'observed_sts': [], 'shannon': [] } for column in df.columns: results['sample'].append(column) results['chao1'].append(chao1(df[column])) results['observed_sts'].append(observed_sts(df[column])) results['shannon'].append(shannon(df[column])) # Convert results to DataFrame and save to a new CSV result_df = pd.DataFrame(results) result_df.to_csv('alpha_diversity_metrics_samples.csv', index=False)
For PD_whole_tree, you would require a phylogenetic tree of the OTUs and then compute the sum of branch lengths. This is more complex and requires specialized software and the phylogenetic tree information, which isn’t present in the table you provided.
The input data being absolute counts versus relative abundance does influence certain alpha diversity metrics. Let’s re-evaluate:
-
Observed OTUs: Whether the data is in absolute counts or relative abundance doesn’t impact the “Observed OTUs” metric. It’s simply counting how many OTUs have a non-zero count.
-
Chao1: Chao1 is intended for absolute counts. The metric is based on the number of singletons and doubletons, which are derived from raw counts and not relative abundances. Hence, the calculation remains accurate.
-
Shannon Diversity Index: The Shannon Diversity Index is based on the relative abundances of OTUs, not their absolute counts. The formula requires the proportion of each species (or OTU) in the sample. Thus, you’d need to convert absolute counts to proportions (or relative abundance) before calculating this metric.
-
-
calculate the alpha diversity in Phyloseq.Rmd. The code for the calculation is as follows.
\pagebreak # Alpha diversity for ST Plot Shannon index and observed STs. Regroup together samples from the same group. ```{r, echo=TRUE, warning=FALSE} hmp.div_st <- read.csv("alpha_diversity_metrics_samples.csv", sep=",") colnames(hmp.div_st) <- c("sample", "chao1", "observed_sts", "shannon") row.names(hmp.div_st) <- hmp.div_st$sample div.df <- merge(hmp.div_st, hmp.meta, by.x="sample", by.y = "sam_name") div.df2 <- div.df[, c("Group", "shannon", "observed_sts")] colnames(div.df2) <- c("Group", "Shannon", "ST") #colnames(div.df2) options(max.print=999999) stat.test.Shannon <- compare_means( Shannon ~ Group, data = div.df2, method = "t.test" ) knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) stat.test.ST <- compare_means( ST ~ Group, data = div.df2, method = "t.test" ) knitr::kable(stat.test.ST) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) div_df_melt <- reshape2::melt(div.df2) p <- ggboxplot(div_df_melt, x = "Group", y = "value", facet.by = "variable", scales = "free", width = 0.5, fill = "gray", legend= "right") #ggpar(p, xlab = FALSE, ylab = FALSE) lev <- levels(factor(div_df_melt$Group)) # get the variables L.pairs <- combn(seq_along(lev), 2, simplify = FALSE, FUN = function(i) lev[i]) #%>% filter(p.signif != "ns") my_stat_compare_means <- function (mapping = NULL, data = NULL, method = NULL, paired = FALSE, method.args = list(), ref.group = NULL, comparisons = NULL, hide.ns = FALSE, label.sep = ", ", label = NULL, label.x.npc = "left", label.y.npc = "top", label.x = NULL, label.y = NULL, tip.length = 0.03, symnum.args = list(), geom = "text", position = "identity", na.rm = FALSE, show.legend = NA, inherit.aes = TRUE, ...) { if (!is.null(comparisons)) { method.info <- ggpubr:::.method_info(method) method <- method.info$method method.args <- ggpubr:::.add_item(method.args, paired = paired) if (method == "wilcox.test") method.args$exact <- FALSE pms <- list(...) size <- ifelse(is.null(pms$size), 0.3, pms$size) color <- ifelse(is.null(pms$color), "black", pms$color) map_signif_level <- FALSE if (is.null(label)) label <- "p.format" if (ggpubr:::.is_p.signif_in_mapping(mapping) | (label %in% "p.signif")) { if (ggpubr:::.is_empty(symnum.args)) { map_signif_level <- c(`****` = 1e-04, `***` = 0.001, `**` = 0.01, `*` = 0.05, ns = 1) } else { map_signif_level <- symnum.args } if (hide.ns) names(map_signif_level)[5] <- " " } step_increase <- ifelse(is.null(label.y), 0.12, 0) ggsignif::geom_signif(comparisons = comparisons, y_position = label.y, test = method, test.args = method.args, step_increase = step_increase, size = size, color = color, map_signif_level = map_signif_level, tip_length = tip.length, data = data) } else { mapping <- ggpubr:::.update_mapping(mapping, label) layer(stat = StatCompareMeans, data = data, mapping = mapping, geom = geom, position = position, show.legend = show.legend, inherit.aes = inherit.aes, params = list(label.x.npc = label.x.npc, label.y.npc = label.y.npc, label.x = label.x, label.y = label.y, label.sep = label.sep, method = method, method.args = method.args, paired = paired, ref.group = ref.group, symnum.args = symnum.args, hide.ns = hide.ns, na.rm = na.rm, ...)) } } p3 <- p + stat_compare_means( method="t.test", comparisons = list(c("P16-P20.Foot", "P16-P20.Nose"), c("AH-XN.LH", "AH-XN.Nose"), c("AH-XN.NLH", "AH-XN.Nose")), label = "p.signif", symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")), ) + theme(axis.text.x = element_text(angle = 30, hjust = 1)) # add this line to rotate x-axis text print(p3) ggsave("./figures/alpha_diversity_ST_Group2.png", device="png", height = 10, width = 12) ggsave("./figures/alpha_diversity_ST_Group2.svg", device="svg", height = 10, width = 12) ```
From epidome to treeHeatmap
Leave a reply