From epidome to treeHeatmap

(namely epidome-processing step 2.6) Classification using 98% similarity.

#Input Files: yycH_seqtab_nochim.csv and ../epidome/DB/yycH_ref_aln.fasta
#Output Files: yycH_seqtab_ASV_seqs.fasta, yycH_seqtab_ASV_blast.txt, and yycH_seqtab.csv.classified.csv

python3 ../epidome/scripts/ASV_blast_classification.py yycH_seqtab_nochim.csv yycH_seqtab_ASV_seqs.fasta ../epidome/DB/yycH_ref_aln.fasta yycH_seqtab_ASV_blast.txt yycH_seqtab.csv.classified.csv 98
python3 ../epidome/scripts/ASV_blast_classification.py g216_seqtab_nochim.csv g216_seqtab_ASV_seqs.fasta ../epidome/DB/g216_ref_aln.fasta g216_seqtab_ASV_blast.txt g216_seqtab.csv.classified.csv 98

Manually check the reults, ensuring no seq34;seq35 occurs.

#Delete all records in *_seqtab.csv.classified.csv the alignment length < 448 (delete records 110 and 111) in g216, or < 440 (del record 90) in yycH.

#Confirm the results
jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 g216_seqtab_ASV_blast.txt | sort -u
446
447
448
jhuang@hamburg:/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98$ cut -d$'\t' -f4 yycH_seqtab_ASV_blast.txt | sort -u
475
476

#It is not neccesary to delete correponding records 90 in yycH_* and g216_seqtab_ASV_blast.txt, since the two files are not used. 
#sed -i -e s/seq//g yycH_seqtab_ASV_blast.txt   #length=476
#sed -i -e s/seq//g g216_seqtab_ASV_blast.txt   #length=448
##;-->""
#sed -i -e s/';'//g yycH_seqtab_ASV_blast.txt
#sed -i -e s/';'//g g216_seqtab_ASV_blast.txt

cp g216_seqtab.csv.classified.csv g216_seqtab.csv.classified.backup
cp yycH_seqtab.csv.classified.csv yycH_seqtab.csv.classified.backup

# (Option 1): Processing *.classified.csv, ensuring the format seq24,21 or seq24.
#https://github.com/ssi-dk/epidome/blob/master/example_data/190920_run1_G216_seqtab_from_dada2.csv.classified.csv
##DEBUG using LibreOffice, e.g. libreoffice --calc yycH_seqtab.csv.classified_noNA.csv after adding "ID"; at the corner.
#Final goal: "seqseq31;,seq30;" --> "seq31,30"
#;,seq --> ,
sed -i -e s/";,seq"/","/g g216_seqtab.csv.classified.csv
sed -i -e s/";,seq"/","/g yycH_seqtab.csv.classified.csv
#;"; --> ";
sed -i -e s/";\";"/"\";"/g g216_seqtab.csv.classified.csv
sed -i -e s/";\";"/"\";"/g yycH_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g g216_seqtab.csv.classified.csv
sed -i -e s/seqseq/seq/g yycH_seqtab.csv.classified.csv
#confirm!
cut -d';' -f3 g216_seqtab.csv.classified.csv
cut -d';' -f3 yycH_seqtab.csv.classified.csv

# (Option 2): To reduce the unclassified, rename seq31,seq30 --> seq31 in g216,  seq37,seq36 --> seq36 in yycH.
sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq5,seq2"/"seq2"/g        sed -i -e s/"seq40,seq31,seq30"/"seq40"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq31,seq30"/"seq31"/g g216_seqtab.csv.classified.csv
sed -i -e s/"seq5,seq2"/"seq2"/g g216_seqtab.csv.classified.csv
grep ",seq" g216_seqtab.csv.classified.csv  #should return no record.
#sed -i -e s/"seq22,20"/"seq20"/g g216_seqtab.csv.classified.csv
#sed -i -e s/"seq24,21"/"seq21"/g g216_seqtab.csv.classified.csv
#sed -i -e s/"seq21,17"/"seq21"/g g216_seqtab.csv.classified.csv
> epidome_object$p1_seqs    #vim g216_seqtab.csv.classified_noNA.csv
  [1] "seq28"       "seq5"        "seq40"       "seq31,30"(seq31)     "seq14"      
  [6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
[11] "seq29"       "seq24"       "seq1"        "seq26"       "seq40"      
[16] "seq21"       "seq18"       "seq40"       "seq16"       "seq37"      
[21] "seq5,2"      "seq18"       "seq11"       "seq14"       "seq26"      
[26] "seq31,30"(seq31)     "seq37"       "seq26"       "seq1"        "seq21"      
[31] "seq3"        "seq40"       "seq26"       "seq28"       "seq40"      
[36] "seq10"       "seq5"        "seq40,31,30"(seq40)  "seq37"       "seq37"      
[41] "seq37"       "seq40"       "seq28"       "seq26"       "seq37"      
[46] "seq22"       "seq28"       "seq19"       "seq5,2"      "seq26"      
[51] "seq40"       "seq40"       "seq26"       "seq40"       "seq28"      
[56] "seq28"       "seq28"       "seq37"       "seq26"       "seq40"      
[61] "seq26"       "seq37"       "seq28"       "seq28"       "seq1"       
[66] "seq28"       "seq5,2"(seq5)      "seq40"       "seq21"       "seq28"      
[71] "seq29"       
#"seq37"       "seq22"       "seq22,20"(seq20)    "seq31,30"(seq31)    
[76] "seq22"       "seq20"       "seq28"       "seq37"       "seq14"      
[81] "seq28"       "seq20"       "seq40"       "seq28"       "seq24,21" 
[86] "seq40"       "seq14"       "seq31,30"(seq31)      "seq1"        "seq1"       
[91] "seq31,30"(seq31)     "seq1"        "seq40"       "seq20"       "seq1"       
[96] "seq21"       "seq26"       "seq20"       "seq20"       "seq21,17"   
[101] "seq37"       "seq26"       "seq31,30"(seq31)      "seq31,30"(seq31)    "seq40"      
[106] "seq5"        "seq12"             

sed -i -e s/"seq37,seq36"/"seq36"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq58,seq25,seq21,seq10"/"seq21"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq23,seq22"/"seq23"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq65,seq63,seq61,seq60"/"seq63"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq65,seq63,seq53"/"seq63"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq61,seq55"/"seq55"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq34,seq28,seq25,seq10,seq9"/"seq34"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq25,seq10,seq9"/"seq9"/g yycH_seqtab.csv.classified.csv
sed -i -e s/"seq25,seq10"/"seq10"/g yycH_seqtab.csv.classified.csv
grep ",seq" yycH_seqtab.csv.classified.csv  #should return no record.
> epidome_object$p2_seqs
[1] "34"            "43"            "seq37,seq36"-->36         "42"           
[5] "26"            "63"            "3"             "2"            
[9] "11"            "seq65,seq63,seq53"-->63      "14"            "38"           
[13] "55"            "3"             "65"            "seq58,seq25,seq21,seq10"-->21  
[17] "seq23,seq22"-->23         "53"            "56"            "34"           
[21] "62"            "14"            "35"            "seq61,seq55"-->55        
[25] "52"            "seq25,seq10"-->10         "seq65,seq63,seq61,seq60"-->63   "21"           
[29] "55"            "15"            "34"            "49"           
[33] "19"            "63"            "63"            "34"           
[37] "43"            "12"            "34"            "seq34,seq28,seq25,seq10,seq9"-->34
[41] "43"            "61"            "25"            "34"           
[45] "34"            "5"             "18"            "43"           
[49] "62"            "34"            "19"            "63"           
[53] "7"             
--------------
"seq37,seq36"-->36         "seq25,seq10,seq9"-->9       "56"           
[57] "11"            "49"            "10"            "63"           
[61] "seq37,seq36"-->36         "14"            "55"  
#-------------
"55"           
[65] "14"            "37,36"         "43"            "45"           
[69] "34"            "34"            "43"            "43"           
[73] "25"            "14"            "65,64,61,56"   "15,9"         
[77] "43"            "38"            "43"            "53"           
[81] "11"            "9"             "34"            "15"           
[85] "9"             "23,22"         "62"            "43"           
[89] "37,36"

# Common step after option 1 or option 2.
grep -v ";NA;" g216_seqtab.csv.classified.csv > g216_seqtab.csv.classified_noNA.csv
grep -v ";NA;" yycH_seqtab.csv.classified.csv > yycH_seqtab.csv.classified_noNA.csv

#filtering the small ASV in which the number < 500: INPUT: *_noNA.csv; OUTPUT: *_seqtab.csv.classified_filtered.csv
python3 g216_filtering.py  
    import pandas as pd
    # Load the file
    data = pd.read_csv('g216_seqtab.csv.classified_noNA.csv', delimiter=';')
    # Compute row sums excluding the first 2 columns
    data['row_sum'] = data.iloc[:, 2:].sum(axis=1)
    print(data.iloc[:, 2:])
    print(data.iloc[:, 2:].sum(axis=1))
    # Filter out rows where the sum is less than 100
    filtered_data = data[data['row_sum'] >= 500].drop(columns=['row_sum'])
    # Save the resulting dataframe to a new file
    filtered_data.to_csv('g216_seqtab.csv.classified_filtered.csv', sep=';', index=False)
python3 yycH_filtering.py

draw plot from three amplicons: cutadapted_g216, cutadapted_yycH

#Taxonomic database setup and classification
#- Custom databases of all unique g216 and yycH target sequences can be found at https://github.com/ssi-dk/epidome/tree/master/DB. 
#- We formatted our g216 and yycH gene databases to be compatible with DADA2’s assign-Taxonomy function and used it to classify the S. epidermidis ASVs with the RDP naive Bayesian classifier method (https://github.com/ssi-dk/epidome/tree/master/scripts).
#- ST classification of samples was performed using the g216 target sequence as the primary identifier. 
#- All g216 sequences unique to a single clonal cluster in the database were immediately classified as the matching clone, and in cases were the g216 sequence matched multiple clones, the secondary yycH target sequences were parsed to determine which clone was present. When this classification failed to resolve due to multiple potential combinations of sequences, ASVs were categorized as “Unclassified”. Similarly, g216 sequences not found in the database were labelled as “Novel”. 

#install.packages("pls")
#library(pls)
#install.packages("reshape")
#library(reshape)
#install.packages("vegan")
library('vegan') 
library(scales)
library(RColorBrewer)

#~/Tools/csv2xls-0.4/csv_to_xls.py g216_seqtab.csv.classified_noNA.csv yycH_seqtab.csv.classified_noNA.csv -d$'\t' -o counts.xls

setwd("/media/jhuang/Elements/Data_Luise_Epidome_batch3/run_2023_98_2")
#under r4-base
source("../epidome/scripts/epidome_functions.R")

ST_amplicon_table = read.table("../epidome/DB/epidome_ST_amplicon_frequencies.txt",sep = "\t")

epi01_table = read.table("g216_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
epi02_table = read.table("yycH_seqtab.csv.classified_filtered.csv",sep = ";",header=TRUE,row.names=1)
#> sum(epi01_table$AH.LH)
#[1] 78872
#> sum(epi02_table$AH.LH)
#[1] 86949

metadata_table = read.table("../metadata.txt",header=TRUE,row.names=1)
metadata_table$patient.ID <- factor(metadata_table$patient.ID, levels=c("P7","P8","P9","P10","P11","P12","P13","P14","P15","P16","P17","P18","P19","P20", "AH","AL","CB","HR","KK",  "MC","MR","PT2","RP","SA","XN"))

#epidome_object = setup_epidome_object(epi01_table,epi02_table, metadata_table=metadata_table)
primer1_table <- epi01_table
primer2_table <- epi02_table
#setup_epidome_object <- function(primer1_table,primer2_table,metadata_table) {
  #DEBUG: 3:ncol --> 2:ncol
  primer1_counts = primer1_table[,2:ncol(primer1_table)]
  primer2_counts = primer2_table[,2:ncol(primer2_table)]
  primer1_all_sample_names = colnames(primer1_counts)
  primer2_all_sample_names = colnames(primer2_counts)
  samples_with_both_primers = primer1_all_sample_names[which(primer1_all_sample_names %in% primer2_all_sample_names)]
  samples_missing_primer1_data = primer2_all_sample_names[which(!primer2_all_sample_names %in% primer1_all_sample_names)]
  samples_missing_primer2_data = primer1_all_sample_names[which(!primer1_all_sample_names %in% primer2_all_sample_names)]
  primer1_seqs = as.vector(primer1_table$Seq_number)
  primer2_seqs = as.vector(primer2_table$Seq_number)
  primer1_seqs[which(is.na(primer1_seqs))] = "seqUnclassified"
  primer2_seqs[which(is.na(primer2_seqs))] = "seqUnclassified"
  primer1_counts = primer1_table[,which(colnames(primer1_table) %in% samples_with_both_primers)]
  primer2_counts = primer2_table[,which(colnames(primer2_table) %in% samples_with_both_primers)]
  if (!missing(metadata_table)) {
    metadata_names = rownames(metadata_table)
    samples_missing_metadata = samples_with_both_primers[which(!samples_with_both_primers %in% metadata_names)]
    samples_missing_primer_data = metadata_names[which(!metadata_names %in% samples_with_both_primers)]
    include_names = metadata_names[which(metadata_names %in% samples_with_both_primers)]
    metadata_include = match(include_names,metadata_names)
    metadata_include = metadata_include[which(!is.na(metadata_include))]
    metadata_table = metadata_table[metadata_include,]
    #primer1_include = match(colnames(primer1_counts),include_names)
    primer1_include = match(include_names,colnames(primer1_counts))
    primer1_include = primer1_include[which(!is.na(primer1_include))]
    #primer2_include = match(colnames(primer2_counts),include_names)
    primer2_include = match(include_names,colnames(primer2_counts))
    primer2_include = primer2_include[which(!is.na(primer2_include))]
    epi1_table = primer1_counts[,primer1_include]
    epi2_table = primer2_counts[,primer2_include]
    epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'metadata'=metadata_table,'sample_names'=include_names,'meta_variables'=colnames(metadata_table))
    print(paste0("Metadata loaded with ",length(metadata_names)," samples and ",ncol(metadata_table)," variables"))
    print(paste0(length(samples_missing_metadata)," samples are found in both primer sets but not in metadata: ",paste0(samples_missing_metadata,collapse = " ")))
    print(paste0(length(samples_missing_primer_data)," samples are found in metadata but is missing from one or both primer sets: ",paste0(samples_missing_primer_data,collapse = " ")))
    print(paste0(length(include_names)," samples are found in metadata and both tables and are included in epidome object"))

  } else {
    epi1_table = primer1_counts[,match(colnames(primer1_counts),samples_with_both_primers)]
    epi2_table = primer2_counts[,match(colnames(primer2_counts),samples_with_both_primers)]
    epidome_object = list('p1_seqs'=primer1_seqs,'p1_table'=epi1_table,'p2_seqs'=primer2_seqs,'p2_table'=epi2_table,'sample_names'=samples_with_both_primers)
    print(paste0("No metadata loaded"))
    print(paste0(length(samples_missing_primer2_data)," samples are found in p1 table but not in p2 table: ",paste0(samples_missing_primer2_data,collapse = " ")))
    print(paste0(length(samples_missing_primer1_data)," samples are found in p2 table but not in p1 table: ",paste0(samples_missing_primer1_data,collapse = " ")))
    print(paste0(length(samples_with_both_primers)," samples are found in both tables and are included in epidome object"))
  }
  #return(epidome_object)
#}

str(epidome_object)
#List of 7
# $ p1_seqs       : chr [1:57] "seq28" "seq5" "seq40" "seq31,30" ...
# $ p1_table      :'data.frame':    57 obs. of  51 variables:
# $ p2_seqs       : chr [1:53] "seq34" "seq43" "seq37,36" "seq42" ...
# $ p2_table      :'data.frame':    53 obs. of  51 variables:
# $ metadata      :'data.frame':    51 obs. of  4 variables:
# $ sample_names  : chr [1:51] "P7.Nose" "P8.Nose" "P9.Nose" "P10.Nose" ...
# $ meta_variables: chr [1:4] "patient.ID" "sample.site" "sample.type" "patient.sample.site"
unique(sort(epidome_object$p1_seqs))  # 57
#[1] "seq28"       "seq5"        "seq40"       "seq31,30"    "seq14"      
#[6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
#[11] "seq29"       "seq24"       "seq1"        "seq26"       "seq40"      
#[16] "seq21"       "seq18"       "seq40"       "seq16"       "seq37"      
#[21] "seq5,2"      "seq18"       "seq11"       "seq14"       "seq26"      
#[26] "seq31,30"    "seq37"       "seq26"       "seq1"        "seq21"      
#[31] "seq3"        "seq40"       "seq26"       "seq28"       "seq40"      
#[36] "seq10"       "seq5"        "seq40,31,30" "seq37"       "seq37"      
#[41] "seq37"       "seq40"       "seq28"       "seq26"       "seq37"      
#[46] "seq22"       "seq28"       "seq19"       "seq5,2"      "seq26"      
#[51] "seq40"       "seq40"       "seq26"       "seq40"       "seq28"      
#[56] "seq28"       "seq28"
unique(sort(epidome_object$p2_seqs))

#Image1
primer_compare = compare_primer_output(epidome_object,color_variable = "sample.type")
png("image1.png")
primer_compare$plot
dev.off()

eo_ASV_combined = combine_ASVs_epidome(epidome_object)
#> str(eo_ASV_combined)
#List of 7
#$ p1_seqs       : chr [1:21] "seq28" "seq5" "seq40" "seq31,30" ...
#$ p1_table      :'data.frame': 21 obs. of  51 variables:
#> eo_ASV_combined$p1_seqs  #21
#[1] "seq28"       "seq5"        "seq40"       "seq31,30"    "seq14"      
#[6] "seq20"       "seq22"       "seq26"       "seq37"       "seq21"      
#[11] "seq29"       "seq24"       "seq1"        "seq18"       "seq16"      
#[16] "seq5,2"      "seq11"       "seq3"        "seq10"       "seq40,31,30"
#[21] "seq19"
#eo_filtered = filter_lowcount_samples_epidome(eo_ASV_combined,500,500)

#Note that we run the following code instead run the method from the script due to modified code: 
count_table_checked = classify_epidome(eo_ASV_combined,ST_amplicon_table)
write.csv(count_table_checked, file="count_table_checked.csv")
strict_classifier=FALSE
#classify_epidome = function(epidome_object,ST_amplicon_table,strict_classifier=FALSE) {
#Adapt the code to the true epi01 and epi02 values, please give the complete code:
epidome_object_norm = normalize_epidome_object(eo_ASV_combined)
p1_seqs = unlist(lapply(as.vector(eo_ASV_combined$p1_seqs),function(x) substr(x,4,nchar(x))))
#> p1_seqs
#[1] "28"       "5"        "40"       "31,30"    "14"       "20"      
#[7] "22"       "26"       "37"       "21"       "29"       "24"      
#[13] "1"        "26"       "40"       "21"       "18"       "40"      
#[19] "16"       "37"       "5,2"      "18"       "11"       "14"      
#[25] "26"       "31,30"    "37"       "26"       "1"        "21"      
#[31] "3"        "40"       "26"       "28"       "40"       "10"      
#[37] "5"        "40,31,30" "37"       "37"       "37"       "40"      
#[43] "28"       "26"       "37"       "22"       "28"       "19"      
#[49] "5,2"      "26"       "40"       "40"       "26"       "40"      
#[55] "28"       "28"       "28" 
p2_seqs = unlist(lapply(as.vector(eo_ASV_combined$p2_seqs),function(x) substr(x,4,nchar(x))))
n_samples = length(eo_ASV_combined$sample_names)
n_p1_seqs = length(p1_seqs)
count_table = matrix(nrow = 0, ncol = n_samples,dimnames = list('ST'=c(),'Samples'=eo_ASV_combined$sample_names))
match_type_table = matrix(nrow = n_p1_seqs, ncol = n_samples)
count_table_names = c()
unclassified_count_vec = rep(0,n_samples)
g1_unclassified_count_vec = rep(0,n_samples)
for (i in 1:n_p1_seqs) {
  #i=4: "31,30" --> length(p1_seq_split)>1 --> possible_STs==("-","-","8","215") --> Unclassified!
  p1_seq = p1_seqs[i]
  p1_seq_split = strsplit(p1_seq,',')[[1]]
  if (length(p1_seq_split)>1) {
    possible_STs = as.vector(ST_amplicon_table$ST)[which(ST_amplicon_table$epi01_ASV %in% p1_seq_split)]
    if (length(possible_STs)==1) {
      p1_seq = p1_seq_split[1]
    } else {
      p1_seq = "Unclassified"
    }
  }
  if (p1_seq != "Unclassified") {
    p1_seq_ST_table = ST_amplicon_table[which(ST_amplicon_table$epi01_ASV==p1_seq),]
    unique_p1_ASVs = unique(as.vector(p1_seq_ST_table$ST))
    p2_ASVs_matching_p1 = p1_seq_ST_table$epi02_ASV
    count_vec = rep(0,n_samples)
    for (j in 1:n_samples) {
      p1_percent = epidome_object_norm$p1_table[i,j]
      p1_count = eo_ASV_combined$p1_table[i,j]
      if (p1_percent > 0.01) {
        p2_seqs_present_within_difference_threshold_idx = which(epidome_object_norm$p2_table[,j]>(p1_percent-10) & epidome_object_norm$p2_table[,j]>1)
        p2_seqs_present_ASVs = p2_seqs[p2_seqs_present_within_difference_threshold_idx]
        p2_seqs_present_ASVs_matching_p1 = p2_seqs_present_ASVs[which(p2_seqs_present_ASVs %in% p2_ASVs_matching_p1)]
        p2_percent = epidome_object_norm$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
        p2_count = eo_ASV_combined$p2_table[which(p2_seqs %in% p2_seqs_present_ASVs_matching_p1),j]
        if (length(p2_seqs_present_ASVs_matching_p1) == 0) {
          ST_order = order(p1_seq_ST_table$freq,decreasing = T)
          classification_group = as.vector(p1_seq_ST_table$ST)[ST_order[1]]
          if (classification_group %in% count_table_names) {
            classification_idx = which(count_table_names == classification_group)
            count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
          } else {
            count_vec = rep(0,n_samples)
            count_vec[j] = p1_count
            count_table = rbind(count_table,count_vec)
            count_table_names = c(count_table_names,classification_group)
          }
          #match_type_table[i,j] = "epi01 match without epi02 match"
          # Replace the match type with actual epi01 and epi02 values
          match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:NA", " ", classification_group)
        }
        else if (length(p2_seqs_present_ASVs_matching_p1) == 1) {
          p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV==p2_seqs_present_ASVs_matching_p1[1]),]
          ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
          classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
          if (classification_group %in% count_table_names) {
            classification_idx = which(count_table_names == classification_group)
            count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
          } else {
            count_vec = rep(0,n_samples)
            count_vec[j] = p1_count
            count_table = rbind(count_table,count_vec)
            count_table_names = c(count_table_names,classification_group)
          }
          #match_type_table[i,j] = "Unique epi01 epi02 combination"
          # Replace the match type with actual epi01 and epi02 values
          match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", p2_seqs_present_ASVs_matching_p1[1], " ", classification_group)
        } else {
          if (strict_classifier) {
            unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
          } else {
            p1_p2_seq_ST_table = p1_seq_ST_table[which(p1_seq_ST_table$epi02_ASV %in% p2_seqs_present_ASVs_matching_p1),]
            ST_order = order(p1_p2_seq_ST_table$freq,decreasing = T)
            classification_group = as.vector(p1_p2_seq_ST_table$ST)[ST_order[1]]
            if (classification_group %in% count_table_names) {
              classification_idx = which(count_table_names == classification_group)
              count_table[classification_idx,j] = count_table[classification_idx,j]+p1_count
            } else {
              count_vec = rep(0,n_samples)
              count_vec[j] = p1_count
              count_table = rbind(count_table,count_vec)
              count_table_names = c(count_table_names,classification_group)
            }
          }

          ##unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
          #match_type_table[i,j] = c("Non unique epi01 epi02 combination")
          # Concatenate all found epi02 values and replace the match type
          epi02_values = paste(p2_seqs_present_ASVs_matching_p1, collapse = ",")
          match_type_table[i, j] = paste0("epi01:", p1_seq, " epi02:", epi02_values, " ", classification_group)
        }

      } else {
        match_type_table[i,j] = "Low counts"
        unclassified_count_vec[j] = unclassified_count_vec[j]+p1_count
      }

    }
  } else {
    count_vec = as.numeric(as.vector(eo_ASV_combined$p1_table[i,]))
    unclassified_count_vec = unclassified_count_vec+count_vec
    g1_unclassified_count_vec = g1_unclassified_count_vec+count_vec
    match_type_vec = rep("Unclassified epi01 match",n_samples)
    match_type_table[i,] = match_type_vec
  }

}
count_table = rbind(count_table,unclassified_count_vec)
rownames(count_table) = c(count_table_names,"Unclassified")
count_table_df <- as.data.frame(count_table)
write.csv(count_table_df, file="count_table_df.csv")
#TODO: delete the last record "Unclassified" due to the few reads!
#count_table_df = count_table_df[-32,]

# --> FOR STEP4: prepare the file match_type_table.csv for treeHeatmap drawing.
write.csv(match_type_table, file = "match_type_table.csv", row.names = TRUE)

count_table_df_ordered = count_table_df[order(rowSums(count_table_df),decreasing = T),]
row_sums <- apply(count_table_df_ordered, MARGIN=1, FUN=sum)
#TODO: the ST in which the counts <= 500 not shown!
#NOTE: ST225 is "epi01:3 epi02:3 225"!
# Since "epi01:3 epi02:NA 225","epi01:3 epi02:NA 225" are 225 disappeared in the match_type_table.csv_____
#Note that epi01 is g216, epi02 is yycH.
#        215            -            8           59          130          297 
#      611637       401179       341187       337409       316389       316273 
#         331            2           73          384          200            5 
#      178616       125734        99899        97885        93613        92385 
#          14          218           23          278          487          290 
#       71417        62467        60192        57213        53630        51454 
#          87           89          100          329          153           83 
#       49000        23390        21880        21635        19453        10539 
#          19          170          225*         570          184           88 
#        9465         1839         1487          863          437          318 
#         114 Unclassified 
#         108           11 
#> row_sums
#         215            -           59 Unclassified          297          130 
#      611915       442192       359708       357868       335880       315734 
#           5           73          331          384          200           14 
#      145578       134301       127734       126465        94917        85440 
#           2          218           23          278          290           87 
#       85438        62492        60192        57213        51363        49000 
#          89          100          329          210           83           19 
#       23390        21880        21635        12663        10642         9465 
#         170          225          570           88          114           10 
#        1839         1487          863          318          108           91 
# > unique(sort(eo_ASV_combined$p1_seqs))
#  [1] "seq1"  "seq10" "seq11" "seq14" "seq16" "seq18" "seq19" "seq2"  "seq20"
# [10] "seq21" "seq22" "seq24" "seq26" "seq28" "seq29" "seq3"  "seq31" "seq37"
# [19] "seq40" "seq5" 
# > unique(sort(eo_ASV_combined$p2_seqs))
#  [1] "seq10" "seq11" "seq12" "seq14" "seq15" "seq18" "seq19" "seq2"  "seq21"
# [10] "seq23" "seq25" "seq26" "seq3"  "seq34" "seq35" "seq36" "seq38" "seq42"
# [19] "seq43" "seq49" "seq5"  "seq52" "seq53" "seq55" "seq56" "seq61" "seq62"
# [28] "seq63" "seq65" "seq7"
#row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST184","ST5","ST59","ST83","ST487","ST114","ST8","ST297","ST153","ST384","ST2","ST14","ST89","ST570","-","ST290","ST331","ST73","ST88","ST100","ST87","ST23","ST218","ST329","ST19","ST225","ST170")
#row.names(count_table_df) <- c("215","130","278","200","184","5","59","83","487","114","8","297","153","384","2","14","89","570","-","290","331","73","88","100","87","23","218","329","19","225","170")
#row.names(count_table_df) <- c("ST215","ST130","ST278","ST200","ST5","ST59","ST83","ST114","ST297","ST384","ST14","ST89","ST210","-","ST328","ST331","ST73","ST2","ST88","ST100","ST10","ST290","ST87","ST23","ST218","ST329","ST19","ST225","ST170","Unclassified")

col_order <- c("P7.Nose","P8.Nose","P9.Nose","P10.Nose","P11.Nose","P12.Nose","P13.Nose","P14.Nose","P15.Nose",     "P16.Foot","P17.Foot","P18.Foot","P19.Foot","P20.Foot","P16.Nose","P17.Nose","P18.Nose","P19.Nose","P20.Nose",    "AH.LH","AH.NLH","AH.Nose", "AL.LH","AL.NLH","AL.Nose", "CB.LH","CB.NLH","CB.Nose", "HR.LH","HR.NLH","HR.Nose", "KK.LH","KK.NLH","KK.Nose", "MC.LH","MC.NLH","MC.Nose", "MR.LH","MR.NLH","MR.Nose",  "PT2.LH","PT2.NLH","PT2.Nose", "RP.LH","RP.NLH","RP.Nose", "SA.LH","SA.NLH","SA.Nose", "XN.LH","XN.NLH","XN.Nose") 
count_table_df_reordered <- count_table_df_ordered[,col_order]
p = make_barplot_epidome(count_table_df_reordered,reorder=FALSE,normalize=TRUE)
#p = make_barplot_epidome(count_table_df_reordered,reorder=TRUE,normalize=TRUE)
png("Barplot_All.png", width=1600, height=900)
p
dev.off()

#Change rowname from '-' to 'Novel'
rownames(count_table_df_reordered)[rownames(count_table_df_reordered) == "-"] <- "Novel"
write.csv(file="count_table_df_reordered.txt", count_table_df_reordered)

prepare data for plotTreeHeatmap: table+tree.

  #replace "," to \n in match_type_table.csv
  sed 's/\",\"/\n/g' match_type_table.csv > match_type_table.csv_
  grep "epi02" match_type_table.csv_ > match_type_table.csv__
  grep -v "NA" match_type_table.csv__ > match_type_table.csv___
  #(Deprecated) awk -F' ' '{ split($2, a, ":"); split(a[2], b, ","); print $1 " epi02:" b[1] " " $3 }' match_type_table.csv___ > match_type_table.csv____
  #(Deprecated) sed -i -e 's/\"//g' match_type_table.csv____
  #(Deprecated) sort match_type_table.csv____ -u > match_type_table.csv_____

  #Manually select and merge items from match_type_table.csv___ and save it to match_type_table.csv____ (see content as below)!
      epi01:1 epi02:36 2
      epi01:3 epi02:3 225
      epi01:5 epi02:34 130
      epi01:5 epi02:3 278
      epi01:5 epi02:43 200
      epi01:5 epi02:36 184
      epi01:11 epi02:19 19
      epi01:14 epi02:34 297
      epi01:14 epi02:36 153
      epi01:16 epi02:56 329
      epi01:16 epi02:55 329
      epi01:18 epi02:14 218
      epi01:19 epi02:43 170
      epi01:2 epi02:15 -
      epi01:20 epi02:36 2
      epi01:20 epi02:43 14
      epi01:20 epi02:42 89
      epi01:20 epi02:34 384
      epi01:20 epi02:11 570
      epi01:21 epi02:34 100
      epi01:21 epi02:63 290
      epi01:24 epi02:38 23
      epi01:24 epi02:2 87
      epi01:26 epi02:34 -
      epi01:26 epi02:53 290
      epi01:26 epi02:63 331
      epi01:28 epi02:43 215
      epi01:29 epi02:43 215
      epi01:29 epi02:55 -
      epi01:31 epi02:43 8
      epi01:37 epi02:36 2
      epi01:37 epi02:26 88
      epi01:37 epi02:43 73
      epi01:40 epi02:2 5
      epi01:40 epi02:34 5
      epi01:40 epi02:34 59
      epi01:40 epi02:26 114
      epi01:40 epi02:43 83
      epi01:40 epi02:36 487
      #---- v2 ('-' three times, ST215, ST290, ST329, ST59 two times) --> delete 'epi01:28 epi02:43 215' ----
      epi01:1 epi02:34 -
      epi01:3 epi02:3 225
      epi01:5 epi02:3 278
      epi01:5 epi02:34 130
      epi01:5 epi02:43 200
      epi01:11 epi02:19 19
      epi01:14 epi02:34 297
      epi01:16 epi02:55 329
      epi01:16 epi02:56 329
      epi01:18 epi02:14 218
      epi01:19 epi02:43 170
      epi01:20 epi02:42 89
      epi01:20 epi02:43 14
      epi01:20 epi02:34 384
      epi01:20 epi02:11 570
      epi01:20 epi02:3 210
      epi01:21 epi02:38 10
      epi01:21 epi02:34 100
      epi01:21 epi02:63 290
      epi01:24 epi02:38 23
      epi01:24 epi02:2 87
      epi01:26 epi02:34 -
      epi01:26 epi02:53 290
      epi01:26 epi02:63 331
      epi01:29 epi02:43 215
      epi01:29 epi02:55 -
      epi01:37 epi02:62 2
      epi01:37 epi02:26 88
      epi01:37 epi02:43 73
      epi01:37 epi02:34 59
      epi01:40 epi02:2 5
      epi01:40 epi02:26 114
      epi01:40 epi02:43 83
      epi01:40 epi02:34 59

python3 concat_epi01_epi02.py
    # Define the function to extract fasta sequences
    def fasta_to_dict(filename):
        sequences = {}
        header = None
        sequence = []
        with open(filename, 'r') as file:
            for line in file:
                line = line.strip()
                if line.startswith('>'):
                    if header:
                        sequences[header] = ''.join(sequence)
                    header = line[1:]
                    sequence = []
                else:
                    sequence.append(line)
            if header:
                sequences[header] = ''.join(sequence)
        return sequences
    # Read combinations from match_type_table.csv____
    combinations = []
    with open("match_type_table.csv____", 'r') as csv_file:
        for line in csv_file:
            parts = line.strip().split()
            epi01 = parts[0].split(":")[1]
            epi02 = parts[1].split(":")[1]
            st = parts[2]
            combinations.append((epi01, epi02, st))

    # Read both fasta files into dictionaries
    g216_sequences = fasta_to_dict("../epidome/DB/epi01_ref_aln.fasta")
    yycH_sequences = fasta_to_dict("../epidome/DB/epi02_ref_aln.fasta")

    output_filename = "concatenated_sequences.fasta"
    with open(output_filename, 'w') as output_file:
        for epi01, epi02, st in combinations:
            if epi01 in g216_sequences and epi02 in yycH_sequences:
                concatenated_seq = g216_sequences[epi01] + "NNN" + yycH_sequences[epi02]
                header = f">g216-{epi01}_yycH-{epi02}_ST{st}"
                output_file.write(f"{header}\n{concatenated_seq}\n")

    print(f"Concatenated sequences saved to {output_filename}")

sed -i -e 's/ST-/-/g' concatenated_sequences.fasta
muscle -in concatenated_sequences.fasta -out aligned.fasta
muscle -clw -in concatenated_sequences.fasta -out aligned.clw
FastTree -nt < aligned.fasta > concatenated_sequences.tree
# Run plotTree.R to draw ggtree_and_gheatmap.png.

library(ggtree)
library(ggplot2)

setwd("/home/jhuang/DATA/Data_Luise_Epidome_batch3/run_2023_98_2/")

# -- edit tree --
#https://icytree.org/
info <- read.csv("typing_34.csv")
info$name <- info$Isolate
tree <- read.tree("concatenated_sequences.tree")
#cols <- c(infection='purple2', commensalism='skyblue2')     

library(dplyr)
heatmapData2 <- info %>% select(Isolate,  g216, ST)
rn <- heatmapData2$Isolate
heatmapData2$Isolate <- NULL
heatmapData2 <- as.data.frame(sapply(heatmapData2, as.character))
rownames(heatmapData2) <- rn

#https://bookdown.org/hneth/ds4psy/D-3-apx-colors-basics.html
#"g216:16","g216:18","g216:19","g216:20"(4),"g216:21"(2),"g216:24"(1),"g216:26"(2)
heatmap.colours <- c("darkgrey",   "palegreen","seagreen","seagreen","seagreen4","seagreen4","yellowgreen",    "orange4","orange4","orange4","orange4","orange4",    "dodgerblue3","blue4","dodgerblue3","dodgerblue3",  "azure4",    "magenta2","maroon2","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1","mediumorchid1", "mediumorchid3","mediumorchid3","mediumorchid3","mediumpurple1","mediumpurple4","mediumpurple4",         "red4",                                            "olivedrab3","olivedrab4","seagreen","seagreen4","yellowgreen",    "orange4",    "blue4","dodgerblue3",           "magenta2","maroon2","mediumorchid1", "mediumorchid3","mediumpurple1","mediumpurple4",         "red4")
names(heatmap.colours) <- c("-",    "ST225","ST130","ST278","ST19","ST200", "ST297",                                             "ST5","ST59","ST114","ST83","ST487",   "ST2","ST215","ST73","ST88",   "ST8",                                                               "ST329","ST170", "ST89","ST14","ST570","ST384","ST210",                  "ST10","ST100","ST290","ST23","ST87", "ST331",     "ST218",      "g216:1","g216:3","g216:5","g216:11","g216:14",   "g216:40",   "g216:29","g216:37",        "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26",    "g216:18")
#mydat$Regulation <- factor(mydat$Regulation, levels=c("up","down"))

#circular
#geom_tippoint(aes(color=Type)) + scale_color_manual(values=cols) + 
p <- ggtree(tree, layout='circular', branch.length='none') %<+% info + geom_tiplab2(aes(label=name), offset=1)
#, geom='text', align=TRUE,  linetype=NA, hjust=1.8,check.overlap=TRUE, size=3.3
#difference between geom_tiplab and geom_tiplab2?
#+ theme(axis.text.x = element_text(angle = 30, vjust = 0.5)) + theme(axis.text = element_text(size = 20))  + scale_size(range = c(1, 20))
#font.size=10, 
png("ggtree.png", width=1260, height=1260)
#svg("ggtree.svg", width=1260, height=1260)
p
dev.off()

desired_order <- c("-",    "ST225","ST130","ST278","ST19","ST200", "ST297",                                             "ST5","ST59","ST114","ST83","ST487",   "ST2","ST215","ST73","ST88",   "ST8",                                                               "ST329","ST170", "ST89","ST14","ST570","ST384","ST210",                  "ST10","ST100","ST290","ST23","ST87", "ST331",     "ST218",      "g216:1","g216:3","g216:5","g216:11","g216:14",   "g216:40",   "g216:29","g216:37",        "g216:16","g216:19","g216:20","g216:21","g216:24","g216:26",    "g216:18")

png("ggtree_and_gheatmap.png", width=1290, height=1000)
#svg("ggtree_and_gheatmap.svg", width=17, height=15)
gheatmap(p, heatmapData2, width=0.15,colnames_position="top", colnames_angle=90, colnames_offset_y = 0.1, hjust=0.5, font.size=4, offset = 9) + scale_fill_manual(values=heatmap.colours, breaks=desired_order) +  theme(legend.text = element_text(size = 14)) + theme(legend.title = element_text(size = 14)) + guides(fill=guide_legend(title=""), color = guide_legend(override.aes = list(size = 5)))  
dev.off()

prepre the table for the alpha diversity calculation

To calculate the alpha diversity values for each row based on the provided metrics (chao1, observed_otus, shannon, PD_whole_tree), you’ll typically use specialized software packages such as QIIME or R packages like vegan. Here, I can provide a high-level approach and calculations for some of the metrics. However, some of the metrics like PD_whole_tree require phylogenetic trees, which can’t be derived from the table you provided.
```
import pandas as pd
import numpy as np

# Read the data from CSV
df = pd.read_csv('count_table_df_reordered.csv', index_col=0)

def chao1(s):
s = np.array(s)
n = np.sum(s)
s_obs = np.count_nonzero(s)
s1 = np.sum(s == 1)
s2 = np.sum(s == 2)
return s_obs + ((s1 ** 2) / (2 * s2)) if s2 != 0 else s_obs

def observed_sts(s):
return np.count_nonzero(s)

def shannon2(s):
s = np.array(s)
p = s / np.sum(s)
return -np.sum(p * np.log(p + 1e-10))  # added small value to avoid log(0)

def shannon(s):
s = np.array(s)
total = np.sum(s)
p = s / total  # Convert absolute counts to proportions
return -np.sum(p * np.log(p + 1e-10))  # added small value to avoid log(0)

# Calculate the metrics for each sample (column)
results = {
'sample': [],
'chao1': [],
'observed_sts': [],
'shannon': []
}

for column in df.columns:
results['sample'].append(column)
results['chao1'].append(chao1(df[column]))
results['observed_sts'].append(observed_sts(df[column]))
results['shannon'].append(shannon(df[column]))

# Convert results to DataFrame and save to a new CSV
result_df = pd.DataFrame(results)
result_df.to_csv('alpha_diversity_metrics_samples.csv', index=False)
```
For PD_whole_tree, you would require a phylogenetic tree of the OTUs and then compute the sum of branch lengths. This is more complex and requires specialized software and the phylogenetic tree information, which isn’t present in the table you provided.

The input data being absolute counts versus relative abundance does influence certain alpha diversity metrics. Let’s re-evaluate:
- Observed OTUs: Whether the data is in absolute counts or relative abundance doesn’t impact the “Observed OTUs” metric. It’s simply counting how many OTUs have a non-zero count.
- Chao1: Chao1 is intended for absolute counts. The metric is based on the number of singletons and doubletons, which are derived from raw counts and not relative abundances. Hence, the calculation remains accurate.
- Shannon Diversity Index: The Shannon Diversity Index is based on the relative abundances of OTUs, not their absolute counts. The formula requires the proportion of each species (or OTU) in the sample. Thus, you’d need to convert absolute counts to proportions (or relative abundance) before calculating this metric.

calculate the alpha diversity in Phyloseq.Rmd. The code for the calculation is as follows.

\pagebreak

# Alpha diversity for ST
Plot Shannon index and observed STs. 
Regroup together samples from the same group.

```{r, echo=TRUE, warning=FALSE}
hmp.div_st <- read.csv("alpha_diversity_metrics_samples.csv", sep=",") 
colnames(hmp.div_st) <- c("sample", "chao1", "observed_sts", "shannon")
row.names(hmp.div_st) <- hmp.div_st$sample
div.df <- merge(hmp.div_st, hmp.meta, by.x="sample", by.y = "sam_name")
div.df2 <- div.df[, c("Group", "shannon", "observed_sts")]
colnames(div.df2) <- c("Group", "Shannon", "ST")
#colnames(div.df2)
options(max.print=999999)

stat.test.Shannon <- compare_means(
Shannon ~ Group, data = div.df2,
method = "t.test"
)
knitr::kable(stat.test.Shannon) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

stat.test.ST <- compare_means(
ST ~ Group, data = div.df2,
method = "t.test"
)
knitr::kable(stat.test.ST) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))

div_df_melt <- reshape2::melt(div.df2)

p <- ggboxplot(div_df_melt, x = "Group", y = "value",
              facet.by = "variable", 
              scales = "free",
              width = 0.5,
              fill = "gray", legend= "right")
#ggpar(p, xlab = FALSE, ylab = FALSE)
lev <- levels(factor(div_df_melt$Group)) # get the variables
L.pairs <- combn(seq_along(lev), 2, simplify = FALSE, FUN = function(i) lev[i]) #%>% filter(p.signif != "ns")
my_stat_compare_means  <- function (mapping = NULL, data = NULL, method = NULL, paired = FALSE, 
    method.args = list(), ref.group = NULL, comparisons = NULL, 
    hide.ns = FALSE, label.sep = ", ", label = NULL, label.x.npc = "left", 
    label.y.npc = "top", label.x = NULL, label.y = NULL, tip.length = 0.03, 
    symnum.args = list(), geom = "text", position = "identity", 
    na.rm = FALSE, show.legend = NA, inherit.aes = TRUE, ...) 
{
    if (!is.null(comparisons)) {
        method.info <- ggpubr:::.method_info(method)
        method <- method.info$method
        method.args <- ggpubr:::.add_item(method.args, paired = paired)
        if (method == "wilcox.test") 
            method.args$exact <- FALSE
        pms <- list(...)
        size <- ifelse(is.null(pms$size), 0.3, pms$size)
        color <- ifelse(is.null(pms$color), "black", pms$color)
        map_signif_level <- FALSE
        if (is.null(label)) 
            label <- "p.format"
        if (ggpubr:::.is_p.signif_in_mapping(mapping) | (label %in% "p.signif")) {
            if (ggpubr:::.is_empty(symnum.args)) {
                map_signif_level <- c(`****` = 1e-04, `***` = 0.001, 
                  `**` = 0.01, `*` = 0.05, ns = 1)
            } else {
              map_signif_level <- symnum.args
            } 
            if (hide.ns) 
                names(map_signif_level)[5] <- " "
        }
        step_increase <- ifelse(is.null(label.y), 0.12, 0)
        ggsignif::geom_signif(comparisons = comparisons, y_position = label.y, 
            test = method, test.args = method.args, step_increase = step_increase, 
            size = size, color = color, map_signif_level = map_signif_level, 
            tip_length = tip.length, data = data)
    } else {
        mapping <- ggpubr:::.update_mapping(mapping, label)
        layer(stat = StatCompareMeans, data = data, mapping = mapping, 
            geom = geom, position = position, show.legend = show.legend, 
            inherit.aes = inherit.aes, params = list(label.x.npc = label.x.npc, 
                label.y.npc = label.y.npc, label.x = label.x, 
                label.y = label.y, label.sep = label.sep, method = method, 
                method.args = method.args, paired = paired, ref.group = ref.group, 
                symnum.args = symnum.args, hide.ns = hide.ns, 
                na.rm = na.rm, ...))
    }
}

p3 <- p + 
  stat_compare_means(
    method="t.test",
    comparisons = list(c("P16-P20.Foot", "P16-P20.Nose"), c("AH-XN.LH", "AH-XN.Nose"), c("AH-XN.NLH", "AH-XN.Nose")), 
    label = "p.signif",
    symnum.args <- list(cutpoints = c(0, 0.0001, 0.001, 0.01, 0.05, 1), symbols = c("****", "***", "**", "*", "ns")),
  ) +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))  # add this line to rotate x-axis text
print(p3)

ggsave("./figures/alpha_diversity_ST_Group2.png", device="png", height = 10, width = 12)
ggsave("./figures/alpha_diversity_ST_Group2.svg", device="svg", height = 10, width = 12)
```

Microbial bioinformatics

Microbial bioinformatics uses computational tools to analyze genomes, track evolution, and study functions in microorganisms, including bacteria and viruses.

Leave a Reply Cancel reply